In [1]:
import pandas as pd
from collections import *
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 26 days


In [2]:
f = open("../data/wikitext-103/wiki.test.tokens")
txt_test = f.read()
f.close()

f = open("../data/wikitext-103/wiki.valid.tokens")
txt_valid = f.read()
f.close()


f = open("../data/wikitext-103/wiki.train.tokens")
txt_train = f.read()
f.close()

txt = txt_test + txt_valid + txt_train

In [3]:
iters = re.finditer(r"\n = [\w']+ .* = \n \n", txt.lower())
indices = [m.start(0) for m in iters]

In [4]:
articles = []
for i, index in enumerate(indices):
    if i < (len(indices)-1):
        articles.append(txt[indices[i]:indices[i+1]])
    else:
        articles.append(txt[index:])
        
print "The number of articles we have:", len(articles)

The number of articles we have: 25951


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.data import load
_POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
tagger = load(_POS_TAGGER)  # same tagger as using nltk.pos_tag

regexp_tagger = nltk.tag.RegexpTagger([(r'\(|\)', '--')], backoff = tagger)

def get_concordance(input_text, target_word, left_margin = 10, right_margin = 10):
    text = nltk.Text(nltk.word_tokenize(input_text.decode('utf-8')))
    
    c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())

    concordance_txt = ([text.tokens[map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset])[0]:offset+right_margin] for offset in c.offsets(target_word)])
    return [''.join([x+' ' for x in con_sub]) for con_sub in concordance_txt] 

In [6]:
article_titles = []
first_sentence_articles = []

he_counts = []
she_counts = []
word_counts = []

all_he_words = []
all_she_words = []

he_concordances = []
she_concordances = []
pronoun_concordances = []

for i, article in enumerate(articles):
    # get the title of the article
    article_title = re.search(r"\n = [\w']+ .* =", article.lower()).group()[4:-2]
    article_titles.append(article_title)
    # get the first sentence of article
    if i == 18881:
        # it appears that this article is not a full article and only has one sentence 
        # without a period, so we treat it as a special case
        first_sentence = article
    else:
        first_sentence = re.search(r'([A-Z][^\.!?]*[\.!?])', article).group()
    # remove non-word characters
    first_sentence = " ".join(re.findall("[a-zA-Z()]+", first_sentence))
    # remove contents parenthesis and the content within it
    first_sentence = re.sub(r'\([^)]*\)', '', first_sentence)
    # remove title duplicacy
    first_sentence = article_title + " " + re.sub(article_title.lower(), '', first_sentence.lower()).strip()
    first_sentence_articles.append(first_sentence)
    
    # compute pronoun stats
    he_counts.append(article.lower().count(" he "))
    she_counts.append(article.lower().count(" she "))
    word_counts.append(len(re.findall("[a-zA-Z_]+", article)))
    
    words_after_he = " ".join(re.findall(r" he [\w']+", article.lower())) 
    all_he_words.append(re.sub(r"\bhe\b", '', words_after_he).strip())
    
    words_after_she = " ".join(re.findall(r" she [\w']+", article.lower())) 
    all_she_words.append(re.sub(r"\bshe\b", '', words_after_she).strip())
    
    # get pronoun concordances
    he_concordance = " ".join(get_concordance(article, target_word = "HE"))
    she_concordance = " ".join(get_concordance(article, target_word = "SHE"))
    
    he_concordances.append(he_concordance)
    she_concordances.append(she_concordance)
    pronoun_concordances.append(he_concordance+she_concordance)

In [7]:
df_pronoun_stats = pd.DataFrame({'article_title': article_titles, 'first_sentence': first_sentence_articles,
                                'he_counts': he_counts, 'she_counts': she_counts, 'word_counts': word_counts,
                                "words_after_he": all_he_words, "words_after_she": all_she_words})
df_pronoun_stats.tail()

Unnamed: 0,article_title,first_sentence,he_counts,she_counts,word_counts,words_after_he,words_after_she
25946,al wistert,al wistert albert alexander ox wistert was an...,29,0,912,played was was is was was wore w...,
25947,si una vez,si una vez is a song recorded by american reco...,0,5,1064,,ever will ever said felt
25948,sicklefin lemon shark,sicklefin lemon shark the or sharptooth lemon...,0,0,2784,,
25949,ontario highway 89,ontario highway 89 ontario highway king s high...,0,0,699,,
25950,luke smith ( writer ),luke smith ( writer ) luke smith luke michael...,18,0,1120,is wrote considered left worked left...,


In [8]:
count_vect = CountVectorizer(min_df = 10, analyzer = "word", stop_words="english")
X_train_counts = count_vect.fit_transform(articles)
print X_train_counts.shape

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print X_train_tf.shape

(25951, 79507)
(25951, 79507)


In [9]:
svd = TruncatedSVD(n_components=4, n_iter=7, random_state=42)
svd_articles = svd.fit_transform(X_train_tf)
articles_svd_df = pd.DataFrame(svd_articles)
articles_svd_df.rename(columns=lambda x: "pc" + str(x), inplace=True)

In [10]:
count_vect = CountVectorizer(min_df = 10, analyzer = "word", stop_words="english")
X_train_counts = count_vect.fit_transform(pronoun_concordances)
print X_train_counts.shape

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print X_train_tf.shape

(25951, 22109)
(25951, 22109)


In [11]:
svd_pronoun_words = svd.fit_transform(X_train_tf)
pronoun_svd_df = pd.DataFrame(svd_pronoun_words)
pronoun_svd_df.rename(columns=lambda x: "pronoun_pc" + str(x), inplace=True)

In [12]:
svd_df = articles_svd_df.join(pronoun_svd_df)

In [13]:
res = df_pronoun_stats.join(svd_df)
res.tail(10)

Unnamed: 0,article_title,first_sentence,he_counts,she_counts,word_counts,words_after_he,words_after_she,pc0,pc1,pc2,pc3,pronoun_pc0,pronoun_pc1,pronoun_pc2,pronoun_pc3
25941,peterborough ( uk parliament constituency ),peterborough ( uk parliament constituency ) pe...,6,3,1761,defeated held lost lost was then,was had had,0.217689,-0.086183,-0.058102,0.01838,0.128556,-0.056635,-0.027908,-0.029251
25942,bart 's girlfriend,bart 's girlfriend bart s girlfriend bart s gi...,20,24,1847,discovers is was approaches is is ...,is ignores still likes seems tells ...,0.176964,0.149496,-0.073557,-0.150156,0.183882,0.108048,-0.001484,-0.019044
25943,delhi metro,delhi metro the is a metro system serving del...,0,0,5599,,,0.37635,-0.150125,0.078491,0.046512,0.0,0.0,0.0,0.0
25944,time enough at last,time enough at last is episode of the american...,23,2,2319,wastes eagerly sees sees prepares ha...,has destroys,0.347325,0.165617,-0.058324,-0.259688,0.149348,0.032326,-0.021729,0.009463
25945,dead head fred,dead head fred is a horror themed action adven...,12,0,3611,pieces relies can was could does d...,,0.260642,0.081734,-0.045168,-0.174923,0.064506,0.019422,0.000329,0.015098
25946,al wistert,al wistert albert alexander ox wistert was an...,29,0,912,played was was is was was wore w...,,0.260348,0.024139,-0.122821,-0.18825,0.266892,-0.214507,0.27704,0.096868
25947,si una vez,si una vez is a song recorded by american reco...,0,5,1064,,ever will ever said felt,0.245286,0.325969,0.04681,0.269059,0.149783,0.210442,0.060708,-0.075083
25948,sicklefin lemon shark,sicklefin lemon shark the or sharptooth lemon...,0,0,2784,,,0.281761,-0.120124,0.101113,0.034368,0.0,0.0,0.0,0.0
25949,ontario highway 89,ontario highway 89 ontario highway king s high...,0,0,699,,,0.119769,-0.110401,-0.099086,0.075366,0.0,0.0,0.0,0.0
25950,luke smith ( writer ),luke smith ( writer ) luke smith luke michael...,18,0,1120,is wrote considered left worked left...,,0.180736,0.102987,-0.080158,-0.152136,0.216797,0.120769,0.105034,-0.038383


In [14]:
res.to_csv("../data/wiki_tfidf.csv", index = False)