# Predict Citations with Word Vectors
Here we try to predict whether a sentence needs a 'citation needed' tag or not, based on:
1) Word vectors only (English, all languages)
2) Word vectors + main-section indicator
3) Word vectors + article-position indicator


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from random import shuffle
import numpy as np
import hashlib
from nltk.stem import SnowballStemmer

# Hyper-parameter grids explored by the random forest grid search.
DEPTH_SEARCH = [5, 10, 30, 50, 100, 200]   # candidate max_depth values
NTREES_SEARCH = [5, 10, 30, 50, 100, 200]  # candidate n_estimators values
# Fraction of the samples held out as test set by train_test_split.
TEST_SIZE = 0.33

In [58]:
# To use this, you will need:
# 1) features from https://drive.google.com/open?id=1JZu67psmj2Eou2-8wQEJk4kAQfg8GDs2,
#    to be placed in ../fastText_multilingual/features

# language code -> path of the vector file / of the raw-text file (filled by load_languages)
filenames = dict()
filenames_raw = dict()

# the two lists are aligned: languages[i] is the code of language_extended[i]
languages = ['en', 'fr', 'it']
language_extended = ['english', 'french', 'italian']

feadir = '../fastText_multilingual/features/stemmed_'  # prefix of the vector files
rawdir = '../data_clean/'  # directory of the raw-text files

def load_languages():
    """
    fills the global dictionaries filenames and filenames_raw:
    for every language code in languages, filenames gets the path of its vector file
    (feadir + code + '.tsv') and filenames_raw the path of its raw-text file
    (rawdir + extended name + '.tsv')
    """
    for code, full_name in zip(languages, language_extended):
        filenames[code] = ''.join([feadir, code, '.tsv'])  # files with vectors
        filenames_raw[code] = ''.join([rawdir, full_name, '.tsv'])  # files with raw text

def count_negatives(negatives,positives):
    """
    for balanced data, we need to know how many negatives are out there
    negatives: dictionary language -> list of negative statement identifiers
    positives: dictionary language -> list of positive statement identifiers
    only the languages of the global list languages are counted
    prints the proportion of negatives of every language (0.0 for a language without statements)
    returns
    allneg: total number of negatives over all languages
    """
    proportion = {}
    allneg = 0
    for lan in languages:
        n_neg = len(negatives[lan])
        n_all = n_neg + len(positives[lan])
        # a language without any statement would otherwise raise ZeroDivisionError
        proportion[lan] = n_neg / float(n_all) if n_all else 0.0
        allneg += n_neg
    # single-argument print() gives the same output under python 2 and python 3
    print('proportion of negatives per language')
    print(proportion)
    return allneg

def get_values_for_crossvalidation(positives,negatives,features):
    """
    builds a balanced and shuffled dataset: for every language of the global list languages,
    all the negatives plus the same number of randomly picked positives
    positives: dictionary language -> list of positive statement identifiers
    negatives: dictionary language -> list of negative statement identifiers
    features: list of feature dictionaries (identifier -> scalar or sequence of numbers), per type
    the label of every statement is read from the global dictionary labels
    returns
    y: list of labels
    values: list of feature vectors (numpy arrays, nan replaced through np.nan_to_num)
    ids: list of statement identifiers
    the three lists are aligned and in the same random order
    """
    values = []
    y = []
    ids = []
    for lan in languages:
        # shuffle a copy, so that the list owned by the caller keeps its order
        candidates = list(positives[lan])
        shuffle(candidates)
        # the set drops identifiers that appear more than once
        selected = set(negatives[lan] + candidates[:len(negatives[lan])])
        for sample_id in selected:
            v = []
            for f in features:  # for every type of feature
                feature = f[sample_id]
                if np.isscalar(feature):
                    # single number (int flag, float ratio, ...): one column
                    v.append(feature)
                else:
                    # vector feature: one column per element
                    v.extend(feature)
            ids.append(sample_id)
            values.append(np.nan_to_num(np.asarray(v)))
            y.append(labels[sample_id])
    # reshuffle everything for cross validation; list() is needed because
    # shuffle cannot work in place on a python 3 range object
    order = list(range(len(y)))
    shuffle(order)
    y2 = [y[i] for i in order]
    values2 = [values[i] for i in order]
    ids2 = [ids[i] for i in order]
    return y2, values2, ids2

def perform_gridsearch_withRFC(values,y):
    """
    cross-validates a random forest for every combination of DEPTH_SEARCH and NTREES_SEARCH
    values: list of feature vectors
    y: labels
    prints the cross-validation scores of every combination, then the best combination
    and its score
    returns
    max_ind: best combination, as the string 'depth ntrees' ('' if no mean score is above 0)
    max_val: mean cross-validation score of the best combination
    scores: mean cross-validation score of every combination, keyed like max_ind
    """
    scores = {}
    # performs cross validation in all combinations
    for d in DEPTH_SEARCH:
        for n in NTREES_SEARCH:
            clf = RandomForestClassifier(max_depth=d, n_estimators=n)
            s = cross_val_score(clf, values, y)
            # single-argument print() gives the same output under python 2 and python 3
            print(s)
            scores[str(d) + ' ' + str(n)] = np.mean(s)
    # computes best combination of parameters
    max_ind = ''
    max_val = 0
    for params, mean_score in scores.items():
        if mean_score > max_val:
            max_ind, max_val = params, mean_score
    print(max_ind)
    print(max_val)
    return max_ind, max_val, scores

def train_test_final(val_train,val_test,y_train,d,n):
    """
    fits a random forest classifier on the training split and scores the test split
    val_train: feature vectors used for training
    val_test: feature vectors to be scored
    y_train: labels of val_train
    d: max_depth of the forest
    n: n_estimators of the forest
    returns
    the fitted model and the output of predict_proba on val_test
    """
    forest = RandomForestClassifier(n_estimators=n, max_depth=d)
    forest.fit(val_train, y_train)
    test_probabilities = forest.predict_proba(val_test)
    return forest, test_probabilities

def print_top_bottom_sentences(prob,ids_test,y_test,text,labels):
    """
    here we are displaying the test sentences ranked by the probability in column 1 of prob
    prob: per-sample class probabilities (one row per test sample)
    ids_test: identifiers of the test samples, aligned with prob
    y_test: labels of the test samples, aligned with prob
    text: dictionary identifier -> raw sentence
    labels: dictionary identifier -> label, printed next to y_test as a check
    prints, for the (up to) 10 highest ranked sentences: text, y_test and label;
    then a separator line; then, for the (up to) 10 lowest ranked sentences:
    text, y_test, probability and label
    """
    pos_proba = np.asarray(prob)[:, 1]
    # indexes of the samples by decreasing probability
    indexes = np.argsort(-pos_proba)
    # single-argument print() gives the same output under python 2 and python 3
    for i in indexes[:10]:
        print(text[ids_test[i]])
        print(y_test[i])
        print(labels[ids_test[i]])  # checking
    print('********************************')
    for i in indexes[-10:]:
        print(text[ids_test[i]])
        print(y_test[i])
        print(pos_proba[i])
        print(labels[ids_test[i]])  # checking

In [59]:
#fill filenames and filenames_raw with the per-language file paths
load_languages()

Let's load the labels and vectors.


Now we load all data for prediction into different variables

In [60]:
"""
raw header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	statement label
feature header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	 label feature
"""
labels={} #whether it needs a citation or not
vectors={} #the word vectors aligned to english
main={} #is it the main section?
language={} #which language is the article from
pages={} #entity_id of the page the statement belongs to
start={} #starting point of the statement in the page
pagelength={} #page length (largest start+offset seen per entity_id), this is for future use, if we want to track where the statement is placed in the page
positives={}#statements with citation
negatives={}#statements without citation
text={}#raw text
for lan in languages:
    positives[lan]=[] #stores the statements needing a citation
    negatives[lan]=[] #stores the statements without a citation (much less than the positives)
    #each line in fraw correspond to the line in f; both files are opened by the same
    #with statement so that the raw file gets closed as well
    with open(filenames[lan]) as f, open(filenames_raw[lan]) as fraw:
        #for each line in the vector file, record various parameters and then store the corresponding raw text with the same identifier
        for line in f:
            #sha224 needs bytes: python 2 lines already are bytes, python 3 text lines get encoded
            line_bytes=line if isinstance(line,bytes) else line.encode('utf-8')
            unique=hashlib.sha224(line_bytes).hexdigest() #unique identifier of this line
            #first, we store the raw statement text from the raw file
            lineraw=fraw.readline() #line with raw text
            #only the newline is removed, so a last line without newline keeps its last character
            rowraw=lineraw.rstrip('\n').split('\t')
            text[unique]=rowraw[-2] #where the text is placed in the line
            #now, we can get features
            row=line.split('\t')
            labels[unique]=int(row[-2])#where the label sits in the feature file
            #first append to lists of positives and negatives depending on the label
            if labels[unique]==1:
                positives[lan].append(unique)
            else:
                negatives[lan].append(unique)
            #store features
            vectors[unique]=[float(r) for r in row[-1].split(',')]
            main[unique]= 1 if row[4]=='MAIN_SECTION'else 0
            language[unique]=lan
            pages[unique]=int(row[0])
            beginning=int(row[5])
            offset=int(row[6])
            #NOTE(review): in the sample row printed below offset is larger than start, so it
            #might be an end position rather than a length - confirm before relying on l
            l=beginning+offset
            #keep the largest value seen so far for this page (keyed by the entity_id string)
            pagelength[row[0]]=max(l,pagelength.get(row[0],l))
            start[unique]=beginning


In [61]:
#show the last feature line read by the loading loop
#(single-argument print() gives the same output under python 2 and python 3)
print(line)
allneg=count_negatives(negatives,positives)

117454	94475822	f3735f2a-104c-11e8-a6be-fb23e605380c	Simone_Weil	Biografia	19711	20995	1	-0.0234660159402,-0.0142615728868,-0.0366149184486,0.139208157685,-0.100458435923,0.075994903484,0.0457816233781,-0.107146792871,0.0379453973999,0.154462158517,0.107864405814,0.0185702696395,-0.023211463543,-0.0109735415887,-0.0152959217908,-0.188028506457,-0.0559298531445,0.0100458405114,-0.0318245386809,0.15033450955,-0.120968683357,0.096361409248,-0.157447930833,-0.13231923028,-0.137832762829,-0.0141949245363,0.0451553561676,-0.0315552961723,-0.0537654468042,0.0478957474443,-0.0736177318144,0.190859932679,-0.212526190725,0.175684732444,0.0077851947106,-0.156260870839,0.00858716256714,-0.0518954465578,0.0899437313784,-0.0438456770004,0.0826694288738,-0.0602778662965,-0.00933837225453,-0.0360175081812,0.060137250775,0.0343859476608,0.0772921851828,0.00197318333755,0.0192915973712,-0.0151013780001,0.0705501617773,-0.132521172674,0.0109519000898,-0.0225958440121,-0.110062333207,-0.0173919791321,0.05

We now select the data for training: all negatives + an equal number of positives, using only feature vectors

In [62]:
#balanced, shuffled dataset whose only feature type is the word vectors
y,values,ids=get_values_for_crossvalidation(positives,negatives,[vectors])

We now run a grid search to find good random forest parameters (maximum depth and number of trees)

In [40]:
#cross-validate every (max_depth, n_estimators) pair of DEPTH_SEARCH x NTREES_SEARCH on the vectors-only dataset
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y)

[ 0.63201255  0.63142409  0.60082385]
[ 0.63201255  0.63868184  0.62808945]
[ 0.65849353  0.65829737  0.64829345]
[ 0.67163594  0.66888976  0.65633582]
[ 0.67065516  0.66908592  0.66084739]
[ 0.68026677  0.67614751  0.66692821]
[ 0.60553158  0.61298548  0.62142016]
[ 0.64162417  0.63534719  0.63397411]
[ 0.6665359   0.67300902  0.66222048]
[ 0.67634366  0.68046293  0.66987054]
[ 0.68575912  0.69399765  0.68242448]
[ 0.6924284   0.69988231  0.69183994]
[ 0.61141624  0.60572774  0.60513927]
[ 0.61004315  0.60592389  0.61533935]
[ 0.66555512  0.6832091   0.66143586]
[ 0.68242448  0.68732836  0.67124362]
[ 0.69537073  0.70007846  0.69537073]
[ 0.70890545  0.71302472  0.70105924]
[ 0.60788545  0.60827776  0.61023931]
[ 0.61847783  0.61690859  0.61533935]
[ 0.66908592  0.66339741  0.65790506]
[ 0.67987446  0.67693213  0.67085131]
[ 0.70125539  0.69615536  0.68909376]
[ 0.70851314  0.70812083  0.69713613]
[ 0.6006277   0.60082385  0.60102001]
[ 0.6173009   0.61455473  0.61671244]
[ 0.65535504

In [43]:
#hold out a TEST_SIZE fraction of the samples; ids are split alongside values and labels
#so that every test row can be mapped back to its raw text
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values, y, ids, test_size=TEST_SIZE, random_state=42)

In [44]:
#final model with max_depth=30 and n_estimators=200 (hard-coded here, not read from max_ind)
clf,prob=train_test_final(val_train,val_test,y_train,30,200)
#display the test sentences with the highest and the lowest predicted probability
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

She did not leave the Homestead unless it was absolutely necessary and as early as 1867, she began to talk to visitors from the other side of a door rather than speaking to them face to face. She acquired local notoriety; she was rarely seen, and when she was, she was usually clothed in white
1
1
Laurent stated that he was more interested in seeing people than in talking to cameras. Though the Liberals outspent the Progressive Conservatives three to one, according to Newman, their campaign had little imagination, and was based on telling voters that their only real option was to re-elect St
1
1
Emily had hailed from Mittagong in the NSW Southern Highlands, and in 1911, when Don Bradman was about two-and-a-half years old, his parents decided to relocate to Bowral, close to Mittagong, to be closer to Emily's family and friends, as life at Yeo Yeo was proving difficult.
1
1
Stevens argued that slavery should not survive the war; he was frustrated by the slowness of President Abraham Linco

We now look at the effect of adding the 'main' feature, i.e. a feature equal to 1 if the sentence is in the main section

In [64]:
#balanced, shuffled dataset where the main-section flag is appended to every word vector
y_m,values_m,ids_m=get_values_for_crossvalidation(positives,negatives,[vectors,main])
#largest value of the main-section flag over all loaded statements
#(single-argument print() gives the same output under python 2 and python 3)
print(max(main.values()))

1


In [66]:
#inspect one feature vector of the vectors+main dataset
#(single-argument print() gives the same output under python 2 and python 3)
print(values_m[2])

[ -2.22932198e-02   2.48287391e-02  -1.34431170e-01   1.93943326e-01
  -1.18193526e-01  -4.53942944e-02  -1.31748940e-02  -1.25637374e-01
  -1.05412933e-02   1.67220691e-01   1.26308949e-01   3.23974193e-02
  -1.02302300e-01  -1.79439683e-01  -4.62725528e-02  -1.89429637e-01
  -1.21451011e-01   2.94070339e-02  -2.13163920e-02   2.42442319e-01
  -1.08893781e-01   1.69510409e-01  -3.63013636e-02  -6.48514700e-02
  -4.40521825e-02  -9.01744864e-02  -7.05358130e-02  -3.85843004e-02
   8.38680463e-02   1.90801300e-01  -6.90839315e-02   1.39425423e-01
  -1.87699502e-01   1.00681938e-01   1.55085991e-01  -3.67203123e-03
   3.38778852e-02  -1.22600270e-01   1.97078258e-02  -5.97593685e-02
  -3.50128612e-02   2.01079759e-02  -5.59682152e-03  -2.76918310e-03
   1.33086467e-01  -4.58580504e-02   9.19841481e-02   2.74420826e-02
   6.73058222e-02   6.61556058e-02   1.28251146e-01  -1.68917551e-01
   8.30966842e-03   1.43958927e-01  -9.91296589e-02   3.57136365e-02
   8.58184537e-02   1.58799900e-01

In [67]:
#cross-validate every (max_depth, n_estimators) pair of DEPTH_SEARCH x NTREES_SEARCH on the vectors+main dataset
max_ind,max_val,scores=perform_gridsearch_withRFC(values_m,y_m)

[ 0.68503628  0.66143586  0.64123186]
[ 0.68327123  0.69301687  0.71361318]
[ 0.70955089  0.71694782  0.71459396]
[ 0.71935674  0.73519027  0.73107101]
[ 0.72641694  0.73577874  0.72754021]
[ 0.71621887  0.73597489  0.72969792]
[ 0.6840557   0.66398588  0.69340918]
[ 0.70758972  0.70302079  0.72185171]
[ 0.74347911  0.74068262  0.73891722]
[ 0.73936066  0.74656728  0.74303648]
[ 0.75505001  0.75402118  0.75735583]
[ 0.7503432   0.75676736  0.76225971]
[ 0.64875466  0.67850137  0.64593958]
[ 0.67640714  0.67516673  0.69164378]
[ 0.73269268  0.73362103  0.72950177]
[ 0.7491665   0.74617497  0.74637113]
[ 0.75446166  0.75637505  0.76343664]
[ 0.76230633  0.75931738  0.76480973]
[ 0.67444597  0.66594743  0.65692428]
[ 0.68719357  0.68065908  0.68281679]
[ 0.74151794  0.73028639  0.73872107]
[ 0.74897039  0.73558258  0.74362495]
[ 0.76112963  0.7565712   0.75696352]
[ 0.76524809  0.76441742  0.76147509]
[ 0.67581879  0.6594743   0.67300902]
[ 0.69170426  0.69752844  0.68477834]
[ 0.72778976

In [69]:
#split the vectors+main dataset (values_m, y_m, ids_m) that was grid searched in this section;
#splitting values, y, ids here would train and evaluate the vectors-only features once more
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values_m, y_m, ids_m, test_size=TEST_SIZE, random_state=42)

In [71]:
#final model with max_depth=100 and n_estimators=200 (hard-coded here, not read from max_ind),
#trained and evaluated on the split of the previous cell
clf,prob=train_test_final(val_train,val_test,y_train,100,200)
#display the test sentences with the highest and the lowest predicted probability
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

At age 36, Jeter appeared to be in decline; Joe Sheehan of Baseball Prospectus suggested that Jeter, once a "good, not great" shortstop, had declined to become "below average" defensively, to the extent that he would likely need to change positions; Cashman later acknowledged that Jeter might need to shift to the outfield. Though Jeter stated that he wanted to remain with the Yankees, negotiations became tense
1
1
Her assistance and competence allowed Warren Harding to travel to make speeches (his use of the free railroad pass increased greatly after his marriage). Florence Harding made sure no cents escaped her—sometimes sending Warren to the bank with a gallon (3.8 l) bucket full in each hand—and wrote of her husband, "he does well when he listens to me and poorly when he does not."
1
1
When Nixon read the book, he commented, "What I am wondering is where all the money went that we were supposed to have had!"
1
1
Wicket-keeper Barry Jarman said that Craig "had to do it himself...I wa