# Predict Citations with Word Vectors
Here we try to predict whether a sentence needs a 'citation needed' tag or not based on
1) Word vectors only (english, all languages)
2) Word vectors + main sec indicator
3) Word vectors + article position indicator


In [61]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from random import shuffle
import numpy as np
import hashlib

DEPTH_SEARCH=[5,10,30, 50, 100, 200]
NTREES_SEARCH=[5,10,30, 50, 100, 200]
TEST_SIZE=0.33

In [52]:
filenames={}
filenames_raw={}

"""
to use this, you will need: 
1) features from https://drive.google.com/open?id=1JZu67psmj2Eou2-8wQEJk4kAQfg8GDs2, to be placed in ../fastText_multilingual/features
"""
languages=['en','fr','it']
language_extended=['english','french','italian']
feadir='../fastText_multilingual/features/'
rawdir='../data_clean/'

def load_languages():
    for lan,lext in zip(languages,language_extended):
        filenames[lan]=feadir+lan+'.tsv' #files with vectors
        filenames_raw[lan]=rawdir+lext+'.tsv' #files with raw text

def count_negatives(negatives,positives):
    """
    for balanced data, we need to know how many negatives are out there
    """
    proportion={}
    allneg=0
    for lan in languages:
        proportion[lan]=len(negatives[lan])/float(len(negatives[lan])+len(positives[lan]))
        allneg+=len(negatives[lan])
    print 'proportion of negatives per language'
    print proportion
    return allneg

def get_values_for_crossvalidation(positives,negatives,features):
    """
    positives: list of positives
    negatives: list of negatives
    features: list of feature dictionaries, per type
    """
    values=[]
    y=[]
    ids=[]
    for lan in languages:
        shuffle(positives[lan])
        alldata=set(negatives[lan]+positives[lan][:len(negatives[lan])])
        ids=ids+list(alldata)
        for id in alldata:
            v=[]
            for f in features: #for every type of feature
                if isinstance(f[id], int):
                    v.append(f[id])
                else:
                    for element in f[id]: #append element of feature
                        v.append(element)
            values.append(np.nan_to_num(np.asarray(v)))
            y.append(labels[id])          
    #reshuffle everything for cross_validaton
    ind=range(len(y))
    shuffle(ind)
    y2=[y[i] for i in ind]
    values2=[values[i] for i in ind]
    ids2=[ids[i] for i in ind]
    return y2,values2,ids2

def perform_gridsearch_withRFC(values,y):
    """
    values: list of feature vectors
    y: labels
    returns
    max_ind: depth and estimator values
    max_val: crossval prediction accuracy
    scores: all-scores for each combination of depth and nestimators
    """
    scores={}
    #performs cross_validation in all combiantions
    for d in DEPTH_SEARCH:
        for n in NTREES_SEARCH:
            clf = RandomForestClassifier(max_depth=d, n_estimators=n)
            s = cross_val_score(clf, values, y)
            print s
            scores[str(d)+' '+str(n)]=np.mean(s)
    #computes best combination of parameters
    max_ind=''
    max_val=0
    for s in scores:
        if scores[s]>max_val:
            max_val=scores[s]
            max_ind=s
    print max_ind
    print max_val
    return max_ind,max_val,scores

def train_test_final(val_train,val_test,y_train,d,n):
    """
    just using a Random Forestc classifier on a train/test split for deployment 
    returns model and probability on the test set
    """
    clf = RandomForestClassifier(max_depth=d, n_estimators=n)
    clf.fit(val_train,y_train)
    prob=clf.predict_proba(val_test)
    return clf,prob

def print_top_bottom_sentences(prob,ids_test,y_test,text,labels):
    """
    here we are displaying the 
    """
    pos_proba=(np.asarray(prob).T)[1]
    indexes=np.argsort(-np.asarray(pos_proba))
    for i in indexes[:10]:
        print text[ids_test[i]]
        print y_test[i]
        print labels[ids_test[i]]#checking
    print ('********************************')
    for i in indexes[-10:]:
        print text[ids_test[i]]
        print y_test[i]
        print pos_proba[i]
        print labels[ids_test[i]]#checking

Let's load labels and vectors.. 


In [4]:
load_languages()

Now we load all data for prediction into different variables

In [8]:
"""
raw header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	statement label
feature header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	 label feature
"""
labels={} #whether it needs a citation or not
vectors={} #the word vectors aligned to english
main={} #is it the main section?
language={} #which language is the article from
pages={} #length of the page
start={} #starting point of the statement in the page
pagelength={} #page length, this is for future use, if we want to track where the statement is placed in the page
positives={}#statements with citation
negatives={}#statements without citation
text={}#raw text
for lan in languages:
    positives[lan]=[] #stores the statements needing a citation
    negatives[lan]=[] #stores the statements without a citation (much less than the positives)
    fraw=open(filenames_raw[lan]) #each line in fraw correspond to the line in f
    #for each line in the vector file, record various parameters and then store the corresponding raw text with the same identifier
    with open(filenames[lan]) as f:
        for line in f:
            unique=hashlib.sha224(line).hexdigest() #unique identifier of this line
            #first, we store the raw statement text from the raw file
            lineraw=fraw.readline() #line with raw text
            rowraw=lineraw[:-1].split('\t')
            text[unique]=rowraw[-2] #where the text is placed in the line
            #now, we can get features
            row=line.split('\t')
            labels[unique]=int(row[-2])#where the label sits in the feature file
            #first append to lists of positives and negatives depending on the label
            if labels[unique]==1:
                positives[lan].append(unique)
            else:
                negatives[lan].append(unique)
            #store features
            vectors[unique]=[float(r) for r in row[-1].split(',')]
            main[unique]= 1 if row[3]=='MAIN_SECTION'else 0
            language[unique]=lan
            pages[unique]=int(row[0])
            beginning=int(row[5])
            offset=int(row[6])
            l=beginning+offset
            try:
                base=pagelength[row[0]]
                pagelength[row[0]]=l if l>base else base
            except:
                pagelength[row[0]]=l
            start[unique]=beginning


In [62]:
allneg=count_negatives(negatives,positives)

proportion of negatives per language
{'fr': 0.06976161231293711, 'en': 0.05151972709224618, 'it': 0.1304069891327509}


We now select the data for training: all negatives + an equal number of positives, using only feature vectors

In [28]:
y,values,ids=get_values_for_crossvalidation(positives,negatives,[vectors])

We now run a grid search to find the good random forest parameter

In [37]:
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y)

[ 0.63483036  0.63894881  0.62384781]
[ 0.65091194  0.65993332  0.64855854]
[ 0.66699353  0.67209257  0.65934497]
[ 0.67424985  0.68268288  0.66718965]
[ 0.6736615   0.68072171  0.66248284]
[ 0.62816238  0.62992744  0.62561287]
[ 0.65306923  0.65679545  0.65150029]
[ 0.67915278  0.68268288  0.67895666]
[ 0.69346931  0.69543048  0.67032752]
[ 0.70190233  0.70602079  0.68425181]
[ 0.61619925  0.61698372  0.60933516]
[ 0.61600314  0.63012355  0.61129633]
[ 0.66934693  0.68327123  0.66718965]
[ 0.69817611  0.7015101   0.68131006]
[ 0.70857031  0.7142577   0.69405766]
[ 0.60953128  0.63306531  0.62070994]
[ 0.62188664  0.62973132  0.62286723]
[ 0.67424985  0.68797803  0.66679741]
[ 0.69817611  0.7003334   0.68738968]
[ 0.71386546  0.70621691  0.6887625 ]
[ 0.62443616  0.61757207  0.6248284 ]
[ 0.63443813  0.63620318  0.62973132]
[ 0.67385762  0.67817219  0.66267896]
[ 0.68346735  0.69719553  0.67621102]
[ 0.70700137  0.71602275  0.69346931]
30 100
0.705628554619


In [41]:
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values2, y2, ids2, test_size=TEST_SIZE, random_state=42)

In [57]:
clf,prob=train_test_final(val_train,val_test,y_train,30,100)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

so much overcome by the harsh Command, that he actually shed tears." Heywood then cited a different reason for staying aboard Bounty: Bligh's launch was overloaded, and its destruction would be assured "by the least addition to their Number". Finally, Heywood maintained he had intended to join Bligh but had been stopped: "...on hearing it suggested that I should be deem'd Guilty if I staid in the Ship, I went down directly, and in passing Mr
1
1
Groves replied that he could, but if he did then 'Portal will not see very much'." For his efforts, Chadwick received a knighthood in the New Year Honours on 1 January 1945. He considered this to be a recognition of the work of the whole Tube Alloys project.
1
1
Lincoln was loath to give up Sumter, feeling it would only encourage the South in its insurgency.
1
1
She was evidently afraid to give up this outfit even temporarily because it was likely to be confiscated by the judge and she would thereby be left without protection. A woman's dress o

We now lok at the effect of adding the 'main' features, i.e. a feature = 1 if the sentence is in the main section

In [67]:
y_m,values_m,ids_m=get_values_for_crossvalidation(positives,negatives,[vectors,main])

In [68]:
max_ind,max_val,scores=perform_gridsearch_withRFC(values_m,y_m)

[ 0.62424005  0.63169249  0.63463424]
[ 0.64855854  0.66542459  0.6411061 ]
[ 0.67111198  0.67856442  0.65875662]
[ 0.66758188  0.67268092  0.66248284]
[ 0.67679937  0.67405374  0.6609139 ]
[ 0.67934889  0.67621102  0.66503236]
[ 0.61835654  0.63110414  0.62070994]
[ 0.64993136  0.66365954  0.64306727]
[ 0.68111394  0.68895862  0.67758384]
[ 0.68582075  0.69268484  0.68346735]
[ 0.69562659  0.69935281  0.68778192]
[ 0.70072563  0.70190233  0.68562463]
[ 0.60796235  0.61502255  0.61698372]
[ 0.62227888  0.62090606  0.64149833]
[ 0.67562267  0.67895666  0.66973916]
[ 0.69386154  0.69660718  0.68542852]
[ 0.70896254  0.7154344   0.70013728]
[ 0.71582663  0.71621887  0.71072759]
[ 0.62149441  0.62561287  0.61678761]
[ 0.63326142  0.63679153  0.62110218]
[ 0.67424985  0.68954697  0.66012944]
[ 0.69817611  0.69523436  0.68268288]
[ 0.70935478  0.71170818  0.69425378]
[ 0.7177878   0.72112179  0.70837419]
[ 0.61914101  0.6248284   0.61227692]
[ 0.64797019  0.62855462  0.63777211]
[ 0.67385762

In [41]:
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values2, y2, ids2, test_size=TEST_SIZE, random_state=42)

In [57]:
clf,prob=train_test_final(val_train,val_test,y_train,30,100)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

so much overcome by the harsh Command, that he actually shed tears." Heywood then cited a different reason for staying aboard Bounty: Bligh's launch was overloaded, and its destruction would be assured "by the least addition to their Number". Finally, Heywood maintained he had intended to join Bligh but had been stopped: "...on hearing it suggested that I should be deem'd Guilty if I staid in the Ship, I went down directly, and in passing Mr
1
1
Groves replied that he could, but if he did then 'Portal will not see very much'." For his efforts, Chadwick received a knighthood in the New Year Honours on 1 January 1945. He considered this to be a recognition of the work of the whole Tube Alloys project.
1
1
Lincoln was loath to give up Sumter, feeling it would only encourage the South in its insurgency.
1
1
She was evidently afraid to give up this outfit even temporarily because it was likely to be confiscated by the judge and she would thereby be left without protection. A woman's dress o