# Predict Citations with Word Vectors
Here we try to predict whether a sentence needs a 'citation needed' tag or not based on
1) Word vectors only (english, all languages)
2) Word vectors + main sec indicator
3) Word vectors + article position indicator


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from random import shuffle
import numpy as np
import hashlib

DEPTH_SEARCH=[5,10,30, 50, 100, 200]
NTREES_SEARCH=[5,10,30, 50, 100, 200]
TEST_SIZE=0.33



In [2]:
filenames={}
filenames_raw={}

"""
to use this, you will need: 
1) features from https://drive.google.com/open?id=1JZu67psmj2Eou2-8wQEJk4kAQfg8GDs2, to be placed in ../fastText_multilingual/features
"""
languages=['en','fr','it']
language_extended=['english','french','italian']
feadir='../fastText_multilingual/features/'
rawdir='../data_clean/'

def load_languages():
    for lan,lext in zip(languages,language_extended):
        filenames[lan]=feadir+lan+'.tsv' #files with vectors
        filenames_raw[lan]=rawdir+lext+'.tsv' #files with raw text

def count_negatives(negatives,positives):
    """
    for balanced data, we need to know how many negatives are out there
    """
    proportion={}
    allneg=0
    for lan in languages:
        proportion[lan]=len(negatives[lan])/float(len(negatives[lan])+len(positives[lan]))
        allneg+=len(negatives[lan])
    print 'proportion of negatives per language'
    print proportion
    return allneg

def get_values_for_crossvalidation(positives,negatives,features):
    """
    positives: list of positives
    negatives: list of negatives
    features: list of feature dictionaries, per type
    """
    values=[]
    y=[]
    ids=[]
    for lan in languages:
        shuffle(positives[lan])
        alldata=set(negatives[lan]+positives[lan][:len(negatives[lan])])
        ids=ids+list(alldata)
        for id in alldata:
            v=[]
            for f in features: #for every type of feature
                if isinstance(f[id], int):
                    v.append(f[id])
                else:
                    for element in f[id]: #append element of feature
                        v.append(element)
            values.append(np.nan_to_num(np.asarray(v)))
            y.append(labels[id])          
    #reshuffle everything for cross_validaton
    ind=range(len(y))
    shuffle(ind)
    y2=[y[i] for i in ind]
    values2=[values[i] for i in ind]
    ids2=[ids[i] for i in ind]
    return y2,values2,ids2

def perform_gridsearch_withRFC(values,y):
    """
    values: list of feature vectors
    y: labels
    returns
    max_ind: depth and estimator values
    max_val: crossval prediction accuracy
    scores: all-scores for each combination of depth and nestimators
    """
    scores={}
    #performs cross_validation in all combiantions
    for d in DEPTH_SEARCH:
        for n in NTREES_SEARCH:
            clf = RandomForestClassifier(max_depth=d, n_estimators=n)
            s = cross_val_score(clf, values, y)
            print s
            scores[str(d)+' '+str(n)]=np.mean(s)
    #computes best combination of parameters
    max_ind=''
    max_val=0
    for s in scores:
        if scores[s]>max_val:
            max_val=scores[s]
            max_ind=s
    print max_ind
    print max_val
    return max_ind,max_val,scores

def train_test_final(val_train,val_test,y_train,d,n):
    """
    just using a Random Forestc classifier on a train/test split for deployment 
    returns model and probability on the test set
    """
    clf = RandomForestClassifier(max_depth=d, n_estimators=n)
    clf.fit(val_train,y_train)
    prob=clf.predict_proba(val_test)
    return clf,prob

def print_top_bottom_sentences(prob,ids_test,y_test,text,labels):
    """
    here we are displaying the 
    """
    pos_proba=(np.asarray(prob).T)[1]
    indexes=np.argsort(-np.asarray(pos_proba))
    for i in indexes[:10]:
        print text[ids_test[i]]
        print y_test[i]
        print labels[ids_test[i]]#checking
    print ('********************************')
    for i in indexes[-10:]:
        print text[ids_test[i]]
        print y_test[i]
        print pos_proba[i]
        print labels[ids_test[i]]#checking

Let's load labels and vectors.. 


In [3]:
load_languages()

Now we load all data for prediction into different variables

In [4]:
"""
raw header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	statement label
feature header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	 label feature
"""
labels={} #whether it needs a citation or not
vectors={} #the word vectors aligned to english
main={} #is it the main section?
language={} #which language is the article from
pages={} #length of the page
start={} #starting point of the statement in the page
pagelength={} #page length, this is for future use, if we want to track where the statement is placed in the page
positives={}#statements with citation
negatives={}#statements without citation
text={}#raw text
for lan in languages:
    positives[lan]=[] #stores the statements needing a citation
    negatives[lan]=[] #stores the statements without a citation (much less than the positives)
    fraw=open(filenames_raw[lan]) #each line in fraw correspond to the line in f
    #for each line in the vector file, record various parameters and then store the corresponding raw text with the same identifier
    with open(filenames[lan]) as f:
        for line in f:
            unique=hashlib.sha224(line).hexdigest() #unique identifier of this line
            #first, we store the raw statement text from the raw file
            lineraw=fraw.readline() #line with raw text
            rowraw=lineraw[:-1].split('\t')
            text[unique]=rowraw[-2] #where the text is placed in the line
            #now, we can get features
            row=line.split('\t')
            labels[unique]=int(row[-2])#where the label sits in the feature file
            #first append to lists of positives and negatives depending on the label
            if labels[unique]==1:
                positives[lan].append(unique)
            else:
                negatives[lan].append(unique)
            #store features
            vectors[unique]=[float(r) for r in row[-1].split(',')]
            main[unique]= 1 if row[4]=='MAIN_SECTION'else 0
            language[unique]=lan
            pages[unique]=int(row[0])
            beginning=int(row[5])
            offset=int(row[6])
            l=beginning+offset
            try:
                base=pagelength[row[0]]
                pagelength[row[0]]=l if l>base else base
            except:
                pagelength[row[0]]=l
            start[unique]=beginning


In [5]:
allneg=count_negatives(negatives,positives)

proportion of negatives per language
{'fr': 0.06976161231293711, 'en': 0.05151972709224618, 'it': 0.1304069891327509}


We now select the data for training: all negatives + an equal number of positives, using only feature vectors

In [6]:
y,values,ids=get_values_for_crossvalidation(positives,negatives,[vectors])

We now run a grid search to find the good random forest parameter

In [7]:
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y)

[ 0.62306335  0.62678957  0.62750098]
[ 0.6458129   0.64169445  0.64809729]
[ 0.67444597  0.66875858  0.65476658]
[ 0.67268092  0.67326927  0.65692428]
[ 0.67523044  0.67542655  0.66359357]
[ 0.67895666  0.67660326  0.65653197]
[ 0.63384977  0.62208276  0.61690859]
[ 0.65522652  0.65679545  0.64142801]
[ 0.68013336  0.67777996  0.66555512]
[ 0.68346735  0.68797803  0.68105139]
[ 0.70013728  0.69346931  0.68654374]
[ 0.70190233  0.7003334   0.69203609]
[ 0.62169053  0.61816042  0.61082778]
[ 0.62620122  0.62659345  0.62063554]
[ 0.67660326  0.67934889  0.67143978]
[ 0.69170426  0.69425378  0.69301687]
[ 0.70700137  0.70347127  0.70302079]
[ 0.71582663  0.71347323  0.70635543]
[ 0.61404197  0.6156109   0.60690467]
[ 0.62443616  0.63855658  0.63652413]
[ 0.67228868  0.67503432  0.6612397 ]
[ 0.6852324   0.69601883  0.68732836]
[ 0.70719749  0.70857031  0.70145155]
[ 0.71563052  0.71974897  0.70596312]
[ 0.60835458  0.60462836  0.61435857]
[ 0.62502451  0.62541675  0.62181248]
[ 0.66503236

In [9]:
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values, y, ids, test_size=TEST_SIZE, random_state=42)

In [10]:
clf,prob=train_test_final(val_train,val_test,y_train,50,200)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

The Plainsman (1936) and John Ford's Stagecoach (1939).
1
1
Singleton, Brian (2004). Oscar Asche, Orientalism, and British Musical Comedy. Westport, CT: Greenwood Publishing Group. p.75. ISBN978-0-275-97929-4.
0
0
By 1924, many UFA Members of the Legislative Assembly (MLAs) wanted to see Greenfield leave office, both because they were frustrated with his failings and because they thought it likely that a Greenfield-led government would be defeated in the next election. Their first attempt to replace him failed when Brownlee, their intended replacement, refused to have anything to do with the plan, but a second attempt, in 1925, was successful when Brownlee agreed to take office if Greenfield personally requested that he do so. Greenfield had not wanted the job in the first place, and agreed to resign in Brownlee's favour.
0
0
Thorpe spoke to reporters after his 100m race, saying: "When I started this I wanted to get back into the pool. I wanted to start racing again, I wanted to be com

We now lok at the effect of adding the 'main' features, i.e. a feature = 1 if the sentence is in the main section

In [11]:
y_m,values_m,ids_m=get_values_for_crossvalidation(positives,negatives,[vectors,main])

In [12]:
max_ind,max_val,scores=perform_gridsearch_withRFC(values_m,y_m)

[ 0.65954109  0.69052755  0.670459  ]
[ 0.68248676  0.71680722  0.69203609]
[ 0.69562659  0.7050402   0.72302864]
[ 0.69994116  0.72680918  0.70929776]
[ 0.70798196  0.72504413  0.72499019]
[ 0.7050402   0.72327907  0.71734013]
[ 0.65797215  0.66817023  0.67948215]
[ 0.69464601  0.71484605  0.69615536]
[ 0.73386939  0.74073348  0.73891722]
[ 0.72837811  0.75093156  0.75107885]
[ 0.74014513  0.75583448  0.75284425]
[ 0.74740145  0.75406943  0.75304041]
[ 0.67346539  0.64561679  0.65515889]
[ 0.68385958  0.69346931  0.66888976]
[ 0.71876839  0.72955481  0.72812868]
[ 0.73583056  0.74838204  0.74440957]
[ 0.75053932  0.76407139  0.75676736]
[ 0.758384    0.76250245  0.7618674 ]
[ 0.66424789  0.65601098  0.66928207]
[ 0.68425181  0.70111787  0.68340526]
[ 0.72524024  0.73739949  0.73675951]
[ 0.73641891  0.73661502  0.75205963]
[ 0.75544224  0.75583448  0.7565712 ]
[ 0.75975682  0.76564032  0.76579051]
[ 0.6609139   0.69150814  0.66555512]
[ 0.67699549  0.69111591  0.69556689]
[ 0.72680918

In [13]:
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values, y, ids, test_size=TEST_SIZE, random_state=42)

In [14]:
clf,prob=train_test_final(val_train,val_test,y_train,100,200)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

Singleton, Brian (2004). Oscar Asche, Orientalism, and British Musical Comedy. Westport, CT: Greenwood Publishing Group. p.75. ISBN978-0-275-97929-4.
0
0
The Plainsman (1936) and John Ford's Stagecoach (1939).
1
1
By 1924, many UFA Members of the Legislative Assembly (MLAs) wanted to see Greenfield leave office, both because they were frustrated with his failings and because they thought it likely that a Greenfield-led government would be defeated in the next election. Their first attempt to replace him failed when Brownlee, their intended replacement, refused to have anything to do with the plan, but a second attempt, in 1925, was successful when Brownlee agreed to take office if Greenfield personally requested that he do so. Greenfield had not wanted the job in the first place, and agreed to resign in Brownlee's favour.
0
0
He considered himself vindicated when a move to override his veto failed in the Senate the following day. Johnson believed that the Radicals would now be isolated