# Predict Citations with Word to Watch Vectors
Here we try to predict whether a sentence needs a 'citation needed' tag, based on:
1) Word to watch vectors only (English, all languages)
2) Word to watch vectors + main-section indicator


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): the next line imports train_test_split a second time and, being
# later, shadows the sklearn.model_selection version imported above.
# sklearn.cross_validation is the legacy module (deprecated in scikit-learn 0.18
# and removed in 0.20 -- confirm against the installed version), so this line
# is a candidate for removal.
from sklearn.cross_validation import train_test_split
from random import shuffle
import numpy as np
import hashlib

# Candidate values for RandomForestClassifier(max_depth=...) in the grid search.
DEPTH_SEARCH=[30, 50, 100, 200]
# Candidate values for RandomForestClassifier(n_estimators=...) in the grid search.
NTREES_SEARCH=[30, 50, 100, 200]
# Fraction of the samples held out as test set by train_test_split(test_size=...).
TEST_SIZE=0.33



In [2]:
# Per-language file paths, keyed by language code; filled in by load_languages().
filenames={} # vector feature files
filenames_raw={} # raw-text files
filenames_ww={} # 'ww_' (word to watch) feature files
"""
to use this, you will need: 
1) features from https://drive.google.com/open?id=1JZu67psmj2Eou2-8wQEJk4kAQfg8GDs2, to be placed in ../fastText_multilingual/features
"""
# Language codes and the matching long names used in the raw-data file names.
# load_languages() zips the two lists, so they must stay in the same order.
languages=['en','fr','it']
language_extended=['english','french','italian']
# Path prefixes; load_languages() appends the language code / name and '.tsv'.
feadir='../fastText_multilingual/features/all_'
rawdir='../data_clean/all_'

def load_languages():
    """
    Register, for every configured language, the paths of its three input files.

    Only path strings are built here; no file is opened. The module-level
    dictionaries filenames, filenames_ww and filenames_raw are updated in
    place, keyed by language code.
    """
    for code,long_name in zip(languages,language_extended):
        filenames[code]='%s%s.tsv' % (feadir,code) #vector features
        filenames_ww[code]='%sww_%s.tsv' % (feadir,code) #word-to-watch features
        filenames_raw[code]='%s%s.tsv' % (rawdir,long_name) #raw text

def count_negatives(negatives,positives):
    """
    Report the share of negatives per language and count all negatives.

    negatives: dict language -> list of negative statement ids
    positives: dict language -> list of positive statement ids
    Prints, for each language in the module-level `languages` list, the
    fraction negatives / (negatives + positives).
    returns the number of negatives summed over all languages (needed to
    build balanced data)
    """
    share_by_language={}
    total_negatives=0
    for code in languages:
        n_neg=len(negatives[code])
        n_pos=len(positives[code])
        share_by_language[code]=float(n_neg)/(n_neg+n_pos)
        total_negatives+=n_neg
    print('proportion of negatives per language')
    print(share_by_language)
    return total_negatives

def get_values_for_crossvalidation(positives,negatives,features):
    """
    Build a balanced, shuffled data set for cross-validation.

    For every language in the module-level `languages` list, all negatives are
    kept together with a random subset of the positives of the same size
    (fewer when there are not enough positives). Labels are read from the
    module-level `labels` dict.

    positives: dict language -> list of positive statement ids (not modified)
    negatives: dict language -> list of negative statement ids
    features: list of feature dictionaries, one per feature type, each mapping
              a statement id to either a single int or a sequence of numbers;
              all features of a statement are concatenated into one vector
    returns
    y2: labels
    values2: feature vectors (numpy arrays passed through np.nan_to_num)
    ids2: statement ids
    the three lists share the same random order
    """
    values=[]
    y=[]
    ids=[]
    for lan in languages:
        #shuffle a copy, so the caller's list of positives keeps its order
        candidates=list(positives[lan])
        shuffle(candidates)
        #the set drops any id that would otherwise appear twice
        selected=set(negatives[lan]+candidates[:len(negatives[lan])])
        for uid in selected:
            v=[]
            for f in features: #for every type of feature
                if isinstance(f[uid], int):
                    v.append(f[uid]) #scalar feature
                else:
                    v.extend(f[uid]) #vector feature: append all its elements
            values.append(np.nan_to_num(np.asarray(v)))
            y.append(labels[uid])
            ids.append(uid)
    #reshuffle everything for cross-validation
    #list(...) because shuffle needs a mutable sequence (range is lazy on Python 3)
    ind=list(range(len(y)))
    shuffle(ind)
    y2=[y[i] for i in ind]
    values2=[values[i] for i in ind]
    ids2=[ids[i] for i in ind]
    return y2,values2,ids2

def perform_gridsearch_withRFC(values,y):
    """
    Cross-validated grid search over random forest depth and number of trees.

    values: list of feature vectors
    y: labels
    Every (max_depth, n_estimators) pair from DEPTH_SEARCH x NTREES_SEARCH is
    scored with cross_val_score; the per-fold scores are printed as they are
    computed, followed by the best combination and its mean score.
    returns
    max_ind: best combination, as the string '<depth> <n_estimators>'
    max_val: mean cross-validation score of that combination
    scores: dict with the mean score of every '<depth> <n_estimators>' string
    """
    mean_score_by_params={}
    #cross-validate every combination
    for depth in DEPTH_SEARCH:
        for n_trees in NTREES_SEARCH:
            forest=RandomForestClassifier(max_depth=depth, n_estimators=n_trees)
            fold_scores=cross_val_score(forest, values, y)
            print(fold_scores)
            mean_score_by_params[' '.join([str(depth),str(n_trees)])]=np.mean(fold_scores)
    #pick the combination with the highest mean score
    best_key=''
    best_mean=0
    for key,mean_score in mean_score_by_params.items():
        if mean_score>best_mean:
            best_key,best_mean=key,mean_score
    print(best_key)
    print(best_mean)
    return best_key,best_mean,mean_score_by_params

def train_test_final(val_train,val_test,y_train,d,n):
    """
    Fit one Random Forest classifier on the training split and score the test split.

    val_train: training feature vectors
    val_test: test feature vectors
    y_train: training labels
    d: max_depth of the forest
    n: n_estimators of the forest
    returns the fitted model and its predict_proba output on the test set
    """
    forest=RandomForestClassifier(n_estimators=n, max_depth=d)
    forest.fit(val_train,y_train)
    test_probabilities=forest.predict_proba(val_test)
    return forest,test_probabilities


In [3]:

def print_top_bottom_sentences(prob,ids_test,y_test,text,labels,n_top=10,n_bottom=40):
    """
    Print the test statements ranked highest and lowest by the probability in
    column 1 of prob.

    prob: per-sample class probabilities (rows aligned with ids_test), e.g. the
          second value returned by train_test_final
    ids_test: statement ids of the test samples
    y_test: labels of the test samples
    text: dict statement id -> raw text
    labels: dict statement id -> label, printed next to y_test as a sanity check
    n_top: how many of the highest-ranked statements to print (default 10)
    n_bottom: how many of the lowest-ranked statements to print (default 40)

    For each of the n_top statements it prints text, y_test value and label;
    after a separator line, for each of the n_bottom statements it prints
    text, y_test value, probability and label.
    """
    pos_proba=(np.asarray(prob).T)[1]
    indexes=np.argsort(-np.asarray(pos_proba)) #highest probability first
    for i in indexes[:n_top]:
        print(text[ids_test[i]])
        print(y_test[i])
        print(labels[ids_test[i]]) #checking
    print('********************************')
    #explicit start index: a plain indexes[-n_bottom:] would return everything for n_bottom=0
    for i in indexes[max(len(indexes)-n_bottom,0):]:
        print(text[ids_test[i]])
        print(y_test[i])
        print(pos_proba[i])
        print(labels[ids_test[i]]) #checking

Let's first build the paths to the label, vector and raw-text files for each language (the data itself is read in the next step).


In [4]:
# Fill filenames, filenames_ww and filenames_raw with the per-language paths.
load_languages()

Now we load all data for prediction into different variables

In [7]:
"""
raw header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	statement label
feature header is:
entity_id	revision_id	timestamp entity_title	section	start	offset	 label feature
"""
labels={} #whether it needs a citation or not
vectors={} #the word vectors aligned to english
main={} #is it the main section? (1 or 0)
language={} #which language is the article from
pages={} #length of the page (not filled in this cell)
start={} #starting point of the statement in the page (not filled in this cell)
pagelength={} #page length, this is for future use, if we want to track where the statement is placed in the page
positives={} #per language: ids of the statements with label 1
negatives={} #per language: ids of all other statements
text={} #raw text
ww={} #word-to-watch vectors
for lan in languages:
    positives[lan]=[]
    negatives[lan]=[]
    #each line in fraw and in fww corresponds to the same line in f;
    #the with-statement closes all three files, also when a row fails to parse
    with open(filenames_raw[lan]) as fraw, open(filenames_ww[lan]) as fww, open(filenames[lan]) as f:
        #for each line in the vector file, record various parameters and then store the corresponding raw text with the same identifier
        for line in f:
            #unique identifier of this line; sha224 needs bytes, so text lines are encoded first
            unique=hashlib.sha224(line if isinstance(line, bytes) else line.encode('utf-8')).hexdigest()
            #first, we store the raw statement text from the raw file
            lineraw=fraw.readline() #line with raw text
            #rstrip only removes the newline, so a last line without one is not truncated
            rowraw=lineraw.rstrip('\n').split('\t')
            text[unique]=rowraw[-2] #where the text is placed in the line
            #then the word-to-watch vector from the ww file
            lineww=fww.readline() #line with the word-to-watch features
            rowww=lineww.rstrip('\n').split('\t')
            ww[unique]=[float(r) for r in rowww[-1].split(',')] #the vector is the last column
            #now, we can get features
            row=line.split('\t')
            labels[unique]=int(row[-2]) #where the label sits in the feature file
            #first append to lists of positives and negatives depending on the label
            if labels[unique]==1:
                positives[lan].append(unique)
            else:
                negatives[lan].append(unique)
            #store features
            vectors[unique]=[float(r) for r in row[-1].split(',')]
            main[unique]=1 if row[4]=='MAIN_SECTION' else 0
            language[unique]=lan



In [8]:
# Total number of negatives over all languages; the call also prints the per-language share.
allneg=count_negatives(negatives,positives)

proportion of negatives per language
{'fr': 0.11540378613267856, 'en': 0.07616565498075248, 'it': 0.3998286808424871}


We now select the data for training: all negatives + an equal number of positives, using only the word-to-watch vectors as features

In [9]:
# Balanced, shuffled sample whose only features are the word-to-watch vectors (ww).
y,values,ids=get_values_for_crossvalidation(positives,negatives,[ww])

We now run a grid search to find good random forest parameters (maximum depth and number of trees)

In [10]:
# Cross-validate every DEPTH_SEARCH x NTREES_SEARCH combination; fold scores are printed per combination.
max_ind,max_val,scores=perform_gridsearch_withRFC(values,y)

[ 0.64969092  0.65119849  0.65420044]
[ 0.65800139  0.65772829  0.66165152]
[ 0.66616858  0.6674718   0.66916401]
[ 0.67241168  0.67236403  0.67374928]
[ 0.64721415  0.64827134  0.64861213]
[ 0.65834937  0.66073731  0.6629616 ]
[ 0.67085602  0.67011238  0.67053549]
[ 0.67466328  0.67500461  0.67483419]
[ 0.64987514  0.64982703  0.65018832]
[ 0.66144021  0.6597343   0.66083272]
[ 0.67366029  0.6705013   0.66906166]
[ 0.67460187  0.67375596  0.67610333]
[ 0.65071437  0.65142366  0.64969704]
[ 0.65863594  0.66178126  0.6635757 ]
[ 0.67001678  0.66806542  0.67022845]
[ 0.67675114  0.67289624  0.67620568]
200 200
0.675284352749


In [11]:
# Run the balanced sampling + grid search once per feature combination.
# After the loop, y, values, ids (and max_ind, max_val, scores) hold the results
# of the last combination, 'ww+vecs+main', exactly as with three unrolled runs.
feature_sets=[
    ('ww+main',[ww,main]),
    ('ww+vecs',[ww,vectors]),
    ('ww+vecs+main',[ww,vectors,main]),
]
for set_name,set_features in feature_sets:
    print (set_name)
    y,values,ids=get_values_for_crossvalidation(positives,negatives,set_features)
    max_ind,max_val,scores=perform_gridsearch_withRFC(values,y)

ww+main
[ 0.69910345  0.70247477  0.70181364]
[ 0.71032055  0.70740794  0.7089372 ]
[ 0.71680927  0.71670112  0.71522149]
[ 0.72225406  0.72151147  0.72144436]
[ 0.70184632  0.7010419   0.69814951]
[ 0.71017726  0.71058072  0.70660362]
[ 0.71754616  0.71799071  0.71628592]
[ 0.72331846  0.7233128   0.72220175]
[ 0.69996316  0.70165599  0.70038074]
[ 0.71083228  0.71058072  0.71125031]
[ 0.71828305  0.71823634  0.71606075]
[ 0.72278626  0.7231081   0.7227135 ]
[ 0.69631965  0.70161505  0.69864079]
[ 0.70890818  0.70865658  0.71004258]
[ 0.71750522  0.71858432  0.7171866 ]
[ 0.72284767  0.7242544   0.72197658]
200 200
0.723026215578
ww+vecs
[ 0.67449953  0.67418582  0.67276672]
[ 0.68807058  0.68673367  0.68832392]
[ 0.69992222  0.70071439  0.70500696]
[ 0.70796659  0.70994617  0.71182347]
[ 0.6682155   0.67252779  0.67133382]
[ 0.68583944  0.68792091  0.69133301]
[ 0.70070005  0.70470595  0.70291902]
[ 0.7096246   0.71053978  0.71286744]
[ 0.6706718   0.67598715  0.67370834]
[ 0.6842223

In [12]:
# Hold out TEST_SIZE of the samples, keeping ids aligned with features and labels.
# When the cells are run top to bottom, values, y and ids come from the last
# get_values_for_crossvalidation call above (ww+vecs+main).
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values, y, ids, test_size=TEST_SIZE, random_state=42)

In [13]:
# Final model with max_depth=200 and n_estimators=200 (hard-coded here, not read from max_ind),
# then print the test statements ranked highest and lowest by predicted probability.
clf,prob=train_test_final(val_train,val_test,y_train,200,200)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

While the writer commented that Thief 2X did not perfectly follow the series' tone and that its voice acting was "not the best", they finished by saying that fans of the Thief series had "no excuse not to play T2X." PC Gamer UK's Kieron Gillen wrote that he had expected the mod to be cancelled, given that the "web is full of [...] five-percent finished masterworks from people who aimed far, far too high." After Thief 2X's release, he lauded it as the best Thief fan work, and as "one of the most impressive achievements of any fan community for any game."
1
1
Instead, they sought to make the best use possible of the material. Norris said that her goal was "to make a show that people would say was too good for children ...
1
1
Al genere Transandinomys appartengono roditori di piccole dimensioni, con lunghezza della testa e del corpo tra 100 e 146 mm, la lunghezza della coda tra 99 e 152 mm e un peso fino a 75 g.
1
1
There was difficulty finding a story that was sufficient for a film, and 

We now look at the effect of adding the 'main' feature, i.e. a feature that is 1 if the sentence is in the main section and 0 otherwise

In [11]:
# Balanced, shuffled sample described by the aligned word vectors plus the main-section flag.
y_m,values_m,ids_m=get_values_for_crossvalidation(positives,negatives,[vectors,main])

In [12]:
# Grid search on the vectors+main sample; overwrites max_ind, max_val and scores from earlier runs.
max_ind,max_val,scores=perform_gridsearch_withRFC(values_m,y_m)

[ 0.65954109  0.69052755  0.670459  ]
[ 0.68248676  0.71680722  0.69203609]
[ 0.69562659  0.7050402   0.72302864]
[ 0.69994116  0.72680918  0.70929776]
[ 0.70798196  0.72504413  0.72499019]
[ 0.7050402   0.72327907  0.71734013]
[ 0.65797215  0.66817023  0.67948215]
[ 0.69464601  0.71484605  0.69615536]
[ 0.73386939  0.74073348  0.73891722]
[ 0.72837811  0.75093156  0.75107885]
[ 0.74014513  0.75583448  0.75284425]
[ 0.74740145  0.75406943  0.75304041]
[ 0.67346539  0.64561679  0.65515889]
[ 0.68385958  0.69346931  0.66888976]
[ 0.71876839  0.72955481  0.72812868]
[ 0.73583056  0.74838204  0.74440957]
[ 0.75053932  0.76407139  0.75676736]
[ 0.758384    0.76250245  0.7618674 ]
[ 0.66424789  0.65601098  0.66928207]
[ 0.68425181  0.70111787  0.68340526]
[ 0.72524024  0.73739949  0.73675951]
[ 0.73641891  0.73661502  0.75205963]
[ 0.75544224  0.75583448  0.7565712 ]
[ 0.75975682  0.76564032  0.76579051]
[ 0.6609139   0.69150814  0.66555512]
[ 0.67699549  0.69111591  0.69556689]
[ 0.72680918

In [13]:
# Split the vectors+main sample built in this section (values_m, y_m, ids_m).
# Splitting the plain values, y, ids here would silently reuse the sample left over
# from an earlier experiment instead of the one that was just grid-searched.
val_train, val_test, y_train, y_test, ids_train, ids_test = train_test_split(values_m, y_m, ids_m, test_size=TEST_SIZE, random_state=42)

In [14]:
# Final model with max_depth=100 and n_estimators=200 (hard-coded here, not read from max_ind),
# then print the test statements ranked highest and lowest by predicted probability.
clf,prob=train_test_final(val_train,val_test,y_train,100,200)
print_top_bottom_sentences(prob,ids_test,y_test,text,labels)

Singleton, Brian (2004). Oscar Asche, Orientalism, and British Musical Comedy. Westport, CT: Greenwood Publishing Group. p.75. ISBN978-0-275-97929-4.
0
0
The Plainsman (1936) and John Ford's Stagecoach (1939).
1
1
By 1924, many UFA Members of the Legislative Assembly (MLAs) wanted to see Greenfield leave office, both because they were frustrated with his failings and because they thought it likely that a Greenfield-led government would be defeated in the next election. Their first attempt to replace him failed when Brownlee, their intended replacement, refused to have anything to do with the plan, but a second attempt, in 1925, was successful when Brownlee agreed to take office if Greenfield personally requested that he do so. Greenfield had not wanted the job in the first place, and agreed to resign in Brownlee's favour.
0
0
He considered himself vindicated when a move to override his veto failed in the Senate the following day. Johnson believed that the Radicals would now be isolated