### 3. Classifiers

This notebook to classify whether fake news will travel from A to B.

In [1]:
%pylab inline
import pandas as pd, pyprind

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,make_scorer

Populating the interactive namespace from numpy and matplotlib


In [2]:
ls data

ClassificationModelInput.csv  pol_agg.csv
[31memergent.csv[m[m*                 [31mpolitifact.csv[m[m*
fake_localcentralities.csv    politifact_clean.csv
key_mutuality_roshan.csv      real_localcentralities.csv
keys.csv                      [31msnopes.csv[m[m*


In [36]:
d=pd.read_csv('ClassificationModelInput.csv')
sum(d.isnull()) #No NA

Source                                      0
Destination                                 0
page_url                                    0
TRUE.                                       0
FALSE.                                      0
Source_Real_LocalDegreeCentralities         0
Destination_Real_LocalDegreeCentralities    0
Source_Real_LocalBetweenness                0
Destination_Real_LocalBetweenness           0
Source_Real_LocalCloseness                  0
Destination_Real_LocalCloseness             0
Source_Real_LocalEigenCentralities          0
Destination_Real_LocalEigenCentralities     0
Source_Fake_LocalDegreeCentralities         0
Destination_Fake_LocalDegreeCentralities    0
Source_Fake_LocalBetweenness                0
Destination_Fake_LocalBetweenness           0
Source_Fake_LocalCloseness                  0
Destination_Fake_LocalCloseness             0
Source_Fake_LocalEigenCentralities          0
Destination_Fake_LocalEigenCentralities     0
jaccard_coeff_connection          

In [37]:
d['Label']=d['FALSE.'].apply(lambda x: int(x>0))

#Check for class imblanace
print(sum(d['Label']==0)/len(d))

#Ready for model
X=d[['Source_Real_LocalDegreeCentralities',
       'Destination_Real_LocalDegreeCentralities',
       'Source_Real_LocalBetweenness', 'Destination_Real_LocalBetweenness',
       'Source_Real_LocalCloseness', 'Destination_Real_LocalCloseness',
       'Source_Real_LocalEigenCentralities',
       'Destination_Real_LocalEigenCentralities',
       'Source_Fake_LocalDegreeCentralities',
       'Destination_Fake_LocalDegreeCentralities',
       'Source_Fake_LocalBetweenness', 'Destination_Fake_LocalBetweenness',
       'Source_Fake_LocalCloseness', 'Destination_Fake_LocalCloseness',
       'Source_Fake_LocalEigenCentralities',
       'Destination_Fake_LocalEigenCentralities', 'jaccard_coeff_connection',
       'Neighbor_connection', 'jaccard_coeff_common_destination',
       'Neighbor_common_destination', 'jaccard_coeff_common_source',
       'Neighbor_common_source', 'mutuality_ind']]
Y=d['Label']

0.559619604974


In [38]:
#Logistic regression, C-Support Vector Classification, GBM

auc_scorer=make_scorer(roc_auc_score,
                       greater_is_better=True)

models = {'RF':{'model':RandomForestClassifier(),
                     'Params':{'n_estimators':range(20,22),
                              'max_depth':range(2,4),
                              'min_samples_leaf':range(5,8)}},
         'GBM':{'model':GradientBoostingClassifier(),
                     'Params':{'n_estimators':range(10,13),
                               'max_depth':range(2,5),
                               'learning_rate':linspace(0.2,0.7,num=3),
                               'min_samples_leaf':range(4,7)}}}

In [39]:
def modeleva(cvfolds):
    allmodels={}
    for model in list(models.keys()):
        grid = GridSearchCV(estimator=models[model]['model'], 
                            param_grid=models[model]['Params'],
                            cv=cvfolds, n_jobs=-1,scoring=auc_scorer)
        modelstats={}
        modelstats['model']=grid.fit(X, Y)
        modelstats['best_estimator']=modelstats['model'].best_estimator_
        modelstats['best_score']=modelstats['model'].best_score_
        allmodels[model]=modelstats
        del modelstats
        print(model)
    results=pd.DataFrame.from_dict(allmodels).T.sort_values(by='best_score')
    results=results.reset_index()
    return(results)

In [40]:
replicates=2
allresults=[modeleva(2) for i in range(replicates)]
allresults=pd.concat(allresults)

GBM
RF
GBM
RF


In [41]:
print((allresults.groupby('index')['best_score'].sum()/replicates).sort_values(ascending=False))
bestmodel=(allresults.groupby('index')['best_score'].sum()/replicates).sort_values(ascending=False).index[0]
bestmodel=allresults[allresults['index']==bestmodel]['best_estimator'].tolist()[0]

index
GBM    0.501230
RF     0.499784
Name: best_score, dtype: float64


#### 2.2 Add Jaccard Dist

This notebook serves to add local centralities in both real and fake news networks to both source and destination websites, so we will know their position within both the real and fake networks.

The centralities are calculated using the R code.

In [4]:
d=pd.read_csv('data/keys.csv',index_col=False)

In [5]:
def association(method):
    jaccard_dict={}
    faked=d[d['FALSE.']>0]
    #all unique websites
    allwebs=set(faked['Source'].append(faked['Destination']))
    bar = pyprind.ProgBar(len(allwebs))
    for site in allwebs:
        if method=='connection':
            sites=set(faked[faked['Source']==site]['Destination'].append(d[d['Destination']==site]['Source']))
        elif method=='common_destination':
            sites=set(faked[faked['Source']==site]['Destination'])
        elif method=='common_source':
            sites=set(faked[faked['Destination']==site]['Source'])
        jaccard_dict[site]=sites
        bar.update()
    return(jaccard_dict)

In [7]:
k=association('connection')

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:05


In [111]:
for i in ['connection','common_destination','common_source']:
    mapper=association(i)
    def jaccard(pair):
        try:
            numerator=len(mapper[pair[0]].intersection(mapper[pair[1]]))
            denom=len(mapper[pair[0]].union(mapper[pair[1]]))
            return numerator/denom
        except:
            return 0
    d['jaccard_coeff'+'_'+i]=[jaccard(pair) for pair in list(zip(d['Source'],d['Destination']))]

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:05
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:02
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:02


In [112]:
d.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Source,Destination,page_url,TRUE.,FALSE.,Source_Real_LocalDegreeCentralities,Destination_Real_LocalDegreeCentralities,Source_Real_LocalBetweenness,...,Destination_Fake_LocalDegreeCentralities,Source_Fake_LocalBetweenness,Destination_Fake_LocalBetweenness,Source_Fake_LocalCloseness,Destination_Fake_LocalCloseness,Source_Fake_LocalEigenCentralities,Destination_Fake_LocalEigenCentralities,jaccard_coeff_connection,jaccard_coeff_common_destination,jaccard_coeff_common_source
0,0,1,www.facebook.com,www.facebook.com,42,5,37,143.333333,143.333333,0.007485,...,1.0,0.0,0.0,0.028924,0.028924,1.0,1.0,1.0,1.0,1.0
1,1,2,www.facebook.com,www.politifact.com,22,0,22,143.333333,143.333333,0.007485,...,1.0,0.0,0.0,0.028924,0.028924,1.0,1.0,0.063415,0.02907,0.121951
2,2,3,nationalreport.net,www.whitehouse.gov,14,0,14,,,,...,1.0,0.0,0.0,0.028315,0.028315,1.0,1.0,0.08,0.0,0.076923
3,3,4,www.youtube.com,www.youtube.com,9,2,7,4.666667,4.666667,0.0,...,1.0,0.0,0.0,0.029065,0.029065,1.0,1.0,1.0,1.0,1.0
4,4,5,www.politifact.com,www.politifact.com,8,0,8,4.666667,4.666667,0.0,...,1.0,0.0,0.0,0.028871,0.028871,1.0,1.0,1.0,1.0,1.0


In [79]:
#Out degree in false & true network for every website
Out_deg_false=dict(d.groupby('Source')['FALSE.'].sum())
Out_deg_true=dict(d.groupby('Source')['TRUE.'].sum())

#In degree in false & true network for every website
in_deg_false=dict(d.groupby('Destination')['FALSE.'].sum())
in_deg_true=dict(d.groupby('Destination')['TRUE.'].sum())

In [83]:
#Outdegrees
d['SourceSite_outdeg_real']=d['Source'].map(out_deg_true)
d['DestSite_outdeg_real']=d['Destination'].map(out_deg_true)
d['SourceSite_outdeg_fake']=d['Source'].map(out_deg_false)
d['DestSite_outdeg_fake']=d['Destination'].map(out_deg_false)

#Indegrees
d['SourceSite_indeg_fake']=d['Source'].map(in_deg_false)
d['DestSite_indeg_fake']=d['Destination'].map(in_deg_false)
d['DestSite_indeg_real']=d['Destination'].map(in_deg_true)
d['SourceSite_indeg_real']=d['Source'].map(in_deg_true)

In [84]:
d.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Source,Destination,page_url,TRUE.,FALSE.,Source_Real_LocalDegreeCentralities,Destination_Real_LocalDegreeCentralities,Source_Real_LocalBetweenness,...,Destination_Fake_LocalEigenCentralities,jaccard_coeff,SourceSite_outdeg_real,DestSite_outdeg_real,SourceSite_indeg_fake,DestSite_indeg_real,SourceSite_outdeg_fake,DestSite_outdeg_fake,DestSite_indeg_fake,SourceSite_indeg_real
0,0,1,www.facebook.com,www.facebook.com,42,5,37,143.333333,143.333333,0.007485,...,1.0,1.0,6,6,47,6,47,47,47,6
1,1,2,www.facebook.com,www.politifact.com,22,0,22,143.333333,143.333333,0.007485,...,1.0,0.053942,6,7,47,7,47,83,83,6
2,2,3,nationalreport.net,www.whitehouse.gov,14,0,14,,,,...,1.0,0.057143,0,5,6,5,6,23,23,0
3,3,4,www.youtube.com,www.youtube.com,9,2,7,4.666667,4.666667,0.0,...,1.0,1.0,4,4,30,4,30,30,30,4
4,4,5,www.politifact.com,www.politifact.com,8,0,8,4.666667,4.666667,0.0,...,1.0,1.0,7,7,83,7,83,83,83,7


#### 2.3 Other features

In [91]:
d.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Source,Destination,page_url,TRUE.,FALSE.,Source_Real_LocalDegreeCentralities,Destination_Real_LocalDegreeCentralities,Source_Real_LocalBetweenness,...,Destination_Fake_LocalEigenCentralities,jaccard_coeff,SourceSite_outdeg_real,DestSite_outdeg_real,SourceSite_indeg_fake,DestSite_indeg_real,SourceSite_outdeg_fake,DestSite_outdeg_fake,DestSite_indeg_fake,SourceSite_indeg_real
0,0,1,www.facebook.com,www.facebook.com,42,5,37,143.333333,143.333333,0.007485,...,1.0,1.0,6,6,47,6,47,47,47,6
1,1,2,www.facebook.com,www.politifact.com,22,0,22,143.333333,143.333333,0.007485,...,1.0,0.053942,6,7,47,7,47,83,83,6
2,2,3,nationalreport.net,www.whitehouse.gov,14,0,14,,,,...,1.0,0.057143,0,5,6,5,6,23,23,0
3,3,4,www.youtube.com,www.youtube.com,9,2,7,4.666667,4.666667,0.0,...,1.0,1.0,4,4,30,4,30,30,30,4
4,4,5,www.politifact.com,www.politifact.com,8,0,8,4.666667,4.666667,0.0,...,1.0,1.0,7,7,83,7,83,83,83,7
