### 3. Classifiers

This notebook to classify the probability of fake news traveling from A to B.

In [8]:
%pylab inline
import pandas as pd, pyprind

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,make_scorer

Populating the interactive namespace from numpy and matplotlib


In [9]:
ls data

[31mClassificationModelInput.csv[m[m*  [31mpol_agg.csv[m[m*
ClassificationModelInput1.csv  [31mpolitifact.csv[m[m*
[31memergent.csv[m[m*                  [31mpolitifact_clean.csv[m[m*
[31mfake_localcentralities.csv[m[m*    [31mreal_localcentralities.csv[m[m*
[31mkey_mutuality_roshan.csv[m[m*      [31msnopes.csv[m[m*
[31mkeys.csv[m[m*


In [10]:
d=pd.read_csv('data/ClassificationModelInput1.csv')
sum(sum(d.isnull())) #No NA
print d.columns

Index([u'Source', u'Destination', u'Monday', u'Health Care', u'Tuesday',
       u'Friday', u'Wednesday', u'Thursday', u'Elections', u'Religion',
       u'Sunday', u'Military', u'Saturday', u'History', u'Total Volume',
       u'TRUE', u'FAKE', u'Source_Real_LocalDegreeCentralities',
       u'Destination_Real_LocalDegreeCentralities',
       u'Source_Real_LocalBetweenness', u'Destination_Real_LocalBetweenness',
       u'Source_Real_LocalCloseness', u'Destination_Real_LocalCloseness',
       u'Source_Real_LocalEigenCentralities',
       u'Destination_Real_LocalEigenCentralities',
       u'Source_Fake_LocalDegreeCentralities',
       u'Destination_Fake_LocalDegreeCentralities',
       u'Source_Fake_LocalBetweenness', u'Destination_Fake_LocalBetweenness',
       u'Source_Fake_LocalCloseness', u'Destination_Fake_LocalCloseness',
       u'Source_Fake_LocalEigenCentralities',
       u'Destination_Fake_LocalEigenCentralities', u'jaccard_coeff_connection',
       u'Neighbor_connection', u'jaccar

In [4]:
d.head()

Unnamed: 0,Source,Destination,Monday,Health Care,Tuesday,Friday,Wednesday,Thursday,Elections,Religion,...,Source_Fake_LocalEigenCentralities,Destination_Fake_LocalEigenCentralities,jaccard_coeff_connection,Neighbor_connection,jaccard_coeff_common_destination,Neighbor_common_destination,jaccard_coeff_common_source,Neighbor_common_source,mutuality_ind,Label
0,www.facebook.com,www.politifact.com,6.0,0.0,3.0,1.0,8.0,4.0,0.0,0.0,...,1.0,1.0,0.053659,11,0.017442,3,0.073171,3,1.0,1
1,nationalreport.net,www.whitehouse.gov,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,...,1.0,1.0,0.04,1,0.0,0,0.0,0,0.0,1
2,www.naturalnews.com,www.cdc.gov,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0,0.0,0,0.0,0,0.0,1
3,www.facebook.com,www.snopes.com,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,...,1.0,1.0,0.028902,5,0.006711,1,0.115385,3,0.0,1
4,www.infowars.com,www.cdc.gov,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0,0.0,0,0.0,0,0.0,1


In [30]:
for col in ['Monday', 'Tuesday', 'Wednesday', 
                            'Thursday','Friday', 'Saturday', 'Sunday']:
    mean = d[col].mean()
    std = d[col].std()
    if std != 0:
        d[col] = d[col].apply(lambda row: (row - mean)/std).apply(lambda row: math.log(row+1))
    else:
        d[col] = 0
d['Label']=d['FAKE'].apply(lambda x: int(x>0))
#d['Label'] = d['FAKE']/(d['TRUE'] + d['FAKE'])
#Check for class imblanace
#print(float(sum(d['Label']==0))/len(d))

#Ready for model
X=d[['Source_Fake_LocalDegreeCentralities',
       'Destination_Fake_LocalDegreeCentralities',
       'Source_Fake_LocalBetweenness', 'Destination_Fake_LocalBetweenness',
       'Source_Fake_LocalCloseness', 'Destination_Fake_LocalCloseness',
       'Source_Fake_LocalEigenCentralities',
       'Destination_Fake_LocalEigenCentralities', 'jaccard_coeff_connection',
       'Neighbor_connection', 'jaccard_coeff_common_destination',
       'Neighbor_common_destination', 'jaccard_coeff_common_source',
       'Neighbor_common_source', 'mutuality_ind','Monday', 'Tuesday', 'Wednesday', 
                            'Thursday','Friday', 'Saturday', 'Sunday']]
                   # 'Elections', 'Health Care', 'Military', 'Religion', 'History']]
Y=d['Label']
d['Monday'].head()

0    2.897463
1   -0.274892
2   -0.274892
3   -0.274892
4   -0.274892
Name: Monday, dtype: float64

In [21]:
len(d)

2734

In [22]:
#Random Forest, GBM

#Make AUC the performance metric

models = {'RF':{'model':RandomForestClassifier(),
                     'Params':{'n_estimators':range(50,70,10),
                              'max_depth':range(2,5),}},
                
         'GBM':{'model':GradientBoostingClassifier(),
                     'Params':{'n_estimators':range(50,100,10),
                               'max_depth':range(2,5),
                               'learning_rate':linspace(0.1,0.5,num=20)}}}

In [23]:
print('Number of features:', len(X.columns))
print('Number of records', len(d))

('Number of features:', 15)
('Number of records', 2734)


In [24]:
def modeleva(cvfolds):
    allmodels={}
    for model in models.keys():
        grid = GridSearchCV(estimator=models[model]['model'], 
                            param_grid=models[model]['Params'],
                            cv=cvfolds, n_jobs=-1,scoring='roc_auc')
        modelstats={}
        modelstats['model']=grid.fit(X, Y)
        modelstats['best_estimator']=modelstats['model'].best_estimator_
        modelstats['best_score']=modelstats['model'].best_score_
        allmodels[model]=modelstats
        del modelstats
        print(model)
    results=pd.DataFrame.from_dict(allmodels).T.sort_values(by='best_score')
    results=results.reset_index()
    return(results)

In [31]:
allresults=modeleva(10)
allresults

RF
GBM


Unnamed: 0,index,best_estimator,best_score,model
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...",0.9808,"GridSearchCV(cv=10, error_score='raise',\n ..."
1,GBM,([DecisionTreeRegressor(criterion='friedman_ms...,0.985638,"GridSearchCV(cv=10, error_score='raise',\n ..."


In [13]:
allresults['best_estimator'][0]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [41]:
bestmodel

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.10000000000000001, loss='deviance',
              max_depth=4, max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=2,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=40, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)