### 3. Classifiers

This notebook to classify the probability of fake news traveling from A to B.

In [2]:
ls data

ClassificationModelInput.csv  pol_agg_new.csv
[31memergent.csv[m[m*                 [31mpolitifact.csv[m[m*
fake_localcentralities.csv    politifact_clean.csv
key_mutuality_roshan.csv      real_localcentralities.csv
keys.csv                      [31msnopes.csv[m[m*


In [3]:
%pylab inline
import pandas as pd, pyprind

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,make_scorer

Populating the interactive namespace from numpy and matplotlib


In [10]:
d=pd.read_csv('data/ClassificationModelInput.csv')
sum(sum(d.isnull())) #No NA
print(d.columns)

Index(['Source', 'Destination', 'Monday', 'Elections', 'Military', 'Thursday',
       'Religion', 'Friday', 'Saturday', 'TRUE', 'History', 'Tuesday',
       'Wednesday', 'Sunday', 'page_url', 'Health Care', 'FALSE',
       'jaccard_coeff_connection', 'Neighbor_connection',
       'jaccard_coeff_common_destination', 'Neighbor_common_destination',
       'jaccard_coeff_common_source', 'Neighbor_common_source',
       'SourceSite_outdeg_real', 'DestSite_outdeg_real',
       'SourceSite_outdeg_fake', 'DestSite_outdeg_fake',
       'SourceSite_indeg_fake', 'DestSite_indeg_fake', 'DestSite_indeg_real',
       'SourceSite_indeg_real', 'mutuality_ind', 'Label'],
      dtype='object')


In [11]:
d.head()

Unnamed: 0,Source,Destination,Monday,Elections,Military,Thursday,Religion,Friday,Saturday,TRUE,...,SourceSite_outdeg_real,DestSite_outdeg_real,SourceSite_outdeg_fake,DestSite_outdeg_fake,SourceSite_indeg_fake,DestSite_indeg_fake,DestSite_indeg_real,SourceSite_indeg_real,mutuality_ind,Label
0,www.facebook.com,www.politifact.com,6,0,0,4,0,1,0,0,...,30,0.0,225,35.0,10.0,75,7,1.0,1.0,0
1,nationalreport.net,www.whitehouse.gov,0,0,0,14,0,0,0,0,...,0,0.0,25,0.0,4.0,23,5,0.0,0.0,0
2,www.naturalnews.com,www.cdc.gov,0,0,0,0,0,0,0,0,...,0,0.0,17,0.0,0.0,18,0,0.0,0.0,0
3,www.facebook.com,www.snopes.com,0,0,0,0,0,2,0,1,...,30,0.0,225,6.0,10.0,33,1,1.0,0.0,0
4,www.infowars.com,www.cdc.gov,0,0,0,0,0,0,0,0,...,0,0.0,21,0.0,3.0,18,0,0.0,0.0,0


In [26]:
features=['Monday', 'Elections', 'Military', 'Thursday',
       'Religion', 'Friday', 'Saturday','History', 'Tuesday',
       'Wednesday', 'Sunday', 'page_url', 'Health Care',
       'jaccard_coeff_connection', 'Neighbor_connection',
       'jaccard_coeff_common_destination', 'Neighbor_common_destination',
       'jaccard_coeff_common_source', 'Neighbor_common_source',
       'SourceSite_outdeg_fake', 'DestSite_outdeg_fake',
       'SourceSite_indeg_fake', 'DestSite_indeg_fake', 'mutuality_ind']

In [27]:
for col in ['Monday', 'Tuesday', 'Wednesday', 
                            'Thursday','Friday', 'Saturday', 'Sunday']:
    mean = d[col].mean()
    std = d[col].std()
    if std != 0:
        d[col] = d[col].apply(lambda row: (row - mean)/std).apply(lambda row: math.log(row+1))
    else:
        d[col] = 0
d['Label']=d['FALSE'].apply(lambda x: int(x>0))
#d['Label'] = d['FAKE']/(d['TRUE'] + d['FAKE'])
#Check for class imblanace
#print(float(sum(d['Label']==0))/len(d))

#Ready for model
X=d[features]
Y=d['Label']
d['Monday'].head()

0    1.797147
1   -0.523984
2   -0.523984
3   -0.523984
4   -0.523984
Name: Monday, dtype: float64

In [28]:
#Random Forest, GBM

#Make AUC the performance metric
models = {'RF':{'model':RandomForestClassifier(),
                     'Params':{'n_estimators':range(50,70,10),
                              'max_depth':range(2,5),}},
                
         'GBM':{'model':GradientBoostingClassifier(),
                     'Params':{'n_estimators':range(50,100,10),
                               'max_depth':range(2,5),
                               'learning_rate':linspace(0.1,0.5,num=20)}}}

In [29]:
print('Number of features:', len(X.columns))
print('Number of records', len(d))

Number of features: 24
Number of records 1286


In [31]:
def modeleva(cvfolds):
    allmodels={}
    for model in models.keys():
        grid = GridSearchCV(estimator=models[model]['model'], 
                            param_grid=models[model]['Params'],
                            cv=cvfolds, n_jobs=-1,scoring='roc_auc')
        modelstats={}
        modelstats['model']=grid.fit(X, Y)
        modelstats['best_estimator']=modelstats['model'].best_estimator_
        modelstats['best_score']=modelstats['model'].best_score_
        allmodels[model]=modelstats
        del modelstats
        print(model)
    results=pd.DataFrame.from_dict(allmodels).T.sort_values(by='best_score')
    results=results.reset_index()
    return(results)

In [32]:
allresults=modeleva(3)
allresults

RF
GBM


Unnamed: 0,index,best_estimator,best_score,model
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...",0.902952,"GridSearchCV(cv=3, error_score='raise',\n ..."
1,GBM,([DecisionTreeRegressor(criterion='friedman_ms...,0.914975,"GridSearchCV(cv=3, error_score='raise',\n ..."


In [13]:
allresults['best_estimator'][0]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [41]:
bestmodel

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.10000000000000001, loss='deviance',
              max_depth=4, max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=2,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=40, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)