# Sklearn Classifiers
Classifiers trained only on claim metadata (date, id, claimant, and number of related articles). No article content is used.

In [1]:
import pandas as pd
import json
import numpy as np
import pickle
# from datacup_utils import score1

## Getting the data

In [2]:
# Maps the integer class ids stored in the "label" column to readable names.
labels_dict = {0:"FALSE", 1:"PARTIALLY_TRUE", 2:"TRUE"}


def get_data_from_json(path_to_data="data/train.json"):
    """Load the labelled claims from a JSON file and print a short summary.

    Parameters
    ----------
    path_to_data : str, optional
        Location of the JSON file to read. Defaults to "data/train.json",
        the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_json``. It must contain a ``label``
        column, which is used for the class summary.

    Notes
    -----
    Prints the number of rows, columns and distinct labels, followed by the
    per-label row counts. Label ids missing from ``labels_dict`` are shown
    as "UNKNOWN" instead of raising ``KeyError``.
    """
    data = pd.read_json(path_to_data)
    classes = data.label.unique()

    print("There is a total of %s observations." % (len(data),))
    print("There is a total of %s features." % (len(data.columns),))
    print("There is a total of %s classes." % (len(classes),))
    # Wrap in a tuple so the array is always formatted as one value.
    print("Classes are: %s" % (classes,))
    print("Here's the count of labels:")
    # A plain loop: the previous list comprehension was built only for its
    # print side effects and discarded.
    for label, count in data.label.value_counts().items():
        label_name = labels_dict.get(label, "UNKNOWN")
        print("    " + label_name + " (" + str(label) + "): " + str(count))
    return data
# Load the full labelled dataset once; the call also prints the summary shown below.
data = get_data_from_json()

There is a total of 15555 observations.
There is a total of 6 features.
There is a total of 3 classes.
Classes are: [0 2 1]
Here's the count of labels:
    FALSE (0): 7408
    PARTIALLY_TRUE (1): 6451
    TRUE (2): 1696


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
feature_frame = data[["date","claim","id","claimant","related_articles"]]
label_frame = data[['label']]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    label_frame,
    test_size=0.3,
    random_state=24,
)

# Persist each split so later cells can reload it from disk.
split_files = {
    "X_train.json": X_train,
    "X_test.json": X_test,
    "y_train.json": y_train,
    "y_test.json": y_test,
}
for split_file, split_frame in split_files.items():
    split_frame.to_json(split_file)

## Create Features
#### Using only the training set --> X_train.json

In [4]:
# Reload only the training split written by the split cell, so the features
# below are built without looking at the held-out test rows.
X, y = (pd.read_json(split_file) for split_file in ("X_train.json", "y_train.json"))

In [5]:
# Frequency of each claimant in the training split (most frequent first).
X["claimant"].value_counts()

                                                 3481
Donald Trump                                      895
Bloggers                                          258
Barack Obama                                      163
Hillary Clinton                                   155
Viral image                                        98
Bernie Sanders                                     74
Ted Cruz                                           74
Facebook posts                                     73
Various websites                                   73
Marco Rubio                                        67
Scott Walker                                       62
John McCain                                        58
Rick Perry                                         57
Rick Scott                                         51
Facebook user                                      48
multiple sources                                   46
Chain email                                        44
Mike Pence                  

In [6]:
# Feature engineering on the training split.

# Number of related articles attached to each claim.
X["num_related_articles"] = X["related_articles"].apply(len)

# Convert the date column to a numeric dtype.
# NOTE(review): presumably a datetime after the JSON round-trip - confirm.
X['date'] = pd.to_numeric(X['date'])

# Claimants seen 3 times or fewer are merged into one 'Other' bucket before
# encoding, which keeps the number of one-hot columns down.
s = X['claimant'].value_counts()
rare_claimants = s.index[s <= 3]
X['claimant'] = np.where(X['claimant'].isin(rare_claimants), 'Other', X['claimant'])

# Claimant as one-hot encoding; columns are named "claimant_<value>".
dummies = pd.get_dummies(X.claimant, prefix="claimant", prefix_sep="_")

# Final design matrix: numeric columns plus claimant dummies, with the
# columns sorted alphabetically.
numeric_columns = ["date","id", "num_related_articles"]
X = pd.concat([X[numeric_columns], dummies], axis=1).sort_index(axis=1)

claimant_columns = dummies.columns.values
col_in_train = np.append(claimant_columns, numeric_columns)

# Save the claimant column names seen in training. Despite the .txt
# extension, the file is a binary pickle.
with open("claiment_list.txt", "wb") as fp:
    pickle.dump(claimant_columns, fp)

## Train all classifiers

In [6]:
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.model_selection import learning_curve, GridSearchCV


from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# score1 is the metric passed to make_scorer below. Its import at the top of
# the notebook is commented out, so without this line the cell only works
# when score1 is left over in the kernel from an earlier session.
from datacup_utils import score1

# Candidate models. The stochastic estimators share the random_state used for
# the train/test split so repeated runs give comparable scores.
# The commented-out entries are alternatives that are currently disabled.
classifiers = [
#     KNeighborsClassifier(59,n_jobs=-1),
    DecisionTreeClassifier(max_depth=8, random_state=24),
    RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=-1, random_state=24),
    AdaBoostClassifier(n_estimators=96, learning_rate=1.5, random_state=24),
    GaussianNB(),

#     SVC(kernel="linear", C=0.025),
#     SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
#     MLPClassifier(alpha=1, max_iter=1000),
#     QuadraticDiscriminantAnalysis()
]

# Display names, in the same order as `classifiers`. Each name is also used
# as the saved model's file name, so the strings are left exactly as they
# were (including the "Naive Babes" spelling) to keep existing paths valid.
names = [
#          "Nearest Neighbors",
         "Decision Tree",
         "Random Forest",
         "AdaBoost",
         "Naive Babes",

#          "Linear SVM",
#          "RBF SVM",
#          "Gaussian Process",
#          "Neural Net",
#          "QDA"
        ]

# The scorer does not depend on the classifier, so build it once.
scorer = make_scorer(score1, greater_is_better=True)

# Create the output directory up front; otherwise the first save fails
# after the cross-validation has already been paid for.
os.makedirs("models", exist_ok=True)

# Mean cross-validated score of each classifier, in `classifiers` order.
weights = []
for clf, name in zip(classifiers, names):
    print(name)
    # 3-fold cross-validation that also returns the estimator fitted on
    # each fold.
    trained_models = cross_validate(clf, X, y.values.ravel(),
                                    cv=3, scoring=scorer,
                                    return_estimator=True)
    results = trained_models['test_score']

    print("MAX: %s\nMIN: %s\nAVG: %s\n" % ('{:.2%}'.format(max(results)),
                                           '{:.2%}'.format(min(results)),
                                           '{:.2%}'.format(np.mean(results))))
    weights.append(np.mean(results))

    # Keep the estimator from the best-scoring fold (first one on ties).
    # NOTE(review): that estimator was fitted on two of the three folds only.
    best_fold = int(np.argmax(results))
    best_trained_model = trained_models['estimator'][best_fold]

    # A context manager closes the file even if pickling fails.
    with open("models/" + name + ".sav", 'wb') as model_file:
        pickle.dump(best_trained_model, model_file)

Decision Tree
MAX: 41.91%
MIN: 40.93%
AVG: 41.46%

Random Forest
MAX: 41.81%
MIN: 40.49%
AVG: 41.19%

AdaBoost


  'precision', 'predicted', average, warn_for)


MAX: 41.92%
MIN: 40.02%
AVG: 40.75%

Naive Babes
MAX: 35.61%
MIN: 35.24%
AVG: 35.39%



  'precision', 'predicted', average, warn_for)
