In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
df = pd.read_csv('liar_plus/train2.tsv', delimiter='\t', header = None)

In [3]:
df = df.drop(columns = [0])

In [4]:
df.rename({1: 'id', 2: 'label', 3: 'statement', 4: 'subject', 5: 'speaker', 6: 'job-title',
           7: 'state_info', 8: 'party_affiliation', 9: 'barely_true_counts', 10: 'false_counts',
           11: 'half_true_counts', 12: 'mostly_true_counts', 13: 'pants_on_fire_counts', 14: 'context',
           15: 'justification'
          }, axis = 1, inplace = True)

In [5]:
df['party_affiliation'].value_counts()

party_affiliation
republican                      4497
democrat                        3336
none                            1744
organization                     219
independent                      147
newsmaker                         56
libertarian                       40
activist                          39
journalist                        38
columnist                         35
talk-show-host                    26
state-official                    20
labor-leader                      11
tea-party-member                  10
business-leader                    9
green                              3
education-official                 2
liberal-party-canada               1
government-body                    1
Moderate                           1
democratic-farmer-labor            1
ocean-state-tea-party-action       1
constitution-party                 1
Name: count, dtype: int64

In [6]:
uninformative = {'organization', 'newsmaker', 'activist', 'state-official', 'government-body',
'journalist', 'columnist', 'talk-show-host', 'education-official', 'business-leader', 
 'Moderate', 'democratic-farmer-labor', 'ocean-state-tea-party-action' }
df = df[~df['party_affiliation'].isin(uninformative)]

In [7]:
df = df[~(df['statement'].isna() | df['party_affiliation'].isna())]

In [8]:
df['party_affiliation'].value_counts()

party_affiliation
republican              4497
democrat                3336
none                    1744
independent              147
libertarian               40
labor-leader              11
tea-party-member          10
green                      3
liberal-party-canada       1
constitution-party         1
Name: count, dtype: int64

# Baseline model (non-deep)

In [9]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", #"Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
         #"Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

X, y = df['statement'], df['party_affiliation']


X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.2)

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)

X_test = tfidf.transform(X_test)

# TODO (Apply): All cross-validation

max_score = 0.0
max_class = ''
# iterate over classifiers
for name, clf in zip(names, classifiers):
    
    print(clf)
    
    clf.fit(X_train, y_train)
    score = 100.0 * clf.score(X_test, y_test)
    print('Classifier = %s, Score (test, accuracy) = %.2f,' %(name, score))
    
    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))

KNeighborsClassifier(n_neighbors=2)
Classifier = Nearest Neighbors, Score (test, accuracy) = 42.03,
SVC(C=0.025, kernel='linear')
Classifier = Linear SVM, Score (test, accuracy) = 45.81,
SVC(C=1, gamma=2)
Classifier = RBF SVM, Score (test, accuracy) = 51.94,
DecisionTreeClassifier(max_depth=5)
Classifier = Decision Tree, Score (test, accuracy) = 46.53,
RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)
Classifier = Random Forest, Score (test, accuracy) = 45.81,
MLPClassifier(alpha=1, max_iter=1000)




Classifier = Neural Net, Score (test, accuracy) = 51.07,
AdaBoostClassifier()
Classifier = AdaBoost, Score (test, accuracy) = 45.81,
--------------------------------------------------------------------------------
Best --> Classifier = RBF SVM, Score (test, accuracy) = 51.94
