In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from naive_bayes import NaiveBayesClassifier

In [4]:
# Feature matrices: header = None makes pandas treat the first line of each
# file as data rather than as column names.
x_train_scaled = pd.read_csv('cleaned/x_train.csv', header = None)
x_test_scaled = pd.read_csv('cleaned/x_test.csv', header = None)

# Labels: read with pandas' default header inference, i.e. the first line of
# each file is consumed as the column name.
# NOTE(review): unlike the feature files these are NOT read with header = None;
# confirm the label files really start with a header row, otherwise the first
# label is lost and the row counts no longer match the feature matrices.
#
# squeeze('columns') turns a one-column DataFrame into a 1-D Series, which is
# the shape scikit-learn estimators expect for `y` (a column-vector DataFrame
# triggers a DataConversionWarning on every fit). A frame with more than one
# column is returned unchanged.
y_train = pd.read_csv('cleaned/y_train.csv').squeeze('columns')
y_test = pd.read_csv('cleaned/y_test.csv').squeeze('columns')

## K-Nearest Neighbors

In [None]:
# Candidate neighbourhood sizes: k = 1, 2, ..., 10.
ks = np.arange(1, 11)

# knn_cv_scores collects one array of per-fold accuracies per k;
# knn_test_scores collects one test-set accuracy per k.
knn_cv_scores, knn_test_scores = [], []

for k in ks:
    knn = KNeighborsClassifier(n_neighbors = k)

    # 10-fold cross-validated accuracy, computed on the training data only.
    fold_accuracies = cross_val_score(
        knn,
        x_train_scaled,
        y_train,
        cv = 10,
        scoring = 'accuracy',
    )
    knn_cv_scores.append(fold_accuracies)

    # Refit on the whole training set, then score on the test set.
    knn.fit(x_train_scaled, y_train)
    test_predictions = knn.predict(x_test_scaled)
    knn_test_scores.append(accuracy_score(y_test, test_predictions))

In [None]:
# Mean cross-validated accuracy (averaged over folds, axis 1) against the
# test-set accuracy for every k. Each curve is labelled so the two lines can
# be told apart when the figure is read on its own.
fig, ax = plt.subplots()
ax.plot(ks, np.array(knn_cv_scores).mean(axis = 1), marker = 'o', label = 'Mean CV accuracy')
ax.plot(ks, knn_test_scores, marker = 'o', label = 'Test accuracy')
ax.set_xticks(ks)
ax.set_xlabel('Number of neighbors (k)')
ax.set_ylabel('Accuracy')
ax.set_title('K-Nearest Neighbors: accuracy vs. k')
ax.legend();

## Linear Support Vector Machine

In [None]:
# Regularization grid: 10 log-spaced values of C from 1e-2 to 1e0.
Cs = np.logspace(-2, 0, 10)

# svm_cv_scores collects one array of per-fold accuracies per C;
# svm_test_scores collects one test-set accuracy per C.
svm_cv_scores, svm_test_scores = [], []

for C in Cs:
    # random_state is fixed so repeated runs give the same model.
    svm = LinearSVC(C = C, random_state = 1, max_iter = 2000)

    # 10-fold cross-validated accuracy, computed on the training data only.
    fold_accuracies = cross_val_score(
        svm,
        x_train_scaled,
        y_train,
        cv = 10,
        scoring = 'accuracy',
    )
    svm_cv_scores.append(fold_accuracies)

    # Refit on the whole training set, then score on the test set.
    svm.fit(x_train_scaled, y_train)
    test_predictions = svm.predict(x_test_scaled)
    svm_test_scores.append(accuracy_score(y_test, test_predictions))

In [None]:
# Mean cross-validated accuracy (averaged over folds, axis 1) against the
# test-set accuracy for every C. The C values are log-spaced, so the x-axis is
# logarithmic; on a linear axis most points would bunch up near the left edge.
fig, ax = plt.subplots()
ax.plot(Cs, np.array(svm_cv_scores).mean(axis = 1), marker = 'o', label = 'Mean CV accuracy')
ax.plot(Cs, svm_test_scores, marker = 'o', label = 'Test accuracy')
ax.set_xscale('log')
ax.set_xlabel('Regularization parameter C')
ax.set_ylabel('Accuracy')
ax.set_title('Linear SVM: accuracy vs. C')
ax.legend();

## Random Forest Classifier

In [None]:
# Forest sizes to try: 10 evenly spaced values from 100 to 1000 (as floats).
ns = np.linspace(100, 1000, 10)

# rfc_cv_scores collects one array of per-fold accuracies per forest size;
# rfc_test_scores collects one test-set accuracy per forest size.
rfc_cv_scores, rfc_test_scores = [], []

for n in ns:
    # linspace produces floats; cast to int before passing as n_estimators.
    n_trees = int(n)
    rfc = RandomForestClassifier(n_estimators = n_trees, random_state = 1, n_jobs = -1)

    # 5-fold cross-validated accuracy, computed on the training data only.
    fold_accuracies = cross_val_score(
        rfc,
        x_train_scaled,
        y_train,
        cv = 5,
        scoring = 'accuracy',
    )
    rfc_cv_scores.append(fold_accuracies)

    # Refit on the whole training set, then score on the test set.
    rfc.fit(x_train_scaled, y_train)
    test_predictions = rfc.predict(x_test_scaled)
    rfc_test_scores.append(accuracy_score(y_test, test_predictions))

In [None]:
# Mean cross-validated accuracy (averaged over folds, axis 1) against the
# test-set accuracy for every forest size. Each curve is labelled so the two
# lines can be told apart when the figure is read on its own.
fig, ax = plt.subplots()
ax.plot(ns, np.array(rfc_cv_scores).mean(axis = 1), marker = 'o', label = 'Mean CV accuracy')
ax.plot(ns, rfc_test_scores, marker = 'o', label = 'Test accuracy')
ax.set_xlabel('Number of trees (n_estimators)')
ax.set_ylabel('Accuracy')
ax.set_title('Random Forest: accuracy vs. number of trees')
ax.legend();

## Logistic Regression

In [None]:
# Candidate values of C for logistic regression: 100 log-spaced points between
# 1e-2 and 1e-1. The grid gets its own name on purpose: the Linear SVM cells
# keep their 10-value grid in `Cs`, and rebinding `Cs` here would make the SVM
# plot cell fail (x and y lengths differ) if it were re-run after this cell.
lreg_Cs = np.logspace(-2, -1, 100)

# LogisticRegressionCV selects C from the grid by internal cross-validation
# on the training data and is returned already fitted.
lreg = LogisticRegressionCV(Cs = lreg_Cs, random_state = 1).fit(x_train_scaled, y_train)

In [None]:
# Test-set accuracy of the cross-validated logistic regression model.
lreg_test_predictions = lreg.predict(x_test_scaled)
accuracy_score(y_test, lreg_test_predictions)

## Bag-of-words Naive Bayes Classifier

In [2]:
# Build the project-local Naive Bayes classifier from the tweets CSV and train
# it. NaiveBayesClassifier comes from the local `naive_bayes` module.
# NOTE(review): how the class splits or preprocesses the file is not visible
# here; train() is the cell's last expression, so whatever it returns is
# displayed as the cell output.
tweets_csv = 'tweets/tweets_10k.csv'
nbc = NaiveBayesClassifier(tweets_csv)

nbc.train()

9953    0
3850    0
4962    0
3886    1
5437    1
       ..
3919    0
162     1
7903    0
2242    1
2745    1
Name: text, Length: 2000, dtype: int64

In [8]:
# Accuracy of the Naive Bayes predictions against the test labels.
# NOTE(review): y_test is loaded from cleaned/y_test.csv, whereas nbc was built
# from tweets/tweets_10k.csv and predict() takes no arguments. Confirm that
# predict() returns one prediction per row of y_test, in the same order;
# otherwise this score compares unrelated rows.
nbc_predictions = nbc.predict()
accuracy_score(y_test, nbc_predictions)

0.7