In [None]:
import string
import csv
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from scipy.sparse import hstack
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.preprocessing import MinMaxScaler
import pickle
import scipy.sparse
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Neural net imports
import keras
import tensorflow as tf
from keras.models import Sequential
from keras import layers
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import History 
from keras.models import load_model
from keras import optimizers

In [None]:
# Read the tab-separated train/test files into arrays of strings.
# NOTE(review): np.genfromtxt treats '#' as a comment marker by default, so any
# text field containing '#' would be cut short — confirm the cleaned files have none.
_read_opts = {"encoding": "utf-8", "delimiter": "\t", "dtype": str}
train = np.genfromtxt("train_clean.csv", **_read_opts)
test = np.genfromtxt("test_clean.csv", **_read_opts)

In [None]:
# Convert the raw text into TF-IDF features.
# The vectorizer gets its own name: binding it to `tf` would overwrite the
# `import tensorflow as tf` alias created in the neural-net imports cell.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(train[:, 0])  # column 0 of `train` is fed to the vectorizer
y = train[:, 1]                       # column 1 of `train` is used as the target
# Transform the test text with the vocabulary/weights fitted on the training text.
# NOTE: this rebinds `test` from the loaded array to a feature matrix, so the
# loading cell must be re-run before this cell can be executed a second time.
test = tfidf.transform(test)
encoder = LabelBinarizer()
X_dim1 = X.shape[1]  # number of columns (features) in X

Defining all the models:

In [None]:
# Candidate classifiers with their chosen hyper-parameters.
mnb_model = MultinomialNB(alpha=0.175)
# The linear SVM is wrapped in CalibratedClassifierCV so that it can
# predict probabilities.
svm_model = CalibratedClassifierCV(
    LinearSVC(C=0.25, loss='squared_hinge', penalty='l2')
)
logit_model = LogisticRegression(C=2.4)
tree_model = DecisionTreeClassifier(max_depth=1000)
# Must be an *instance*: binding the bare class would make
# `model.fit(X_train, y_train)` fail with a TypeError.
rf_model = RandomForestClassifier()

models = [mnb_model, svm_model, logit_model, tree_model, rf_model]

Cross-validation function:

In [None]:
def kfold(X, y, model, n, random_state=None):
    """Estimate the accuracy of ``model`` over ``n`` random train/test splits.

    Despite its name this is repeated random hold-out validation: every round
    draws a fresh random split with a fraction ``1/n`` of the samples held out,
    so the held-out sets of different rounds may overlap.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and ``model.fit``.
    y : labels aligned with the rows of ``X``.
    model : estimator exposing ``fit`` and ``predict``. It is refitted in place
        on every round.
    n : int, at least 2. Number of rounds; also sets the held-out fraction
        ``1/n``.
    random_state : int, optional. When given, round ``i`` is split with seed
        ``random_state + i`` so the whole run is reproducible. The default
        (``None``) keeps the original unseeded behaviour.

    Returns
    -------
    list of float
        The accuracy of each round, in order. The per-round accuracies and
        their mean are also printed.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 2 (no valid held-out fraction exists).
    """
    if n < 2:
        raise ValueError("n must be at least 2, got " + str(n))

    accuracies = []
    for i in range(n):
        print(i)  # progress indicator: index of the current round
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=(1 / n), random_state=seed
        )
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Element-wise comparison replaces the manual counting loop.
        expected = np.asarray(y_test)
        hits = int((np.asarray(predictions) == expected).sum())
        accuracies.append(hits / expected.shape[0])
    print("Your accuracies are: " + str(accuracies))
    print("Your average accuracy is: " + str(sum(accuracies) / len(accuracies)))
    return accuracies

Cross-validating each model individually:

In [None]:
# Score every standalone classifier using 20 random hold-out rounds each.
rounds_per_model = 20
for model in models:
    kfold(X, y, model, rounds_per_model)

Ensembling:

In [None]:
# Soft-voting ensembles built from the individual classifiers defined earlier.
mnb_svm = VotingClassifier(
    estimators=[('mnb', mnb_model), ('svm', svm_model)],
    voting='soft', n_jobs=6)
mnb_svm_logit = VotingClassifier(
    estimators=[('mnb', mnb_model), ('svm', svm_model), ('logit', logit_model)],
    voting='soft', n_jobs=6)
mnb_logit = VotingClassifier(
    estimators=[('mnb', mnb_model), ('logit', logit_model)],
    voting='soft', n_jobs=6)

# The first entry must be `mnb_svm`; no variable named `mnb_svm_classifier`
# is defined, so referencing it raised a NameError.
ensembles = [mnb_svm, mnb_svm_logit, mnb_logit]

In [None]:
# Score every voting ensemble using 10 random hold-out rounds each.
rounds_per_ensemble = 10
for ensemble in ensembles:
    kfold(X, y, ensemble, rounds_per_ensemble)

Fitting each ensemble on the full training data and outputting its predictions for the test set:

In [None]:
# Refit each ensemble on all of the training data, in the same order as before.
for ensemble in (mnb_svm, mnb_svm_logit, mnb_logit):
    ensemble.fit(X, y)

# Predict labels for the transformed test set with each fitted ensemble.
mnb_svm_preds, mnb_svm_logit_preds, mnb_logit_preds = (
    mnb_svm.predict(test),
    mnb_svm_logit.predict(test),
    mnb_logit.predict(test),
)

In [None]:
# Write each ensemble's predictions to its own single-column CSV file,
# one prediction per row.
prediction_files = [
    ('mnb_svm_predictions.csv', mnb_svm_preds),
    ('mnb_svm_logit_predictions.csv', mnb_svm_logit_preds),
    ('mnb_logit_predictions.csv', mnb_logit_preds),
]
for out_path, preds in prediction_files:
    # The `with` block closes (and therefore flushes) the file; previously the
    # handles were never closed. newline='' leaves line endings to the csv
    # writer, as the csv module documentation recommends.
    with open(out_path, 'w', newline='') as out_file:
        wtr = csv.writer(out_file, delimiter=',', lineterminator='\n')
        wtr.writerows([p] for p in preds)