# In [1]:
import getAllData as gd
import numpy as np
import pandas as pd
import random
import json
import re

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB, ComplementNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.util import mark_negation

# Load the merged dataset, using the first CSV column as the row index.
data = pd.read_csv('merged.csv', index_col=0)

# Tokenizer callable: each token is a maximal run of ASCII letters/digits.
token = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize

# Build the feature matrix from a copy of `data`, so the helper is never
# handed the original frame.
# NOTE(review): gd.stemData / gd.tfidf live in getAllData and are not visible
# here; presumably they stem the text and produce TF-IDF bag-of-words
# features, with 'bow_stem_tfidf.txt' as an output/cache path -- confirm.
# NOTE(review): features are computed on the full dataset before splitting;
# if gd.tfidf fits a vectorizer, its statistics include validation/test rows
# -- confirm whether that leakage is acceptable.
stemmed = gd.stemData(data.copy())
bow_stem_tfidf_data = gd.tfidf(stemmed, token, 'bow_stem_tfidf.txt')

# Two-stage split with a fixed seed: first hold out 20% as the test set, then
# take 25% of the remaining 80% as validation, i.e. 60/20/20 overall.
# The test split is scored in the evaluation loop further down.
X_dev, X_test, y_dev, y_test = train_test_split(
    bow_stem_tfidf_data,
    data['Rating'],
    test_size=0.2,
    random_state=1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    test_size=0.25,
    random_state=1,
)


# Candidate models as (estimator, display label) pairs, in evaluation order:
# linear / naive Bayes models, then k-nearest-neighbours for several k, then
# decision trees with different depth/criterion settings.
classifiers = (
    [
        (LogisticRegression(max_iter=1000), 'Logistic (max_iter:1000)'),
        (MultinomialNB(), 'Multinomial NB'),
        (LinearSVC(max_iter=15000), 'Linear SVC (max_iter:15000)'),
    ]
    + [
        # Labels come out as 'KNN K:1', 'KNN K:3', 'KNN K:5'.
        (KNeighborsClassifier(n_neighbors=k), 'KNN K:' + str(k))
        for k in (1, 3, 5)
    ]
    + [
        (DecisionTreeClassifier(), 'DT (Baseline)'),
        (
            DecisionTreeClassifier(criterion="entropy", max_depth=3),
            'DT (criterion:entropy, max_depth:3)',
        ),
        (DecisionTreeClassifier(max_depth=5), 'DT (max_depth:5)'),
    ]
)

def _report_split(clf, label, X, y):
    """Print accuracy (in percent) and the confusion matrix for one split.

    Args:
        clf: An already fitted estimator exposing ``predict``.
        label: Split name used in the printed header ('Train', 'Valid', ...).
        X: Feature matrix of the split.
        y: True labels of the split.

    Returns:
        The predictions ``clf.predict(X)``.
    """
    predicted = clf.predict(X)
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
    # symmetric, but keeping the documented order stays correct if the metric
    # is later replaced by an asymmetric one (precision, recall, ...).
    accuracy = metrics.accuracy_score(y, predicted)
    print('STEM TFIDF-' + label + ' ' + str(accuracy * 100))
    print(metrics.confusion_matrix(y, predicted))
    return predicted


# Fit every candidate on the training split, then report it on the train,
# validation and test splits.
# NOTE(review): the test split is scored for every candidate; if these numbers
# are used to pick a model, the test set no longer acts as an untouched
# hold-out.
for clf, name in classifiers:
    print('------------------------------------------------------')
    print(name)
    clf.fit(X_train, y_train)
    predictTrain = _report_split(clf, 'Train', X_train, y_train)
    predictval = _report_split(clf, 'Valid', X_val, y_val)
    predicttest = _report_split(clf, 'Test', X_test, y_test)