# Paper Grading Assistant

## Modeling

Data comes from this link:
- https://www.kaggle.com/c/asap-aes/data

Heavy inspiration drawn from:
- https://towardsdatascience.com/topic-modeling-articles-with-nmf-8c6b2a227a45

(Use incognito window when opening that link)

In [1]:
# !pip install gensim
import os, sys
from gensim import corpora, models
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re


In [2]:
# Helper Functions
# Run the utilty functions from a seperate notebook
%run topic_model_utils.ipynb


In [None]:
# data = grab_text("D:\\Kaggle\\paul-graham-essays\\paul_graham_essay.txt")
data = pd.read_csv("D:\\Kaggle\\asap-aes\\training_set_rel3.tsv", sep='\t')
# data.head()

In [None]:
data['tokenized_essay'] = data.essay.apply(process_text)
data['max_score'] = 0

In [None]:
# replace NaN w/ 0
data = data.fillna(0)
# add a max_score column to use later 
# for standardizing scores
data['max_score'] = 0
data.head()

In [None]:
# change max score col based on essay set
# max vals:
# set 1: 12
# set 2: 10 or 24, needs some experimenting
# set 3: 3
# set 4: 3
# set 5: 4
# set 6: 4
# set 7: 30
# set 8: 60
essay_sets = data.essay_set.unique()
for set_ in essay_sets:
    print(set_)
    if set_ == 1:
        data.loc[data.essay_set == set_, 'max_score'] = 12
    if set_ == 2:
        data.loc[data.essay_set == set_, 'max_score'] = 10
    if set_ == 3 or set_ == 4:
        data.loc[data.essay_set == set_, 'max_score'] = 3
    if set_ == 5 or set_ == 6:
        data.loc[data.essay_set == set_, 'max_score'] = 4
    if set_ == 7:
        data.loc[data.essay_set == set_, 'max_score'] = 30
    if set_ == 8:
        data.loc[data.essay_set == set_, 'max_score'] = 60
print(data.loc[data.essay_set == 1, 'max_score'])
print(data.loc[data.essay_set == 4, 'max_score'])
print(data.loc[data.essay_set == 7, 'max_score'])
print(data.loc[data.essay_set == 8, 'max_score'])

In [None]:
# create temp column for 
# models later internal classes
data['temp'] = 0
for set_ in essay_sets:
    if set_ == 2:
        data.loc[data.essay_set == set_, 'temp'] = data.loc[data.essay_set==set_,'domain1_score'] + data.loc[data.essay_set==set_,'domain2_score'] / data.loc[data.essay_set==set_,'max_score']
        continue
    else:
        data.loc[data.essay_set == set_, 'temp'] = data.loc[data.essay_set==set_,'domain1_score'] / data.loc[data.essay_set==set_,'max_score']

In [None]:
data['class'] = 1
for x in range(len(data)):
    if (data.temp[x]) >= .9:
        data['class'][x] = 5
        continue
    elif data.temp[x] >= .8 and data.temp[x] < .9:
        data['class'][x] = 4
        continue
    elif data.temp[x] >= .7 and data.temp[x] < .8:
        data['class'][x] = 3
        continue
    elif data.temp[x] >= .6 and data.temp[x] < .7:
        data['class'][x] = 2

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
%run topic_model_utils.ipynb # for debugging

no_features = 1000

# Initialize tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, 
                                   min_df=3, 
                                   max_features=no_features, 
                                   stop_words='english', 
                                   preprocessor=' '.join)
tfidf = tfidf_vectorizer.fit_transform(data['tokenized_essay'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Bag of words
tf_vectorizer = CountVectorizer(max_df=0.85, 
                                min_df=3, 
                                max_features=no_features, 
                                stop_words='english', 
                                preprocessor=' '.join)
tf = tf_vectorizer.fit_transform(data['tokenized_essay'])
tf_feature_names = tf_vectorizer.get_feature_names()

# Word2Vec
word2vec = WordEmbeddingsService()
word2vec_model = word2vec.train_w2v_model(tokenized_text=data['tokenized_essay'])

In [None]:
X_tfidf = tfidf
X_tf = tf
X_w2v = word2vec.create_word_embeddings(data['tokenized_essay'], word2vec_model)
y = data['class']

In [None]:
print((y.unique()))

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

from xgboost import XGBClassifier

In [None]:
def make_classification(classifier, X, y, rs=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = rs)
    try:
        classifier.fit(X_train, y_train)
    except:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
        y_train = np.array(y_train)
        y_test = np.array(y_test)
        classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm, acc_score, prec_score, rec_score = make_confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    return cm, acc_score, f1, prec_score, rec_score

def make_confusion_matrix(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    acc_score = accuracy_score(y_test, y_pred)
    prec_score = precision_score(y_test, y_pred, average='weighted')
    rec_score = recall_score(y_test, y_pred, average='weighted')
    return cm, acc_score, prec_score, rec_score

In [None]:
classifiers = {
    "log_reg": LogisticRegression(random_state=0),
    "knn": KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2),
    "lin_svm" : SVC(kernel = 'linear', random_state = 0), # took too long with word2vec (more than 5000 secs)
    "rbf_svm" : SVC(kernel = 'rbf', random_state = 0),
    "nb" : MultinomialNB(), 
    "tree" : DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    "rf" : RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    "ada" : AdaBoostClassifier(random_state = 0),
    "gb" : GradientBoostingClassifier(random_state = 0),
    "xgb" : XGBClassifier(random_state = 0),
}

In [None]:
# tfidf vectors first, 15 min
tfidf_res = {}
for key in classifiers.keys():
    print(key)
    cm, acc, f1, prec, rec = make_classification(classifiers[key], X_tfidf, y)
    tfidf_res[key] = {
        'cm' : cm,
        'acc' : acc,
        'f1' : f1,
        'prec' : prec,
        'rec' : rec
    }
    print("==============")

In [None]:
# repeat classification with bag of words models, 6 min
tf_res = {}
for key in classifiers.keys():
    print(key)
    cm, acc, f1, prec, rec = make_classification(classifiers[key], X_tf, y)
    print("==============")
    tf_res[key] = {
        'cm' : cm,
        'acc' : acc,
        'f1' : f1,
        'prec' : prec,
        'rec' : rec
    }

In [None]:
# repeat classification with word2vec models, 45 min
w2v_res = {}
for key in classifiers.keys():
    # if key == 'lin_svm':
    #     continue
    print(key)
    cm, acc, f1, prec, rec = make_classification(classifiers[key], X_w2v, y)
    print("==============")
    w2v_res[key] = {
        'cm' : cm,
        'acc' : acc,
        'f1' : f1,
        'prec' : prec,
        'rec' : rec
    }

In [None]:
# everything else being equal,
# we want the one with highest precisions 
# (precision is affected by FP, which would be 
# overestimation of the grade of the paper)

for key in classifiers.keys():
    try:
        print(key)
        print("==================")
        print("tfidf acc: ", tfidf_res[key]['acc'])
        print("tfidf f1: ", tfidf_res[key]['f1'])
        print("tfidf precision: ", tfidf_res[key]['prec'])
        print("tfidf recall: ", tfidf_res[key]['rec'])
        print("==================")
        print("tf acc: ", tf_res[key]['acc'])
        print("tf f1: ", tf_res[key]['f1'])
        print("tf precision: ", tf_res[key]['prec'])
        print("tf recall: ", tf_res[key]['rec'])
        print("==================")
        print("w2v acc: ", w2v_res[key]['acc'])
        print("w2v f1: ", w2v_res[key]['f1'])
        print("w2v precision: ", w2v_res[key]['prec'])
        print("w2v recall: ", w2v_res[key]['rec'])
        print("==================")
    except:
        pass

Here are the best results from the training above.

### log_reg
- tfidf acc:  0.6879815100154083
- tfidf f1:  0.6826731058258083
- tfidf precision:  0.6840742515662983
- tfidf recall:  0.6879815100154083
### lin_svm
- tfidf acc:  0.6798921417565486
- tfidf f1:  0.6796612464208107
- tfidf precision:  0.6851511607249138
- tfidf recall:  0.6798921417565486
### rbf_svm
- tf acc:  0.714175654853621
- tf f1:  0.7137245090277426
- tf precision:  0.7245950249260565
- tf recall:  0.714175654853621

### tree
- tfidf acc:  0.6302003081664098
- tfidf f1:  0.6289259312408813
- tfidf precision:  0.6278195501435767
- tfidf recall:  0.6302003081664098
### rf
- tfidf acc:  0.687211093990755
- tfidf f1:  0.6795152273421196
- tfidf precision:  0.6789466187123883
- tfidf recall:  0.687211093990755
### ada
- tf acc:  0.5670261941448382
- tf f1:  0.5388070054332165
- tf precision:  0.5626726131804644
- tf recall:  0.5670261941448382


### gb
- tfidf acc:  0.7126348228043143
- tfidf f1:  0.7112178825280995
- tfidf precision:  0.7124145728645078
- tfidf recall:  0.7126348228043143

- w2v acc:  0.6771956856702619
- w2v f1:  0.6786444062419466
- w2v precision:  0.6813704223813889
- w2v recall:  0.6771956856702619

### xgb
- tfidf acc:  0.7245762711864406
- tfidf f1:  0.7223852160700287
- tfidf precision:  0.7212926330136676
- tfidf recall:  0.7245762711864406


