In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

from scipy.stats import randint as sp_randint

In [2]:
unpickled_df = pd.read_pickle("./engineered_data.pkl")

In [3]:
unpickled_df.columns

Index(['words_length_4', 'words_length_6', 'words_length_8', 'words_length_10',
       'words_length_12', 'mean_word_length', 'variance_word_length',
       'type_token_ratio', 'essay_length', 'num_words', 'num_sentences',
       'mean_sentence_length', 'num_characters', 'fourth_root_num_characters',
       'num_commas', 'num_periods', 'num_exclaim', 'num_question',
       'num_semicolon', 'num_colon', 'vocab_size', 'yules_k',
       'very_short_sentences', 'short_sentences', 'medium_sentences',
       'long_sentences', 'variance_sentence_length', 'max_height',
       'sum_heights', 'mean_heights', 'pos_trigram_ratio',
       'pos_fourgram_ratio', 'mean_trigram_tfTF', 'mean_fourgram_tfTF',
       'connectives', 'flesch_kincaid_grade_level', 'flesch_reading_ease',
       'gunning_fog_index', 'coleman_liau_index',
       'automated_readability_index', 'lix', 'gulpease_index',
       'wiener_sachtextformel', 'score', 'essay_set', 'essay_id'],
      dtype='object')

In [4]:
# trainingdata = unpickled_df[unpickled_df.essay_set < 7]
trainingdata = unpickled_df
DivSeries = pd.DataFrame({'div': [12,5,3,3,4,4,25,50],'essay_set':[1,2,3,4,5,6,7,8]})
trainingdata = trainingdata.merge(DivSeries, on='essay_set')
cols_at_end = ['score']
trainingdata = trainingdata[[c for c in trainingdata if c not in cols_at_end] + [c for c in cols_at_end if c in trainingdata]]
# trainingdata = trainingdata.drop(['tree_heights'], axis=1)


In [5]:
null_columns=trainingdata.columns[trainingdata.isnull().any()]
print(trainingdata[null_columns].isnull().sum())
trainingdata = trainingdata.dropna()

mean_trigram_tfTF     1
mean_fourgram_tfTF    1
score                 1
dtype: int64


In [38]:
# X, Y = trainingdata.iloc[:,:-3], trainingdata.loc[:,['score','essay_set']]
# X, Y = trainingdata.drop(['score','essay_id'], axis=0), trainingdata.loc[:,['score','essay_set']]

In [6]:
traincomp, testcomp = train_test_split(trainingdata,random_state=42,test_size=0.2)

set_train = traincomp.loc[:,['essay_set','essay_id','div']]  # setting asside essay set and scale
set_test = testcomp.loc[:,['essay_set','essay_id','div']] # setting asside essay set and scale

x_train = np.asarray(traincomp.iloc[:,:-3])
y_train = np.reshape(np.asarray(traincomp.loc[:,['score']]),(-1,))

x_test = np.asarray(testcomp.iloc[:,:-3])
y_test = np.reshape(np.asarray(testcomp.loc[:,['score']]),(-1,))

In [8]:
# null_columns=classification_data.columns[classification_data.isnull().any()]
# classification_data[null_columns].isnull().sum()

In [7]:
# print(sum(np.isnan(y_train)))
# print(sum(np.isnan(y_test)))
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(10380, 44)
(2596, 44)
(10380,)
(2596,)


In [18]:
lr = LinearRegression(fit_intercept=True)
rf = RandomForestRegressor(n_estimators=500, min_samples_split=10, min_samples_leaf=3, max_features='auto',max_depth=44, bootstrap=True, random_state=42)
sv = svm.SVR(kernel='rbf',C=0.1,gamma='scale')

In [19]:
lr.fit(x_train, y_train)
rf.fit(x_train, y_train)
sv.fit(x_train, y_train)

SVR(C=0.1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [20]:
# scoringFrame_baselines = pd.DataFrame([y_test,set_test[:,0],set_test[:,1],rf.predict(x_test), sv.predict(x_test), lr.predict(x_test)]).transpose()
# scoringFrame_baselines.columns = ['actual','essay_set','div','rf','sv','lr']

In [21]:
#RMSE is one possible eval score
def RMSE(actual, predict):
    diff = actual - predict
    diff = sum(diff**2) / len(actual)
    return np.sqrt(diff)

def evaluate(model, x_test, y_test, div, essay_set):
    predictions = model.predict(x_test)
    y_rescaled = y_test * div
    y_rescaled = y_rescaled.astype(int)
    predictions_rescaled = predictions * div
    predictions_round = predictions_rescaled.round()
    predictions_round = predictions_round.astype(int)
    rmse = RMSE(y_rescaled, predictions_round)
    cohen = cohen_kappa_score(y_rescaled, predictions_round, weights=None)
    quad_cohen = cohen_kappa_score(y_rescaled, predictions_round, weights='quadratic')
    accuracy = accuracy_score(y_rescaled, predictions_round)
    return {'rmse':rmse,'kappa':cohen,'quad_kappa':quad_cohen,'accuracy':accuracy}

In [22]:
lr_metrics = evaluate(lr,x_test,y_test,div=set_test.loc[:,'div'],essay_set=set_test.loc[:,'essay_set'])
rf_metrics = evaluate(rf,x_test,y_test,div=set_test.loc[:,'div'],essay_set=set_test.loc[:,'essay_set'])
sv_metrics = evaluate(sv,x_test,y_test,div=set_test.loc[:,'div'],essay_set=set_test.loc[:,'essay_set'])
print("Linear Regression RMSE: {0} Cohen Kappa: {1} Cohen QW-Kappa: {2} Accuracy: {3}".format(lr_metrics['rmse'], lr_metrics['kappa'],lr_metrics['quad_kappa'],lr_metrics['accuracy']))
print("Random Forests RMSE: {0} Cohen Kappa: {1} Cohen QW-Kappa: {2} Accuracy: {3}".format(rf_metrics['rmse'], rf_metrics['kappa'], rf_metrics['quad_kappa'],rf_metrics['accuracy']))
print("Support Vector RMSE: {0} Cohen Kappa: {1} Cohen QW-Kappa: {2} Accuracy: {3}".format(sv_metrics['rmse'], sv_metrics['kappa'],sv_metrics['quad_kappa'],sv_metrics['accuracy']))

Linear Regression RMSE: 2.02250435917868 Cohen Kappa: 0.4001262423530352 Cohen QW-Kappa: 0.9741565966034074 Accuracy: 0.4761171032357473
Random Forests RMSE: 1.5422903345729375 Cohen Kappa: 0.4665936551347196 Cohen QW-Kappa: 0.9843781904283819 Accuracy: 0.5342835130970724
Support Vector RMSE: 1.8743257647535838 Cohen Kappa: 0.40089386711048913 Cohen QW-Kappa: 0.9774461651725512 Accuracy: 0.4761171032357473


In [23]:
rf_scoring_df = pd.DataFrame([y_test,set_test.loc[:,'essay_set'],set_test.loc[:,'div'],rf.predict(x_test)]).transpose()
rf_scoring_df.columns = ['actual','essay_set','div','prediction']

lr_scoring_df = pd.DataFrame([y_test,set_test.loc[:,'essay_set'],set_test.loc[:,'div'],lr.predict(x_test)]).transpose()
lr_scoring_df.columns = ['actual','essay_set','div','prediction']

sv_scoring_df = pd.DataFrame([y_test,set_test.loc[:,'essay_set'],set_test.loc[:,'div'],sv.predict(x_test)]).transpose()
sv_scoring_df.columns = ['actual','essay_set','div','prediction']

def essay_set_metrics(df):
    df['y_rescaled'] = df.actual * df['div']
    df.y_rescaled = df.y_rescaled.astype(int)
    df['prediction_rescaled'] = df.prediction * df['div']
    df['prediction_round'] = df.prediction_rescaled.round()
    df.prediction_round = df.prediction_round.astype(int)
    set_df = pd.DataFrame(columns=['essay_set','RMSE','Kappa','Accuracy'])
    e_sets = np.unique(df.essay_set)
    for e_s in e_sets:
        df_s = df[df.essay_set == e_s]
        original_score = df_s.y_rescaled.values
        predicted_score = df_s.prediction_round.values
        rmse = RMSE(original_score,predicted_score)
        kappa = cohen_kappa_score(original_score,predicted_score, weights=None)
        quad_kappa = cohen_kappa_score(original_score,predicted_score, weights='quadratic')
        accuracy = accuracy_score(original_score,predicted_score)
        set_df = set_df.append({'essay_set':e_s,'RMSE':rmse,'Kappa':kappa,'QW-Kappa':kappa,'Accuracy':accuracy},
                             ignore_index=True)
    return set_df

print(essay_set_metrics(rf_scoring_df))
print(essay_set_metrics(lr_scoring_df))
print(essay_set_metrics(sv_scoring_df))

   essay_set      RMSE     Kappa  Accuracy  QW-Kappa
0        1.0  0.837965  0.337357  0.500000  0.337357
1        2.0  0.573478  0.525508  0.708556  0.525508
2        3.0  0.670151  0.427363  0.628743  0.427363
3        4.0  0.654047  0.451525  0.627778  0.451525
4        5.0  0.594662  0.524375  0.663768  0.524375
5        6.0  0.747129  0.328884  0.560847  0.328884
6        7.0  3.053063  0.101793  0.162252  0.101793
7        8.0  4.135479  0.047277  0.094891  0.047277
   essay_set      RMSE     Kappa  Accuracy  QW-Kappa
0        1.0  1.113749  0.185354  0.374317  0.185354
1        2.0  0.685994  0.341279  0.598930  0.341279
2        3.0  0.689962  0.338168  0.577844  0.338168
3        4.0  0.679052  0.364771  0.586111  0.364771
4        5.0  0.623222  0.438439  0.620290  0.438439
5        6.0  0.755929  0.275904  0.558201  0.275904
6        7.0  3.162278  0.049620  0.115894  0.049620
7        8.0  6.778024  0.032291  0.080292  0.032291
   essay_set      RMSE     Kappa  Accuracy  QW

array([0.01548982, 0.00414467, 0.01638003, 0.00832988, 0.01119178,
       0.0111116 , 0.0137667 , 0.01336969, 0.01160804, 0.00466289,
       0.0081578 , 0.00689729, 0.01037676, 0.00478127, 0.00709852,
       0.00502207, 0.00808653, 0.0558265 ])

In [54]:
features = trainingdata.columns[:-3]
importances = rf.feature_importances_
indices = np.argsort(importances)
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

a = pd.DataFrame(data=importances,index=features,columns=['importances']).sort_values(by=['importances'],ascending=False)
a[a.importances > 0.01]

Unnamed: 0,importances
vocab_size,0.25292
essay_length,0.209961
fourth_root_num_characters,0.09546
num_characters,0.08396
essay_set,0.055826
mean_word_length,0.020335
variance_word_length,0.016607
sum_heights,0.01638
variance_sentence_length,0.01549
mean_trigram_tfTF,0.013767


In [None]:
# # # Create the parameter grid based on the results of random search 
# param_grid = {
#     'n_estimators': [500],
#     'min_samples_split': [10, 12],
#     'min_samples_leaf': [2,3],
#     'max_features': ['auto'],
#     'max_depth': [40, 44],
#     'bootstrap': [True]  
# }
# # Create a based model
# rf = RandomForestRegressor()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)
# grid_search.fit(x_train, y_train)

In [10]:
# grid = {
#     'kernel': ['linear','rbf'],
#     'gamma': ['auto'],
#     'C':[0.1,1]
# }

# sv = svm.SVR()

# sv_grid = GridSearchCV(estimator = sv, param_grid= grid, cv = 3, verbose=4, n_jobs = -1)
# sv_grid.fit(x_train, y_train)