In [1]:
import pandas as pd
import numpy as np
# Import Data 
# open augmented_essays.csv
import pandas as pd

augmented_df = pd.read_csv('augmented_essays.tsv', encoding='utf_8',sep='\t')

zaebuc_df = pd.read_csv('raw_essays.tsv', encoding='utf_8',sep='\t')
zaebuc_df = zaebuc_df[zaebuc_df['grade'] != 'Unassessable']


In [2]:
# create a new dataframe with only the columns we need
essays = augmented_df['Raw']
# remove first and last character (')
essays = essays.str[1:-1]

In [3]:
# add Raw of zaebuc_df to essays and reset index
essays = essays.append(zaebuc_df['Raw']).reset_index(drop=True)

In [4]:
# create a new dataframe with essays
essays_df = pd.DataFrame(essays, columns=['Raw'])
# open documents_df.csv
documents_df = pd.read_csv('documents_features.csv', encoding='utf_8',sep='\t')

In [5]:
documents_df = documents_df[documents_df['grade'] != 'Unassessable']
# fix the index
documents_df = documents_df.reset_index(drop=True)

In [6]:
from camel_tools.tokenizers.word import simple_word_tokenize

# tokenize raw data and add it to the dataframe
essays_df['Tokenized'] = essays_df['Raw'].apply(lambda x: simple_word_tokenize(x))

In [7]:
# concatenate essays_df and documents_df
df = pd.concat([essays_df, documents_df], axis=1)
# remove 'Unnamed: 0' column
df = df.drop(['Unnamed: 0'], axis=1)


In [8]:
# keep only those with augmented = 0
df = df[df['augmented'] == 0]

In [10]:
# fix the index
df = df.reset_index(drop=True)

In [11]:
#get doc2vec vectors for raw data
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#tag the documents
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(df['Tokenized'])]

#train the model
max_epochs = 100
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("base_d2v.model")

In [12]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model= Doc2Vec.load("base_d2v.model")

# get vectors for the raw data
vectors = []
for i in range(len(df)):
    vectors.append(model.docvecs[i])

#add vectors to dataframe with Document as index
df['Doc2Vec Embeddings'] = vectors

  


In [13]:
df = pd.concat([df.drop(['Doc2Vec Embeddings'], axis=1), df['Doc2Vec Embeddings'].apply(pd.Series)], axis=1)

In [14]:
# change all A1 and A2 to A
df['grade'] = df['grade'].replace(['A1', 'A2'], 'A')

# change all C1 and C2 to C
df['grade'] = df['grade'].replace(['C1', 'C2'], 'C')

# change all B1 and B2 to B
# df['grade'] = df['grade'].replace(['B1', 'B2'], 'B')

In [15]:
y = df['grade']
# remove grade and Raw amd Tokenized columns from df
X = df.drop(columns = [ 'grade', 'Raw',  'Document', 'augmented', 'Tokenized']) #'grade', 'Tokenized', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'adv_ratio', 'pron_ratio', 'prep_ratio', 'conj_ratio'
# X = df.drop(columns = ['augmented', 'Raw', 'Tokenized', 'readability_0', 'readability_5', 'Document', 'readability_1', 'readability_2', 'readability_3', 'readability_4', 'readability_5', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'adv_ratio', 'pron_ratio', 'prep_ratio', 'conj_ratio'])
#,'readability_0', 'readability_1', 'readability_2', 'readability_3', 'readability_4', 'readability_5', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'adv_ratio', 'pron_ratio', 'prep_ratio', 'conj_ratio'
# ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# replace NaN values with 0
X = X.fillna(0)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate the dataset into non-augmented and augmented subsets
mask_not_augmented = df['augmented'] == 0
mask_augmented = df['augmented'] == 1

X_non_augmented = X[mask_not_augmented]
y_non_augmented = y[mask_not_augmented]

X_augmented = X[mask_augmented]
y_augmented = y[mask_augmented]

# Split the non-augmented data into train and test sets
X_train_partial, X_test, y_train_partial, y_test = train_test_split(
    X_non_augmented, y_non_augmented,
    test_size=0.2,  # Adjust the test size as per your requirement
    stratify=y_non_augmented,
    random_state=42
)

# Combine the augmented data with the partial non-augmented training data
X_train = pd.concat([X_augmented, X_train_partial])
y_train = pd.concat([y_augmented, y_train_partial])

# Now, X_train and y_train contain both augmented and non-augmented data, 
# while X_test and y_test only contain non-augmented data


In [21]:
y_test.value_counts()

B1    22
B2    16
C      2
A      2
Name: grade, dtype: int64

In [19]:
# SVM classifier for X and grades

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify=y, random_state = 30)

svclassifier = SVC(kernel='rbf', C = 2, gamma= 0.0001, random_state=42)
# svclassifier = SVC(kernel='linear', C = 0.1, gamma= 0.1)
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 0  2  0  0]
 [ 0 19  3  0]
 [ 0  6 10  0]
 [ 0  0  2  0]]
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         2
          B1       0.70      0.86      0.78        22
          B2       0.67      0.62      0.65        16
           C       0.00      0.00      0.00         2

    accuracy                           0.69        42
   macro avg       0.34      0.37      0.36        42
weighted avg       0.62      0.69      0.65        42



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svclassifier, X_train, y_train, cv=5)
print(scores)

# print the mean score
print(scores.mean())

# print the standard deviation
print(scores.std())

# predict grades for augmented data
y_pred = svclassifier.predict(X_train)


[0.58823529 0.57575758 0.54545455 0.72727273 0.78787879]
0.6449197860962567
0.09498332503844188


In [None]:
# label_map = {'Unassessable': 0, 'A': 1, 'B1': 2, 'B2': 3, 'C': 4}
label_map = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5, 'Unassessable': 0,}
y_pred_num = [label_map[i] for i in y_pred]
y_test_num = [label_map[i] for i in y_test]
fuzzy_accuracy(y_test_num, y_pred_num)

In [18]:
# grid search for SVM

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# defining parameter range
param_grid = {'C': [0.1, 1, 2, 4, 5, 6, 7, 10, 100, 1000],
                'gamma': [0.1, 0.01, 0.001, 0.002, 0.003, 0.0001],
                'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

# scoring f1_macro
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
# {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
# SVC(C=1000, gamma=0.0001)

grid_predictions = grid.predict(X_test)

# print classification report
print(classification_report(y_test, grid_predictions))


Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=0.1, gamma=0.1, kernel=linear; total time=   0.0s
[CV 2/5] END ................C=0.1, gamma=0.1, kernel=linear; total time=   0.0s
[CV 3/5] END ................C=0.1, gamma=0.1, kernel=linear; total time=   0.0s
[CV 4/5] END ................C=0.1, gamma=0.1, kernel=linear; total time=   0.0s
[CV 5/5] END ................C=0.1, gamma=0.1, kernel=linear; total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.1, kernel=poly; total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.20, stratify=y, random_state = 30)

clf = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=42)
# clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 0  2  0  0]
 [ 0 20  2  0]
 [ 0  7  9  0]
 [ 0  1  1  0]]
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         2
          B1       0.67      0.91      0.77        22
          B2       0.75      0.56      0.64        16
           C       0.00      0.00      0.00         2

    accuracy                           0.69        42
   macro avg       0.35      0.37      0.35        42
weighted avg       0.63      0.69      0.65        42



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)

# print the mean score
print(scores.mean())

# print the standard deviation
print(scores.std())

[0.67647059 0.63636364 0.63636364 0.66666667 0.54545455]
0.6322638146167557
0.04627582814411479


In [23]:
# grid search for Random Forest

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# defining parameter range
param_grid = {'n_estimators': [10, 50, 80, 100],
                'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
# scoring f1_macro
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
# {'max_depth': 10, 'n_estimators': 500}

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
# RandomForestClassifier(max_depth=10, n_estimators=500)

grid_predictions = grid.predict(X_test)

# print classification report
print(classification_report(y_test, grid_predictions))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ...................max_depth=2, n_estimators=10; total time=   0.0s
[CV 2/5] END ...................max_depth=2, n_estimators=10; total time=   0.0s
[CV 3/5] END ...................max_depth=2, n_estimators=10; total time=   0.0s
[CV 4/5] END ...................max_depth=2, n_estimators=10; total time=   0.0s
[CV 5/5] END ...................max_depth=2, n_estimators=10; total time=   0.0s
[CV 1/5] END ...................max_depth=2, n_estimators=50; total time=   0.1s
[CV 2/5] END ...................max_depth=2, n_estimators=50; total time=   0.1s
[CV 3/5] END ...................max_depth=2, n_estimators=50; total time=   0.1s
[CV 4/5] END ...................max_depth=2, n_estimators=50; total time=   0.1s
[CV 5/5] END ...................max_depth=2, n_estimators=50; total time=   0.1s
[CV 1/5] END ...................max_depth=2, n_estimators=80; total time=   0.2s
[CV 2/5] END ...................max_depth=2, n_

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
