In [42]:
import pandas as pd
import numpy as np
# Import Data 
# open augmented_essays.csv
import pandas as pd

augmented_df = pd.read_csv('augmented_essays.tsv', encoding='utf_8',sep='\t')

zaebuc_df = pd.read_csv('raw_essays.tsv', encoding='utf_8',sep='\t')
zaebuc_df = zaebuc_df[zaebuc_df['grade'] != 'Unassessable']

# create a new dataframe with only the columns we need
essays = augmented_df['Raw']
# remove first and last character (')
essays = essays.str[1:-1]
# add Raw of zaebuc_df to essays and reset index
essays = essays.append(zaebuc_df['Raw']).reset_index(drop=True)
# create a new dataframe with essays
essays_df = pd.DataFrame(essays, columns=['Raw'])
# open documents_df.csv
documents_df = pd.read_csv('documents_features.csv', encoding='utf_8',sep='\t')

In [2]:
len(documents_df)

624

In [43]:
grades = documents_df['grade']
grades = grades[grades != 'Unassessable']

In [5]:
grades.value_counts()

C1    116
A2    111
B1    110
A1    105
C2     96
B2     80
Name: grade, dtype: int64

In [44]:
documents_df = documents_df[documents_df['grade'] != 'Unassessable']
# fix the index
documents_df = documents_df.reset_index(drop=True)

In [45]:
from camel_tools.tokenizers.word import simple_word_tokenize

# tokenize raw data and add it to the dataframe
essays_df['Tokenized'] = essays_df['Raw'].apply(lambda x: simple_word_tokenize(x))
# concatenate essays_df and documents_df
df = pd.concat([essays_df, documents_df], axis=1)
# remove 'Unnamed: 0' column
df = df.drop(['Unnamed: 0'], axis=1)

In [79]:
#get doc2vec vectors for raw data
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#tag the documents
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(df['Tokenized'])]

#train the model
max_epochs = 100
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")

In [46]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model= Doc2Vec.load("d2v.model")

# get vectors for the raw data
vectors = []
for i in range(len(df)):
    vectors.append(model.docvecs[i])

#add vectors to dataframe with Document as index
df['Doc2Vec Embeddings'] = vectors

  


In [47]:
# change all A1 and A2 to A
df['grade'] = df['grade'].replace(['A1', 'A2'], 'A')

# change all C1 and C2 to C
df['grade'] = df['grade'].replace(['C1', 'C2'], 'C')

# change all B1 and B2 to B
# df['grade'] = df['grade'].replace(['B1', 'B2'], 'B')

In [48]:
# flatten Doc2Vec Embeddings column
df = pd.concat([df.drop(['Doc2Vec Embeddings'], axis=1), df['Doc2Vec Embeddings'].apply(pd.Series)], axis=1)
y = df['grade']
# remove grade and Raw amd Tokenized columns from df
X = df.drop(columns = [ 'grade','Raw', 'Tokenized', 'readability_0', 'readability_5', 'Document']) # 'grade',
#,'readability_0', 'readability_1', 'readability_2', 'readability_3', 'readability_4', 'readability_5', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'adv_ratio', 'pron_ratio', 'prep_ratio', 'conj_ratio'
# ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# replace NaN values with 0
X = X.fillna(0)

X

Unnamed: 0,readability_1,readability_3,readability_2,readability_4,noun_ratio,verb_ratio,adj_ratio,adv_ratio,prep_ratio,conj_ratio,...,90,91,92,93,94,95,96,97,98,99
0,0.587719,0.131579,0.061404,0.017544,0.385965,0.131579,0.061404,0.026316,0.122807,0.087719,...,-1.164104,4.349712,3.254559,-0.531110,3.654962,0.230786,1.026072,-1.068444,-2.384245,4.848377
1,0.563107,0.155340,0.048544,0.029126,0.417476,0.165049,0.067961,0.019417,0.116505,0.009709,...,1.280848,1.549968,1.749696,0.601716,5.797670,5.229796,3.746816,0.156839,-5.438986,4.569999
2,0.428571,0.220238,0.101190,0.065476,0.386905,0.136905,0.136905,0.005952,0.130952,0.000000,...,-1.027658,-0.169038,2.459741,0.418934,7.791171,5.558947,6.597505,-0.807560,-1.421728,4.696105
3,0.393548,0.225806,0.077419,0.090323,0.464516,0.103226,0.116129,0.006452,0.129032,0.000000,...,2.092911,-2.561067,2.664188,-0.502632,2.867926,5.306192,2.735043,-2.377146,-2.957879,3.941447
4,0.303571,0.348214,0.187500,0.062500,0.339286,0.116071,0.232143,0.000000,0.116071,0.026786,...,5.663603,-1.813225,0.216913,0.946437,6.843854,2.023906,1.602668,-2.953294,-0.892307,2.708489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
613,0.405882,0.241176,0.170588,0.017647,0.394118,0.076471,0.123529,0.005882,0.135294,0.100000,...,2.339314,4.573346,-2.656280,1.711765,1.416503,2.454271,3.847449,-6.358982,0.517902,3.217355
614,0.474510,0.160784,0.145098,0.050980,0.419608,0.137255,0.086275,0.003922,0.141176,0.031373,...,2.223941,-0.902198,2.731066,3.729349,2.098410,4.323120,-5.512739,-2.180889,1.790371,1.780517
615,0.381910,0.206030,0.190955,0.070352,0.452261,0.100503,0.105528,0.005025,0.105528,0.005025,...,5.851831,5.742457,8.158543,1.492768,5.220162,4.387180,3.728474,2.705418,3.673166,1.467261
616,0.431193,0.155963,0.183486,0.045872,0.422018,0.146789,0.100917,0.000000,0.100917,0.009174,...,4.750781,-0.863330,3.807645,0.437162,2.909085,1.235796,1.918473,1.941098,0.911788,5.164681


In [49]:
grades_to_num = { 'A': 1, 'B1': 2, 'B2': 3, 'C': 4}
num_grades = list(map(lambda x: grades_to_num[x], y))

In [50]:
# make num_grades a dataframe
num_grades = pd.DataFrame(num_grades, columns=['grade'])

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate the dataset into non-augmented and augmented subsets
mask_not_augmented = df['augmented'] == 0
mask_augmented = df['augmented'] == 1

X_non_augmented = X[mask_not_augmented]
y_non_augmented = num_grades[mask_not_augmented]

X_augmented = X[mask_augmented]
y_augmented = num_grades[mask_augmented]

# Split the non-augmented data into train and test sets
X_train_partial, X_test, y_train_partial, y_test = train_test_split(
    X_non_augmented, y_non_augmented,
    test_size=0.2,  # Adjust the test size as per your requirement
    stratify=y_non_augmented,
    random_state=42
)

# Combine the augmented data with the partial non-augmented training data
X_train = pd.concat([X_augmented, X_train_partial])
y_train = pd.concat([y_augmented, y_train_partial])

# Now, X_train and y_train contain both augmented and non-augmented data, 
# while X_test and y_test only contain non-augmented data

In [53]:
y_train

Unnamed: 0,grade
0,1
1,1
2,4
3,4
4,4
...,...
565,4
598,3
486,3
556,3


In [55]:
# fix the index
X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = np.array(X_test)
y_test = np.array(y_test)

In [36]:
# import train_test_split
from sklearn.model_selection import train_test_split

# X to numpy array
X = np.array(X)
X_train, X_test, y_train, y_test = train_test_split(X, num_grades, test_size = 0.20, stratify = num_grades)

In [56]:
## given a dataset X and grades y, return a dataset of pair-wise differences and labels (+,-) 
def to_pairs(X, y):
    paired_X = list()
    paired_y = list()
    for i in range(len(X)):
        for k in range(i+1, len(X), 1):
                paired_X.append(np.subtract(X[i], X[k]))
                paired_y.append(y[i] > y[k])
    return paired_X, paired_y

In [57]:
X_train_diff, y_train_diff = to_pairs(X_train, y_train)
X_test_diff, y_test_diff = to_pairs(X_test, y_test)

In [58]:
len(y_train_diff)

165600

In [59]:
# sample 60% of the data X_train_diff
import random
random.seed(42)
X_train_diff_sample = random.sample(X_train_diff, int(len(X_train_diff)*0.6))
random.seed(42)
y_train_diff_sample = random.sample(y_train_diff, int(len(y_train_diff)*0.6))

In [None]:
# add a few 20  as and cs and run it on 100%

In [60]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train_diff_sample, y_train_diff_sample)

y_pred = svclassifier.predict(X_test_diff)

print(confusion_matrix(y_test_diff,y_pred))
print(classification_report(y_test_diff,y_pred))

  return f(*args, **kwargs)


### Final Mapping to Grades with a Linear Classifier (SVC)

In [72]:
svc_fitted_X_train = svclassifier.predict(X_train).reshape(-1,1)
svc_fitted_X_test = svclassifier.predict(X_test).reshape(-1,1)

In [73]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(svc_fitted_X_train, y_train) 
svm_predictions = svm_model_linear.predict(svc_fitted_X_test)

# model accuracy for X_test   
accuracy = svm_model_linear.score(svc_fitted_X_test, y_test) 
print('accuracy= {}'.format(accuracy))
# creating a confusion matrix 
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

accuracy= 0.3793103448275862
[[ 0  0  4  0  0  0]
 [ 0  0  6  0  0  0]
 [ 0  0 22  0  0  0]
 [ 0  0 16  0  0  0]
 [ 0  0  6  0  0  0]
 [ 0  0  4  0  0  0]]


In [74]:
print(classification_report(y_test,svm_predictions))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         6
           3       0.38      1.00      0.55        22
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         4

    accuracy                           0.38        58
   macro avg       0.06      0.17      0.09        58
weighted avg       0.14      0.38      0.21        58



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
