In [None]:
import re
import nltk
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Fetch the NLTK resources this notebook relies on: the stopword lists
# (used when cleaning text) and the Punkt sentence-tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!ls -alh ./dataset/

ls: cannot access './dataset/': No such file or directory


## Preprocessing the dataset

### Loading train and validation datasets

In [None]:
# Read the tab-separated training file, keeping only the id, essay-set,
# essay text and domain-1 scoring columns.
train_data = pd.read_csv(
    filepath_or_buffer='/content/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
    usecols=['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score'],
)

train_data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",4,4,8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,10
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,8


In [None]:
# Read the tab-separated validation file; only the essay id and text are kept.
val_data = pd.read_csv(
    filepath_or_buffer='/content/valid_set.tsv',
    sep='\t',
    encoding='ISO-8859-1',
    usecols=['essay_id', 'essay'],
)

val_data.head()

Unnamed: 0,essay_id,essay
0,1788,"Dear @ORGANIZATION1, @CAPS1 more and more peop..."
1,1789,Dear @LOCATION1 Time @CAPS1 me tell you what I...
2,1790,"Dear Local newspaper, Have you been spending a..."
3,1791,"Dear Readers, @CAPS1 you imagine how life woul..."
4,1792,"Dear newspaper, I strongly believe that comput..."


### Checking whether the train and validation sets have any NaN values

In [None]:
# Per-column count of missing values in the training frame.
train_data.isnull().sum()

essay_id          0
essay_set         0
essay             0
rater1_domain1    0
rater2_domain1    0
domain1_score     0
dtype: int64

In [None]:
# Per-column count of missing values in the validation frame.
val_data.isnull().sum()

essay_id    0
essay       0
dtype: int64

# Data Pre-Processing

In [None]:
from nltk.corpus import stopwords
# Stopword sets keyed by language, filled lazily by _stopword_set().
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stopwords for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        # Build the collection once instead of on every call, and store it as
        # a set so each membership test is a hash lookup rather than a scan.
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def process_stopwords(sentence):
    """Clean a piece of text and strip English stopwords from it.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and words found in the NLTK
    English stopword list are discarded.

    Args:
        sentence: Raw text (a sentence or a whole essay) as a string.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    s_words = _stopword_set('english')
    words = re.sub("[^A-Za-z]", " ", sentence).lower().split()
    return ' '.join(word for word in words if word not in s_words)

def process_sentences(essay):
    """Split text into sentences and tokenize each one.

    The text is stripped of surrounding whitespace and segmented with the
    Punkt English tokenizer loaded from the NLTK data directory. Every
    non-empty sentence is cleaned with `process_stopwords` and split on
    whitespace.

    Args:
        essay: Text to segment, as a string.

    Returns:
        A list with one list of word tokens per sentence. A sentence whose
        words are all filtered out still contributes an empty list.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        process_stopwords(raw_sentence).split()
        for raw_sentence in punkt.tokenize(essay.strip())
        if raw_sentence  # the tokenizer output may contain empty strings
    ]

def preprocess_data(raw_corpus):
    """Convert raw essays into token lists, exactly one list per essay.

    Each essay is cleaned with `process_stopwords` (non-letters removed,
    lower-cased, stopwords dropped) and split on whitespace.

    The earlier version cleaned the essay first and then passed the
    punctuation-free result to the sentence tokenizer, which cleaned it a
    second time. With no punctuation left there is nothing to split on, so the
    outcome was already one token list per essay. The difference is that an
    essay that cleaned down to an empty string vanished from the output,
    shifting every later row out of line with its label. Tokenizing directly
    keeps the one-to-one, in-order correspondence with `raw_corpus` and avoids
    the redundant second pass.

    Args:
        raw_corpus: Iterable of essay strings.

    Returns:
        A list the same length as `raw_corpus`; element i is the list of
        cleaned word tokens of essay i (possibly empty).
    """
    return [process_stopwords(essay).split() for essay in raw_corpus]

# Word2Vec training and transformation

### Training Word2Vec using Gensim

In [None]:
from gensim.models import Word2Vec

# Tokenize both essay collections; the embedding model is fitted on the
# training tokens only.
processed_corpus = preprocess_data(train_data['essay'].tolist())
processed_corpus_val = preprocess_data(val_data['essay'].tolist())
W2V_model = Word2Vec(
    sentences=processed_corpus,
    vector_size=300,
    window=10,
    min_count=1,
    workers=4,
)

### Generating word vectors using trained gensim model

In [None]:
def generate_feature_vector(essay, vector_size=300):
    """Average the Word2Vec vectors of an essay's in-vocabulary words.

    Args:
        essay: Iterable of word tokens.
        vector_size: Dimensionality of the embeddings held by the global
            `W2V_model`; must match the model's vector length.

    Returns:
        A float32 array of shape (vector_size,) holding the mean embedding of
        the tokens known to `W2V_model`. When no token is in the vocabulary
        (including an empty essay) the zero vector is returned instead of
        dividing by zero and producing NaNs.
    """
    word_vectors = W2V_model.wv
    # key_to_index is a dict, so this is a hash lookup; testing against the
    # index_to_key list would scan the whole vocabulary for every token.
    vocabulary = word_vectors.key_to_index

    feature_vector = np.zeros((vector_size,), dtype=np.float32)
    word_counter = 0

    for word in essay:
        if word in vocabulary:
            word_counter += 1
            feature_vector += word_vectors.get_vector(word)

    if word_counter:
        feature_vector /= word_counter
    return feature_vector

def convert_corpus(corpus):
    """Turn a corpus of tokenized essays into an array of feature vectors.

    Args:
        corpus: Iterable of token lists, one per essay.

    Returns:
        A NumPy array built from the `generate_feature_vector` result of each
        essay, in corpus order (one row per essay).
    """
    return np.array([generate_feature_vector(tokens) for tokens in corpus])

### Creating Train and Test samples

In [None]:
# Vectorize the training essays and insert a length-1 middle axis so each
# sample is a one-step sequence, as the LSTM input layer expects.
train_samples = convert_corpus(processed_corpus)[:, np.newaxis, :]
train_samples.shape

(12976, 1, 300)

In [None]:
# Same transformation for the validation essays: one feature vector per
# essay, reshaped to a one-step sequence.
val_samples = convert_corpus(processed_corpus_val)[:, np.newaxis, :]
val_samples.shape

(4218, 1, 300)

# Training and Evaluation

### Using GPU for Training and Evaluating the model

In [None]:
# Show the GPU(s) visible to this runtime, with driver and memory usage.
!nvidia-smi

Mon Apr 17 03:32:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    30W /  70W |    679MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Embedding, Flatten, Lambda, Dropout
from tensorflow.keras.models import model_from_config, load_model, Sequential
# Ask TensorFlow once for the default GPU device name and report it; an empty
# name means no GPU device was found.
gpu_device = tf.test.gpu_device_name()
if not gpu_device:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(gpu_device))

Default GPU Device: /device:GPU:0


### Creating the model

In [None]:
def generate_model():
    """Build, compile and summarize the LSTM regression network.

    The network stacks a 300-unit LSTM (input shape [1, 300], returning the
    full sequence) on a 64-unit LSTM, followed by 50% dropout and a single
    ReLU output unit. It is compiled with mean-squared-error loss, the
    RMSprop optimizer and mean absolute error as a metric, and its layer
    summary is printed.

    Returns:
        The compiled Keras `Sequential` model.
    """
    layers = [
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(.5),
        Dense(1, activation='relu'),
    ]
    grader = Sequential(layers)

    grader.compile(optimizer='rmsprop', loss='mean_squared_error', metrics=['mae'])
    grader.summary()

    return grader

### Training and cross-validating the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [None]:
# Hold out 30% of the essays for evaluation. The fixed random_state pins the
# shuffle so the same essays land in each split on every run; without it the
# split changed on each execution.
targets = train_data['domain1_score'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    train_samples, targets, test_size=.3, shuffle=True, random_state=42
)

In [None]:
# Build a fresh network and train it on the training split for 50 epochs in
# mini-batches of 64 samples.
grader = generate_model()
grader.fit(x=X_train, y=Y_train, epochs=50, batch_size=64)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 1, 300)            721200    
                                                                 
 lstm_3 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/

<keras.callbacks.History at 0x7f92fae43220>

In [None]:
# Predict on both splits, round the outputs to the nearest integer score and
# compare them with the true scores using quadratic-weighted Cohen's kappa.
train_preds = grader.predict(X_train)
train_score = cohen_kappa_score(Y_train, train_preds.round(), weights='quadratic')

test_preds = grader.predict(X_test)
test_score = cohen_kappa_score(Y_test, test_preds.round(), weights='quadratic')

for banner, kappa in (
    ("------------------------ Train Score ------------------------", train_score),
    ("------------------------ Valid Score ------------------------", test_score),
):
    print(banner)
    print('Cohen Kappa Score : ', kappa)

------------------------ Train Score ------------------------
Cohen Kappa Score :  0.9735765566486034
------------------------ Valid Score ------------------------
Cohen Kappa Score :  0.960872992952125


In [None]:
def grade_assignment(assignment):
  """Score a single essay with the trained global `grader` model.

  The essay goes through the same steps as the training data: it is
  tokenized with `preprocess_data`, vectorized with `convert_corpus`, and
  given a length-1 middle axis before prediction.

  Args:
    assignment: The essay text as a string.

  Returns:
    The model's prediction rounded to the nearest whole number, as a
    Python scalar.
  """
  token_lists = preprocess_data([assignment])
  features = convert_corpus(token_lists)[:, None, :]
  prediction = grader.predict(features)
  return np.around(prediction).item()

In [None]:
assignment = "Dear @CAPS1 @CAPS2, I believe that using computers will benefit us in many ways like talking and becoming friends will others through websites like facebook and mysace. Using computers can help us find coordibates, locations, and able ourselfs to millions of information. Also computers will benefit us by helping with jobs as in planning a house plan and typing a @NUM1 page report for one of our jobs in less than writing it. Now lets go into the wonder world of technology. Using a computer will help us in life by talking or making friends on line. Many people have myspace, facebooks, aim, these all benefit us by having conversations with one another. Many people believe computers are bad but how can you make friends if you can never talk to them? I am very fortunate for having a computer that can help with not only school work but my social life and how I make friends. Computers help us with finding our locations, coordibates and millions of information online. If we didn't go on the internet a lot we wouldn't know how to go onto websites that @MONTH1 help us with locations and coordinates like @LOCATION1. Would you rather use a computer or be in @LOCATION3. When your supposed to be vacationing in @LOCATION2. Million of information is found on the internet. You can as almost every question and a computer will have it. Would you rather easily draw up a house plan on the computers or take @NUM1 hours doing one by hand with ugly erazer marks all over it, you are garrenteed that to find a job with a drawing like that. Also when appling for a job many workers must write very long papers like a @NUM3 word essay on why this job fits you the most, and many people I know don't like writing @NUM3 words non-stopp for hours when it could take them I hav an a computer. That is why computers we needed a lot now adays. I hope this essay has impacted your descion on computers because they are great machines to work with. 
The other day I showed my mom how to use a computer and she said it was the greatest invention sense sliced bread! Now go out and buy a computer to help you chat online with friends, find locations and millions of information on one click of the button and help your self with getting a job with neat, prepared, printed work that your boss will love."
# Grade the sample essay defined above with the trained model and report the
# rounded prediction.
score = grade_assignment(assignment)
print("The score for the assignment is : ", score)

The score for the assignment is :  9.0
