In [2]:
import random
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [5]:
def simplify_word(word):
    if word.lower() in stop_words or len(word) <= 3:
        return word
    synonyms = wordnet.synsets(word)
    if synonyms:
        simple = synonyms[0].lemmas()[0].name()
        if simple.lower() != word.lower():
            return simple
    return word

In [6]:
def misspell_word(word):
    if len(word) <= 3:
        return word
    words = list(word)
    i = random.randint(0, len(word) - 2)
    if random.random() < 0.5:
        words[i], words[i+1] = words[i+1], words[i]
    else:
        words.pop(i)
    return ''.join(words)

In [7]:
def inject_grammar_errors(sentence):
    sentence = re.sub(r'\bis\b', 'are', sentence)
    sentence = re.sub(r'\bwas\b', 'were', sentence)
    sentence = re.sub(r'\bhas\b', 'have', sentence)
    sentence = re.sub(r'\bare\b', 'is', sentence)
    sentence = re.sub(r'\bhave\b', 'has', sentence)
    sentence = re.sub(r'\ban\b', '', sentence)
    sentence = re.sub(r'\ba\b', 'the', sentence)
    sentence = re.sub(r'\bthe\b', 'a', sentence)
    sentence = re.sub(r'\bdoes\b', '', sentence)
    sentence = re.sub(r'\bdid\b', '', sentence)

    preposition_errors = [
        (r'\bin\b', 'on'),
        (r'\bon\b', 'at'),
        (r'\bat\b', 'to'),
        (r'\bto\b', 'into'),
        (r'\binto\b', 'onto'),
        (r'\bonto\b', 'with'),
        (r'\bwith\b', 'by'),
        (r'\bby\b', 'of'),
        (r'\bof\b', 'in')
    ]
    for pattern, replacement in preposition_errors:
        if random.random() < 0.8:
            sentence = re.sub(pattern, replacement, sentence)

    return sentence

In [8]:
def corrupt_sentence(sentence):
    words = word_tokenize(sentence)
    corrupted = []
    for word in words:
        if random.random() < 0.9:
            word = simplify_word(word)
        if random.random() < 0.35:
            word = misspell_word(word)
        corrupted.append(word)
    if len(corrupted) > 5 and random.random() < 0.5:
        del corrupted[random.randint(0, len(corrupted)-1)]
    return ' '.join(corrupted)

In [9]:
def corrupt_essay(text):
    sentences = sent_tokenize(text)
    corrupted_sentences = [inject_grammar_errors(corrupt_sentence(s)) for s in sentences]

    if random.random() < 0.5:
        random.shuffle(corrupted_sentences)

    if random.random() < 0.8:
        corrupted_sentences = [re.sub(r'[.,;:!?]', '', s) for s in corrupted_sentences]

    return ' '.join(corrupted_sentences)

In [10]:
df_gen = pd.read_excel('df_generated_new.xlsx', index_col=0)

In [11]:
df_gen['full text'] = df_gen['full text'].apply(lambda x: x.split('>')[1])

In [12]:
df_gen['full text'] = df_gen['full text'].apply(corrupt_essay)

In [13]:
df_gen

Unnamed: 0,full text,General,Cohesion,syntax,vocabulary,term,grammar,Convention
0,Distnce learinng very important in me ecause I...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Distance learning is better because you can su...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,Distance learning is a good ieda because you c...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Distance learning is very present as peopel ca...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,Distance learning is very difficlut because yo...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
92,When you is hnoest you kow you is doing a righ...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
93,* * Sumer undetaking * * Summer undertaking is...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
94,* benefit in a Good Attitude * * A good attitu...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
95,* * OGAL * * oal setting is very important int...,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
super_simple = ['The cat sat on the mat.',
                'I like to eat apples',
                'She walks to school every day',
                'He has a red bike',
                'We went to the park',
                'It is raining outside now',
                'They play soccer after class',
                'I saw a dog in the street',
                'The book is on the table',
                'My mom made dinner tonight']

In [15]:
complete_nonsence = ['asdfjklqwerzxcv',
                     'lmnop99qqttrbbbaaa',
                     'ghjkkllaa###;;;...',
                     'wueoipncbvmrq##!!',
                     'zzzappplemartin??!',
                     'qwopdjklsn!!!mkp',
                     'rrrrrffffffghhhhh',
                     '🥴🤡🤖💥asdoipoqw112',
                     'aaaaaa',
                     'brtzzzmrrnngleop']

In [16]:
for text in super_simple:
    new_row = {'full text': text,
               ' General': 1.0,
               ' Cohesion': 1.0,
               ' syntax': 1.0,
               ' vocabulary': 1.0,
               ' term': 1.0,
               ' grammar': 1.0,
               ' Convention': 1.0}
    df_gen.loc[len(df_gen)] = new_row

In [17]:
for text in complete_nonsence:
    new_row = {'full text': text,
               ' General': 1.0,
               ' Cohesion': 1.0,
               ' syntax': 1.0,
               ' vocabulary': 1.0,
               ' term': 1.0,
               ' grammar': 1.0,
               ' Convention': 1.0}
    df_gen.loc[len(df_gen)] = new_row

In [18]:
df_gen.columns

Index(['full text', ' General', ' Cohesion', ' syntax', ' vocabulary', ' term',
       ' grammar', ' Convention'],
      dtype='object')

In [19]:
rename_dict = {'full text': 'full_text',
               ' General': 'Overall',
               ' Cohesion': 'Cohesion',
               ' syntax': 'Syntax',
               ' vocabulary': 'Vocabulary',
               ' term': 'Phraseology',
               ' grammar': 'Grammar',
               ' Convention': 'Conventions'}

df_gen = df_gen.rename(columns=rename_dict)

In [20]:
df_gen

Unnamed: 0,full_text,Overall,Cohesion,Syntax,Vocabulary,Phraseology,Grammar,Conventions
0,Distnce learinng very important in me ecause I...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Distance learning is better because you can su...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,Distance learning is a good ieda because you c...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Distance learning is very present as peopel ca...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,Distance learning is very difficlut because yo...,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
112,qwopdjklsn!!!mkp,1.0,1.0,1.0,1.0,1.0,1.0,1.0
113,rrrrrffffffghhhhh,1.0,1.0,1.0,1.0,1.0,1.0,1.0
114,🥴🤡🤖💥asdoipoqw112,1.0,1.0,1.0,1.0,1.0,1.0,1.0
115,aaaaaa,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
df_1 = pd.read_csv('/content/ELLIPSE_Final_github_train.csv', index_col=0)

In [22]:
df_2 = pd.read_csv('/content/ELLIPSE_Final_github_test.csv', index_col=0)

In [23]:
df = pd.concat([df_1, df_2])

In [24]:
df = df.drop(['set'], axis=1).reset_index()

In [25]:
df = df.drop(['text_id_kaggle'], axis=1)

In [26]:
df = df[['full_text', 'Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions']]

In [27]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [28]:
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [29]:
train_df = pd.concat([train_df, df_gen], ignore_index=True)

In [30]:
train_df.to_csv('train.csv')

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [32]:
scores_cols = ['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions']

In [33]:
scaler = MinMaxScaler()

train_df[scores_cols] = scaler.fit_transform(train_df[scores_cols])
val_df[scores_cols] = scaler.transform(val_df[scores_cols])
test_df[scores_cols] = scaler.transform(test_df[scores_cols])

In [34]:
train_df

Unnamed: 0,full_text,Overall,Cohesion,Syntax,Vocabulary,Phraseology,Grammar,Conventions
0,"The American jazz legend Duke Ellington said ""...",0.875,0.875,0.875,0.875,0.875,1.000,1.000
1,Students will struggle to learn the subjects t...,0.625,0.500,0.500,0.750,0.625,0.500,0.375
2,"One of John Lubbock famous quotes is, "" Your c...",0.500,0.750,0.500,0.500,0.375,0.625,0.500
3,"My mom always tell me "" keep going until you r...",0.750,0.750,0.625,0.750,0.625,0.625,0.750
4,Technology allows people to complete many task...,0.750,0.875,0.750,0.750,0.750,0.750,0.750
...,...,...,...,...,...,...,...,...
4000,qwopdjklsn!!!mkp,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4001,rrrrrffffffghhhhh,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4002,🥴🤡🤖💥asdoipoqw112,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4003,aaaaaa,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [35]:
train_df.to_csv('train.csv')

In [36]:
val_df.to_csv('val.csv')

In [37]:
test_df.to_csv('test.csv')