In [1]:
!pip install pyspellchecker



In [2]:
import transformers
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn import model_selection
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn import metrics
import os
from spellchecker import SpellChecker

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Version tag; it is embedded in MODEL_PATH below.
VERSION = 'v6.5'

# Hyperparameters. They are not referenced by the cells shown alongside this
# one; presumably they are consumed by training code elsewhere — TODO confirm.
# NOTE(review): the MAX_LEN_* values look like token-length caps — confirm
# against the code that uses them.
MAX_LEN_DISCOURSE_TEXT = 256
MAX_LEN_ESSAY = 512
TRAIN_BATCH_SIZE  = 18
VALID_BATCH_SIZE = 4
EPOCHS = 10
DROP_OUT = 0.2
TEST_SIZE = 0.1
LEARNING_RATE = 6e-6


BERT_LAYERS = 3
# Local directory from which the tokenizer below is loaded.
BERT_PATH = './bert_base_cased'
# Model file path; the file name includes VERSION.
MODEL_PATH = './Model/model' + VERSION + '.bin'

# Input CSVs and the folder holding one "<essay_id>.txt" file per essay.
TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'
ESSAY_FOLDER = '../feedback-prize-effectiveness/train'

# Shared tokenizer instance, loaded from BERT_PATH with lower-casing disabled.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=False
)


# Label name -> integer class id.
# Note the ids are not alphabetical: 'Effective' is 0 and 'Adequate' is 1.
CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

# Discourse type name -> integer id.
DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

In [3]:


# Collect tokenized lengths for every training row: the discourse text alone,
# the essay it belongs to, and the sum of the two.
df_train = pd.read_csv(TRAINING_FILE)
discourse_text_len = []
essay_len = []
combined_len = []

# Rows may share an essay_id; cache the tokenized essay length so each essay
# file is opened and tokenized at most once.
essay_len_by_id = {}

for index, (text, essay_id) in enumerate(
    zip(df_train.discourse_text.values, df_train.essay_id.values)
):
    if index % 1000 == 0:
        print(f'Processed {index} rows')

    text_tokenized_len = len(TOKENIZER.tokenize(text))
    discourse_text_len.append(text_tokenized_len)

    if essay_id not in essay_len_by_id:
        essay_path = os.path.join(ESSAY_FOLDER, f"{essay_id}.txt")
        # Context manager so the file handle is closed deterministically.
        with open(essay_path, 'r') as essay_file:
            essay_len_by_id[essay_id] = len(TOKENIZER.tokenize(essay_file.read()))
    essay_tokenized_len = essay_len_by_id[essay_id]
    essay_len.append(essay_tokenized_len)

    combined_len.append(text_tokenized_len + essay_tokenized_len)
    
    
def get_stats(lengths, percentile):
    """Return a dict mapping each requested percentile to its value.

    Args:
        lengths: sequence of numbers to summarise.
        percentile: iterable of percentile levels (0-100) to compute.

    Returns:
        dict keyed by the entries of ``percentile``; each value is the result
        of ``np.percentile`` on ``lengths`` at that level.
    """
    values = np.array(lengths)
    return {level: np.percentile(values, level) for level in percentile}


# Percentile levels reported for each of the three length distributions.
percentile = [10, 25, 50, 90, 95, 99, 99.9, 99.99]

# Printed in order: discourse text, essay, combined.
for _lengths in (discourse_text_len, essay_len, combined_len):
    print(get_stats(_lengths, percentile))



Processed 0 rows
Processed 1000 rows
Processed 2000 rows
Processed 3000 rows
Processed 4000 rows
Processed 5000 rows
Processed 6000 rows
Processed 7000 rows
Processed 8000 rows
Processed 9000 rows
Processed 10000 rows
Processed 11000 rows
Processed 12000 rows
Processed 13000 rows
Processed 14000 rows
Processed 15000 rows
Processed 16000 rows
Processed 17000 rows
Processed 18000 rows
Processed 19000 rows
Processed 20000 rows
Processed 21000 rows
Processed 22000 rows
Processed 23000 rows
Processed 24000 rows
Processed 25000 rows
Processed 26000 rows
Processed 27000 rows
Processed 28000 rows
Processed 29000 rows
Processed 30000 rows
Processed 31000 rows
Processed 32000 rows
Processed 33000 rows
{10: 11.0, 25: 18.0, 50: 32.0, 90: 119.0, 95: 160.0, 99: 264.0, 99.9: 468.7040000000052, 99.99: 727.8271999998033}
{10: 259.0, 25: 346.0, 50: 486.0, 90: 950.2000000000044, 95: 1088.0, 99: 1260.0, 99.9: 1522.0, 99.99: 4779.0}
{10: 293.0, 25: 386.0, 50: 535.0, 90: 1025.0, 95: 1167.0, 99: 1405.0800000

In [4]:
import nltk
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources used by the cells below (same order as before).
for _nltk_resource in ('words', 'punkt', 'brown', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Shared stemmer and spell-checker instances.
ps = PorterStemmer()
spell = SpellChecker()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Spot check: is the everyday word "okay" present in the NLTK `words` corpus?
# (The recorded output for this cell is False.)
"okay" in words.words()

False

In [6]:
# Demo: compare word tokenization of a sample sentence before and after
# removing punctuation and lower-casing it.
sentence = 'There are many sides to people giving up their cars. Some people are truly happy and some are not. It may not be that bad, i mean how did people manige before cars were even invented'

sentence_tokenize_before = word_tokenize(sentence)

# Delete every character listed in string.punctuation, then lower-case.
sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()
sentence_tokenize_after = word_tokenize(sentence)

print(sentence_tokenize_before)
print(sentence_tokenize_after)

['There', 'are', 'many', 'sides', 'to', 'people', 'giving', 'up', 'their', 'cars', '.', 'Some', 'people', 'are', 'truly', 'happy', 'and', 'some', 'are', 'not', '.', 'It', 'may', 'not', 'be', 'that', 'bad', ',', 'i', 'mean', 'how', 'did', 'people', 'manige', 'before', 'cars', 'were', 'even', 'invented']
['there', 'are', 'many', 'sides', 'to', 'people', 'giving', 'up', 'their', 'cars', 'some', 'people', 'are', 'truly', 'happy', 'and', 'some', 'are', 'not', 'it', 'may', 'not', 'be', 'that', 'bad', 'i', 'mean', 'how', 'did', 'people', 'manige', 'before', 'cars', 'were', 'even', 'invented']


In [7]:
def _add_corpus_words(corpus_words, vocabulary, report_every=100000):
    """Add the lower-cased form of every word in ``corpus_words`` to ``vocabulary``.

    Prints a progress line every ``report_every`` words and, once the corpus
    is exhausted, prints the resulting size of ``vocabulary``.

    Args:
        corpus_words: iterable of strings.
        vocabulary: set that is updated in place.
        report_every: number of words between progress messages.
    """
    for position, word in enumerate(corpus_words):
        if position % report_every == 0:
            print(f'Processed {position} words')
        # set.add is already a no-op for existing members, so no membership
        # test is needed first.
        vocabulary.add(word.lower())

    print(len(vocabulary))


# Vocabulary of known lower-cased words, seeded with 'sample' and filled from
# the Brown, words and WordNet corpora, in that order.
word_set = {'sample'}
for _corpus in (brown, words, wordnet):
    _add_corpus_words(_corpus.words(), word_set)

Processed 0 words
Processed 100000 words
Processed 200000 words
Processed 300000 words
Processed 400000 words
Processed 500000 words
Processed 600000 words
Processed 700000 words
Processed 800000 words
Processed 900000 words
Processed 1000000 words
Processed 1100000 words
49815
Processed 0 words
Processed 100000 words
Processed 200000 words
261552
Processed 0 words
Processed 100000 words
346423


In [9]:
def count_nii_words(text):
    """Find the tokens of ``text`` that are not in ``word_set``.

    Each punctuation character is replaced by a space and the text is
    lower-cased before tokenizing. A token is reported when it is not in
    ``word_set``, is not all digits, and its Porter stem is not in
    ``word_set`` either.

    Args:
        text: string to analyse.

    Returns:
        (count, tokens): the number of reported tokens and the list of them,
        in order of appearance (duplicates kept).
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    normalized = text.translate(punctuation_to_space).lower()

    unknown_tokens = [
        token
        for token in word_tokenize(normalized)
        if token not in word_set
        and not token.isdigit()
        and ps.stem(token) not in word_set
    ]

    return len(unknown_tokens), unknown_tokens
    

def count_spelling_isses(text):
    """Return how many distinct tokens of ``text`` the spell checker does not know.

    Each punctuation character is replaced by a space and the text is
    lower-cased before tokenizing; the result is the size of the collection
    returned by ``spell.unknown`` for those tokens.
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    tokens = word_tokenize(text.translate(punctuation_to_space).lower())
    return len(spell.unknown(tokens))

    
# Annotate every training row with not-in-dictionary statistics for its
# discourse text and for the essay it belongs to, then write the result to CSV.
df_valid = pd.read_csv(TRAINING_FILE)

# Results are gathered in plain lists and attached as whole columns after the
# loop. Writing cell by cell through chained indexing (df[col][i] = value) is
# slow, triggers SettingWithCopyWarning, and does not update the frame at all
# when pandas Copy-on-Write is enabled.
nii_counts = []
nii_words = []
essay_nii_counts = []
essay_nii_words = []

# Rows may share an essay_id; analyse each essay file at most once.
essay_nii_by_id = {}

for index, (text, essay_id) in enumerate(
    zip(df_valid.discourse_text.values, df_valid.essay_id.values)
):
    if index % 300 == 0:
        print(f'Processed {index} rows')

    # Local names deliberately avoid `words`, which is the NLTK corpus imported
    # at module level; rebinding it would break re-running earlier cells.
    text_count, text_unknown = count_nii_words(text)
    nii_counts.append(text_count)
    nii_words.append(';'.join(text_unknown))

    if essay_id not in essay_nii_by_id:
        essay_path = os.path.join(ESSAY_FOLDER, f"{essay_id}.txt")
        # Context manager so the file handle is closed deterministically.
        with open(essay_path, 'r') as essay_file:
            essay_nii_by_id[essay_id] = count_nii_words(essay_file.read())
    essay_count, essay_unknown = essay_nii_by_id[essay_id]
    essay_nii_counts.append(essay_count)
    essay_nii_words.append(';'.join(essay_unknown))

df_valid['not_in_dictionary'] = nii_counts
df_valid['not_in_dictionary_words'] = nii_words
df_valid['not_in_dictionary_essay'] = essay_nii_counts
df_valid['not_in_dictionary_words_essay'] = essay_nii_words
# The spelling check is not run in this cell; the column keeps the -1
# placeholder it has always been written with.
df_valid['spelling_issues'] = -1

df_valid.to_csv('../Data/NII_spelling_essay_data_training.csv', sep=',')

Processed 0 rows
Processed 300 rows
Processed 600 rows
Processed 900 rows
Processed 1200 rows
Processed 1500 rows
Processed 1800 rows
Processed 2100 rows
Processed 2400 rows
Processed 2700 rows
Processed 3000 rows
Processed 3300 rows
Processed 3600 rows
Processed 3900 rows
Processed 4200 rows
Processed 4500 rows
Processed 4800 rows
Processed 5100 rows
Processed 5400 rows
Processed 5700 rows
Processed 6000 rows
Processed 6300 rows
Processed 6600 rows
Processed 6900 rows
Processed 7200 rows
Processed 7500 rows
Processed 7800 rows
Processed 8100 rows
Processed 8400 rows
Processed 8700 rows
Processed 9000 rows
Processed 9300 rows
Processed 9600 rows
Processed 9900 rows
Processed 10200 rows
Processed 10500 rows
Processed 10800 rows
Processed 11100 rows
Processed 11400 rows
Processed 11700 rows
Processed 12000 rows
Processed 12300 rows
Processed 12600 rows
Processed 12900 rows
Processed 13200 rows
Processed 13500 rows
Processed 13800 rows
Processed 14100 rows
Processed 14400 rows
Processed 1