In [1]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import re
import nltk 
from nltk.corpus import stopwords
from textblob import Word
import spacy
  

from nltk.tokenize import word_tokenize
# Download the NLTK resources named below (stop-word corpus and Punkt tokenizer data).
nltk.download('stopwords')
nltk.download('punkt')
# Load spaCy's small English pipeline; `nlp` is used later for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## **Feature Extraction**

In [3]:
# Mount Google Drive so the dataset stored there is readable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Read essay set 01 into a DataFrame and preview its first rows.
Data_Essay_01 = pd.read_csv("/content/drive/MyDrive/IntelliTech-DataSet/EssaySet01.csv")
Data_Essay_01.head()

Mounted at /content/drive


Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0


#### Counting Spelling Mistakes

In [4]:
def removePunctuations(sentence):
  """
    Replaces every punctuation character in the text with a single space

    Each character listed in `punctuations` is swapped for one space, so the
    result always has the same length as the input. Characters that are not
    in the list (letters, digits, whitespace, '+', '=', ...) are kept as-is.

    Args:
      sentence: Essay of each student

    Returns:
      String

  """
  # The backslash is escaped explicitly: the previous literal relied on the
  # invalid escape sequence "\," which newer Python versions warn about.
  # The set of characters is unchanged.
  punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''

  # One translation pass instead of growing a string character by character.
  to_space = str.maketrans(dict.fromkeys(punctuations, " "))
  return sentence.translate(to_space)

def SplitWords(sentence):
  """
    Breaks a sentence into its whitespace-separated tokens

    Args:
      sentence: Essay of each student

    Returns:
      String[] (tokens in their original order; runs of whitespace never
      produce empty tokens)

  """
  tokens = sentence.split()
  return tokens

def check_word_spelling(word):
  """
    Flags a single token as misspelled or not

    The token is wrapped in a textblob Word and spell-checked. spellcheck()
    yields (suggested spelling, confidence) pairs; the token is flagged when
    it differs from the first suggestion.

    Args:
      word: Words (Tokens) of each essay

    Returns:
      int (1 when the token differs from the first suggestion, otherwise 0)

  """
  candidate = Word(word)
  suggestions = candidate.spellcheck()

  # suggestions[0][0] is the first suggested spelling,
  # suggestions[0][1] is the confidence attached to that suggestion.
  top_suggestion = suggestions[0][0]

  return 1 if candidate != top_suggestion else 0

In [5]:
# df_1 = data.query('essay_set == 1')
# cols = ["Essay" , "Number of Spelling Mistakes"]
# Mistakes = []
# for index, row in df_1.iterrows():
#   sentence = row["essay"]
#   PreProcessed0 = removePunctuations(sentence)
#   PreProcessed1 = PreProcessed0.lower()
#   Words = SplitWords(PreProcessed1)
#   count = 0 
#   for word in Words:
#     count = count + check_word_spelling(word)
#   Mistakes.append(count)

# Featured_df = pd.DataFrame(list(zip(df_1 , Mistakes)) , columns = cols)
# Featured_df.head()

## **Word Tokenization using NLTK**

### Removal of Named-Entity (NER) Placeholders

In [6]:
def removal_of_NERs(essay):
  """
    Drops every token that starts with '@' (e.g. '@CAPS1', '@LOCATION1')

    Tokens are whatever lies between single spaces, so punctuation attached
    to such a token (as in '@CAPS2,') is removed together with it.

    Args:
      essay: Essay of each student

    Returns:
      String

  """
  kept = []
  for token in essay.split(' '):
    if token.startswith('@'):
      continue
    kept.append(token)
  return ' '.join(kept)

### Word Tokenization

In [7]:
def tokenize_essaySet(essay):
    """
    Strips punctuation from an essay and re-joins its word tokens

    Args:
      essay: Essay of each student

    Returns:
      String (the tokens from word_tokenize separated by single spaces)

    """
    cleaned = removePunctuations(essay)
    tokens = word_tokenize(cleaned)
    return " ".join(tokens)

## Lower Case and Removal of Numbers 

In [8]:
def lower_case(essay):
  """
    Removes digits from an essay and converts the rest to lower case

    Every run of the characters 0-9 is deleted (nothing is inserted in its
    place), then the remaining text is lower-cased.

    Args:
      essay: Essay of each student

    Returns:
      String

  """
  # The earlier version read an unassigned local instead of `essay` and never
  # returned a value, so every call failed.
  return re.sub(r'[0-9]+', '', essay).lower()

In [9]:
def preprocess_essays():
  """
    Builds the 'Preprocessed_Essay' column of the global Data_Essay_01 frame

    Two passes are applied in order:
      1. removal_of_NERs on the raw 'Essay' text
      2. tokenize_essaySet on the result of pass 1

    Returns:
      None (Data_Essay_01 is modified in place)

  """
  target = 'Preprocessed_Essay'

  # Pass 1: essays with the '@'-prefixed tokens dropped
  without_ners = Data_Essay_01['Essay'].apply(removal_of_NERs)
  Data_Essay_01[target] = without_ners

  # Pass 2: punctuation removal + word tokenization of the pass-1 text
  tokenized = Data_Essay_01[target].apply(tokenize_essaySet)
  Data_Essay_01[target] = tokenized
  

# Build the 'Preprocessed_Essay' column, then display the frame to inspect the result.
preprocess_essays()
Data_Essay_01

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,Dear local newspaper I think effects computers...
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,Dear I believe that using computers will benef...
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,Dear More and more people use computers but no...
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,Dear Local Newspaper I have found that many ex...
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,Dear I know having computers has a positive ef...
...,...,...,...,...,...,...
1778,1783,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,Dear several reasons on way I that advances in...
1779,1784,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,Do a adults and kids spend to much time on the...
1780,1785,My opinion is that people should have computer...,4.0,4.0,8.0,My opinion is that people should have computer...
1781,1786,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,Dear readers I think that its good and bad to ...


## **PART OF SPEECH COUNTS**

In [10]:
def pos_tag_count(preproc_essay):
  """
    Tallies selected part-of-speech tags in an essay

    The essay is run through the spaCy pipeline `nlp` and each token's coarse
    tag (token.pos_) is tallied. Only these tags are counted:
      VERB -> verb_count, NOUN -> noun_count, ADJ -> adj_count,
      CCONJ -> conj_count, ADV -> adverb_count, PRON -> pNoun_count
    Tokens with any other tag are ignored.

    Args:
      preproc_essay: Preprocessed essay text

    Returns:
      (int, int, int, int, int, int) in the order
      verb, noun, adjective, conjunction, adverb, pronoun

  """
  # Keys are listed in the order the counts are returned.
  report_order = ('VERB', 'NOUN', 'ADJ', 'CCONJ', 'ADV', 'PRON')
  tallies = dict.fromkeys(report_order, 0)

  for token in nlp(preproc_essay):
    tag = token.pos_
    if tag in tallies:
      tallies[tag] += 1

  return tuple(tallies[tag] for tag in report_order)


In [11]:
# Tag every preprocessed essay; zip(*...) transposes the per-essay 6-tuples into six
# column-wise sequences, assigned in the same order that pos_tag_count returns them.
Data_Essay_01['verb_count'], Data_Essay_01['noun_count'], Data_Essay_01['adj_count'], Data_Essay_01['conj_count'], Data_Essay_01['adverb_count'], Data_Essay_01['pNoun_count']=zip(*Data_Essay_01["Preprocessed_Essay"].map(pos_tag_count))


## **Sentence Count per Essay**

In [12]:
def sentence_count(essay):
    """
    Counts the sentences that nltk.sent_tokenize finds in an essay

    Args:
      essay: Essay of each student

    Returns:
      int

    """
    return len(nltk.sent_tokenize(essay))
  

# Sentence counts are computed on the raw 'Essay' text, not on 'Preprocessed_Essay'.
Data_Essay_01['Sent_Count'] = Data_Essay_01['Essay'].apply(sentence_count)
Data_Essay_01

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,verb_count,noun_count,adj_count,conj_count,adverb_count,pNoun_count,Sent_Count
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,Dear local newspaper I think effects computers...,55,74,18,14,15,48,16
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,Dear I believe that using computers will benef...,71,97,19,18,19,49,20
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,Dear More and more people use computers but no...,42,69,17,16,11,25,14
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,Dear Local Newspaper I have found that many ex...,71,126,39,17,21,33,27
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,Dear I know having computers has a positive ef...,61,107,30,15,34,41,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1778,1783,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,Dear several reasons on way I that advances in...,79,80,32,17,45,87,21
1779,1784,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,Do a adults and kids spend to much time on the...,41,47,7,9,9,29,18
1780,1785,My opinion is that people should have computer...,4.0,4.0,8.0,My opinion is that people should have computer...,43,82,13,15,9,39,18
1781,1786,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,Dear readers I think that its good and bad to ...,2,2,4,1,0,2,1


## **Word Count per Essay**

In [13]:
def word_count(essay):
  """
    Counts the tokens that nltk.word_tokenize produces for an essay

    Args:
      essay: Essay of each student

    Returns:
      int

  """
  tokens = nltk.word_tokenize(essay)
  token_total = len(tokens)
  return token_total
 
# Word counts are taken over the raw 'Essay' text; one random row is shown as a spot check.
Data_Essay_01['Word_Count'] = Data_Essay_01['Essay'].apply(word_count)
Data_Essay_01.sample()

# Observation: these word counts are higher than a plain word count because NLTK tokenization treats punctuation marks as separate tokens.


Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,verb_count,noun_count,adj_count,conj_count,adverb_count,pNoun_count,Sent_Count,Word_Count
812,815,"Dear Newspaper, Computers today have a negativ...",5.0,5.0,10.0,Dear Newspaper Computers today have a negative...,51,82,38,8,26,59,32,468


## **Character Count per Essay**

In [14]:
def char_count(essay):
  """
    Counts the characters in an essay

    Every character of the string is counted, including spaces and
    punctuation.

    Args:
      essay: Essay of each student

    Returns:
      int

  """
  # len() gives the same number directly; no need to build a list of every
  # character first.
  return len(essay)


# Character counts are taken over the raw 'Essay' text; one random row is shown as a spot check.
Data_Essay_01['char_count'] = Data_Essay_01['Essay'].apply(char_count)
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,verb_count,noun_count,adj_count,conj_count,adverb_count,pNoun_count,Sent_Count,Word_Count,char_count
1474,1479,My oponion is computers should be used for man...,3.0,4.0,7.0,My oponion is computers should be used for man...,49,41,10,9,25,52,13,301,1449


## **Average Word Length of Essay**

In [15]:
# 
def avg_word_count(essay):
  """
    Computes the average token length of an essay

    The essay is tokenized with nltk.word_tokenize and the mean length of the
    resulting tokens is returned. Every token returned by the tokenizer is
    included in the average.

    Args:
      essay: Essay of each student

    Returns:
      float (0.0 when the essay produces no tokens)

  """
  word_list = nltk.word_tokenize(essay)

  # An empty or whitespace-only essay has no tokens; return 0.0 instead of
  # dividing by zero.
  if not word_list:
    return 0.0

  return sum(map(len, word_list)) / len(word_list)

# Average token length is computed on the raw 'Essay' text; one random row is shown as a spot check.
Data_Essay_01['Avg_Word_Count'] = Data_Essay_01['Essay'].apply(avg_word_count)
Data_Essay_01.sample()




Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,verb_count,noun_count,adj_count,conj_count,adverb_count,pNoun_count,Sent_Count,Word_Count,char_count,Avg_Word_Count
1387,1392,I think that computers have a good effect on p...,2.0,3.0,5.0,I think that computers have a good effect on p...,53,74,23,18,21,46,26,428,2056,3.92757


In [16]:
def Check_Spelling(Sentence):
  """
    Counts the misspelled tokens of an essay

    The text is tokenized with word_tokenize and each token is passed to
    check_word_spelling, which returns 1 for a token that differs from the
    spell checker's first suggestion and 0 otherwise.

    Args:
      Sentence: Essay text to be checked

    Returns:
      (int, int): the number of flagged tokens, followed by the constant 1
      that this function has always returned as its second element

  """
  # Constant second element; kept so the shape of the returned tuple does not change.
  abcd = 1

  # Reuse the per-token check instead of repeating the spellcheck logic here.
  count = sum(check_word_spelling(word) for word in word_tokenize(Sentence))

  return count, abcd

In [21]:
# Summary statistics for the numeric columns (scores and the extracted features).
Data_Essay_01.describe()

Unnamed: 0,ID,Rater_1 Score,Rater_2 Score,Total Score,verb_count,noun_count,adj_count,conj_count,adverb_count,pNoun_count,Sent_Count,Word_Count,char_count,Avg_Word_Count
count,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0,1783.0
mean,894.310151,4.260796,4.267527,8.528323,56.616938,78.758833,23.196859,14.91475,20.702187,45.899047,22.778463,417.560292,2029.385306,3.986131
std,516.143993,0.842119,0.816287,1.538565,19.063827,27.999883,10.18686,6.667468,9.910794,18.002772,9.083152,140.418553,690.591705,0.210709
min,1.0,1.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,10.0,47.0,3.339416
25%,448.5,4.0,4.0,8.0,44.0,60.0,16.0,10.0,14.0,34.0,17.0,322.5,1577.0,3.836833
50%,894.0,4.0,4.0,8.0,57.0,77.0,22.0,14.0,20.0,45.0,23.0,415.0,2018.0,3.981102
75%,1341.5,5.0,5.0,10.0,69.0,96.0,30.0,19.0,27.0,57.0,29.0,504.5,2450.0,4.123433
max,1787.0,6.0,6.0,12.0,125.0,192.0,70.0,44.0,66.0,120.0,68.0,956.0,4616.0,4.783439


In [None]:
# tempDf = pd.DataFrame()
# tempDf=pd.DataFrame(Data_Essay_01[0:1782].values, dtype=str , columns=Data_Essay_01.columns)
# tempDf

In [22]:
# Rebuild 'Preprocessed_Essay' from the raw essays: drop '@'-prefixed tokens, then
# replace punctuation with spaces. This overwrites the column written by
# preprocess_essays(), so the text is no longer passed through word_tokenize here.
Data_Essay_01["Preprocessed_Essay"] = Data_Essay_01["Essay"].apply(removal_of_NERs)
Data_Essay_01["Preprocessed_Essay"] = Data_Essay_01["Preprocessed_Essay"].apply(removePunctuations)

In [None]:
# Count spelling mistakes per preprocessed essay.
# Check_Spelling returns a (count, flag) tuple; keep only the count so the column
# holds integers rather than tuples.
Data_Essay_01["Spelling_Mistakes_Count"] = Data_Essay_01["Preprocessed_Essay"].map(lambda essay: Check_Spelling(essay)[0])


In [None]:
# Preview up to the first 1000 rows of the feature table.
# NOTE(review): a smaller preview (e.g. .head()) would keep the saved notebook lighter.
Data_Essay_01.head(1000)