In [1]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import re
import nltk 
from nltk.corpus import stopwords
from textblob import Word
import spacy
  

from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('content')
nlp = spacy.load("en_core_web_sm")
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Error loading content: Package 'content' not found in
[nltk_data]     index
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## **Feature Extraction**

In [3]:
from google.colab import drive
drive.mount('/content/drive')
Data_Essay_01 = pd.read_csv("/content/drive/MyDrive/IntelliTech-DataSet/EssaySet01.csv")
Data_Essay_01.head()

Mounted at /content/drive


Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0


#### Counting Spelling Mistakes

In [4]:
def removePunctuations(sentence):
  """
    Removes punctuations from text
    Args:
      sentence: Essay of each student
    
    Returns: 
      String

  """
  punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  newSentence = ""
  for word in sentence:
      if (word in punctuations):
          newSentence = newSentence + " "
      else: 
          newSentence = newSentence + word
  return newSentence

def SplitWords(sentence):
  """
    Creates tokens of sentence

    Args:
      sentence: Essay of each student
    
    Returns: 
      String[]
      
  """
  return sentence.split()

def check_word_spelling(word):
  """
    Checks spelling of each word

    Args:
      word: Words (Tokens) of each essay 
    
    Returns: 
      int
      
  """
  word = Word(word)
  
  result = word.spellcheck()

  # result [0][0] contains the bool value if the spelling is correct or not
  # result [0][1] contains the confidence for the suggest correct spelling

  if word != result[0][0]:
    # print(f'Spelling of "{word}" is not correct!')
    # print(f'Correct spelling of "{word}": "{result[0][0]}" (with {result[0][1]} confidence).')
    return 1
  return 0

In [5]:
# df_1 = data.query('essay_set == 1')
# cols = ["Essay" , "Number of Spelling Mistakes"]
# Mistakes = []
# for index, row in df_1.iterrows():
#   sentence = row["essay"]
#   PreProcessed0 = removePunctuations(sentence)
#   PreProcessed1 = PreProcessed0.lower()
#   Words = SplitWords(PreProcessed1)
#   count = 0 
#   for word in Words:
#     count = count + check_word_spelling(word)
#   Mistakes.append(count)

# Featured_df = pd.DataFrame(list(zip(df_1 , Mistakes)) , columns = cols)
# Featured_df.head()

## **Word Tokenization using Stanford CoreNLP**

### Removal Of Named Entity Recognition(NER)

In [14]:
def removal_of_NERs(essay):
  return ' '.join(word for word in essay.split(' ') if not word.startswith('@'))

### Word Tokenization

In [15]:
def tokenize_essaySet(essay):
    essayWithoutPuncs=removePunctuations(essay)
    return " ".join(word_tokenize(essayWithoutPuncs))

## Lower Case and Removal of Numbers 

In [16]:
def lower_case(essay):
  result = re.sub('[0-9]+','', result).lower() 

## White Spaces Removal


In [88]:
def remove_white_space(essay):
  return " ".join(essay.split())

In [17]:
def preprocess_essays():
  # Essay set without NERs
  Data_Essay_01['Preprocessed_Essay']=Data_Essay_01['Essay'].apply(removal_of_NERs)

  #Punctuation Removal + Word Tokens + Without NERs
  
  Data_Essay_01['Preprocessed_Essay']=Data_Essay_01['Preprocessed_Essay'].apply(tokenize_essaySet)
  

preprocess_essays()
Data_Essay_01

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Sent_Count,Word_Count,char_count,Avg_Word_Count,Preprocessed_Essay
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,16,386,1875,3.984456,Dear local newspaper I think effects computers...
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,20,464,2288,4.030172,Dear I believe that using computers will benef...
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,14,313,1541,4.035144,Dear More and more people use computers but no...
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,27,611,3165,4.328969,Dear Local Newspaper I have found that many ex...
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,30,517,2569,4.071567,Dear I know having computers has a positive ef...
...,...,...,...,...,...,...,...,...,...,...
1778,1783,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,21,548,2590,3.824818,Dear several reasons on way I that advances in...
1779,1784,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,18,235,1091,3.795745,Do a adults and kids spend to much time on the...
1780,1785,My opinion is that people should have computer...,4.0,4.0,8.0,18,314,1642,4.305732,My opinion is that people should have computer...
1781,1786,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,1,16,71,3.562500,Dear readers I think that its good and bad to ...


## **PART OF SPEECH COUNTS**

In [None]:
def pos_tag_count(preproc_essay):

  tagged_doc = nlp(preproc_essay)

  adj_count=0
  verb_count=0
  noun_count=0
  pNoun_count=0
  adverb_count=0
  conj_count=0

  for token in tagged_doc:
    print(token, token.pos_)
    
    if(token.pos_ == 'ADJ'):
      adj_count+=1
    
    elif(token.pos_ =='NOUN'):
      noun_count+=1

    elif (token.pos_ =='PRON'):
      pNoun_count+=1

    elif (token.pos_ =='VERB'):
      verb_count+=1

    elif (token.pos_ =='ADV'):
      adverb_count+=1
    
    elif(token.pos_=='CCONJ'):
      conj_count+=1

  return verb_count,noun_count, adj_count, conj_count, adverb_count,pNoun_count


In [None]:
Data_Essay_01['verb_count'], Data_Essay_01['noun_count'], Data_Essay_01['adj_count'], Data_Essay_01['conj_count'], Data_Essay_01['adverb_count'], Data_Essay_01['pNoun_count']=zip(*Data_Essay_01["Preprocessed_Essay"].map(pos_tag_count))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chat VERB
over ADP
the DET
web NOUN
you PRON
can AUX
Also ADV
its PRON
a DET
great ADJ
idea NOUN
if SCONJ
you PRON
want VERB
to PART
send VERB
a DET
small ADJ
mesage NOUN
like INTJ
see VERB
you PRON
at ADP
instead ADV
of ADP
saying VERB
it PRON
on ADP
the DET
phone NOUN
and CCONJ
paying VERB
more ADJ
when SCONJ
its PRON
free ADJ
on ADP
the DET
computer NOUN
The DET
computer NOUN
also ADV
teachs VERB
hand NOUN
eye NOUN
coordiantion NOUN
that PRON
is AUX
important ADJ
to PART
use VERB
in ADP
the DET
world NOUN
You PRON
can AUX
play VERB
games NOUN
take VERB
quiza NOUN
and CCONJ
tests NOUN
go VERB
to ADP
school NOUN
on ADP
the DET
internet NOUN
even ADV
listin NOUN
to ADP
music NOUN
that PRON
is AUX
important ADJ
for SCONJ
us PRON
to PART
express VERB
Also ADV
there PRON
are VERB
sites NOUN
that PRON
allow VERB
you PRON
to PART
learn VERB
how SCONJ
to PART
use VERB
a DET
keyboard NOUN
and CCONJ
work NOUN
electronics NOUN
lik

## **Sentence Count per Essay**

In [19]:
def sentence_count(essay):
    
    sentence_no = nltk.sent_tokenize(essay)
    return len(sentence_no)
  

Data_Essay_01['Sent_Count'] = Data_Essay_01['Essay'].apply(sentence_count)
Data_Essay_01

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Sent_Count,Word_Count,char_count,Avg_Word_Count,Preprocessed_Essay
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,16,386,1875,3.984456,Dear local newspaper I think effects computers...
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,20,464,2288,4.030172,Dear I believe that using computers will benef...
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,14,313,1541,4.035144,Dear More and more people use computers but no...
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,27,611,3165,4.328969,Dear Local Newspaper I have found that many ex...
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,30,517,2569,4.071567,Dear I know having computers has a positive ef...
...,...,...,...,...,...,...,...,...,...,...
1778,1783,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,21,548,2590,3.824818,Dear several reasons on way I that advances in...
1779,1784,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,18,235,1091,3.795745,Do a adults and kids spend to much time on the...
1780,1785,My opinion is that people should have computer...,4.0,4.0,8.0,18,314,1642,4.305732,My opinion is that people should have computer...
1781,1786,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,1,16,71,3.562500,Dear readers I think that its good and bad to ...


## **Words Count per Essay**

In [20]:
def word_count(essay):
  
  #cleaned_essay = re.sub('[^a-zA-Z]','',essay) 
  word_no = nltk.word_tokenize(essay)
  return len(word_no)
 
Data_Essay_01['Word_Count'] = Data_Essay_01['Essay'].apply(word_count)
Data_Essay_01.sample()

#Observation: These word count are more than the original count coz of nltk tokenization. Punctations are treated as seperate words.


Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Sent_Count,Word_Count,char_count,Avg_Word_Count,Preprocessed_Essay
1425,1430,"Dear Local Newspaper, @CAPS1 you think compute...",4.0,4.0,8.0,16,276,1389,4.134058,Dear Local Newspaper you think computers have ...


## **Character Count per Essay**

In [21]:
def char_count(essay):

  #cleaned_essay = re.sub('[^a-zA-Z]',' ',essay) 
  return len([character for character in essay])


Data_Essay_01['char_count'] = Data_Essay_01['Essay'].apply(char_count)
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Sent_Count,Word_Count,char_count,Avg_Word_Count,Preprocessed_Essay
725,728,Dear local newspaper did you know that the ave...,4.0,4.0,8.0,20,377,1855,4.018568,Dear local newspaper did you know that the ave...


## **Average Word Length of Essay**

In [22]:
# 
def avg_word_count(essay):

  word_list = nltk.word_tokenize(essay)
  total = sum(map(len, word_list))/len(word_list)
  return total

Data_Essay_01['Avg_Word_Count'] = Data_Essay_01['Essay'].apply(avg_word_count)
Data_Essay_01.sample()




Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Sent_Count,Word_Count,char_count,Avg_Word_Count,Preprocessed_Essay
710,713,"Dear, local news time editor using a @CAPS1 da...",5.0,5.0,10.0,22,387,1809,3.813953,Dear local news time editor using a daily has ...


## **Grammar Error Detection**

In [59]:
# from nltk.translate.bleu_score import sentence_bleu
# reference = result.text.split()

# candidate = 'Dear local newspaper, @CAPS1 best friend, @LOCATION2, was once a nerd with no hand-eye coordination, @CAPS2, he started to use a computer and now he has better hand-eye coordination than me.'.split()
# print('BLEU score -> {}'.format(sentence_bleu(reference, candidate )))

BLEU score -> 7.720899511627474e-232


In [49]:
df1 = Data_Essay_01[['Essay', 'Sent_Count']]
df1.head()

Unnamed: 0,Essay,Sent_Count
0,"Dear local newspaper, I think effects computer...",16
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",20
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",14
3,"Dear Local Newspaper, @CAPS1 I have found that...",27
4,"Dear @LOCATION1, I know having computers has a...",30


In [114]:
# Grammar Error via CFG

# def grammar_error(essays_1,sent_count):
#     sentences = nltk.sent_tokenize(essays_1[1])
#     for sent in range(0,sent_count):
#        wrong =1
#        sent_split = sentences[sent].split()  
#        tagged = nltk.pos_tag(sent_split) 
#        tags = [x[1].lower() for x in tagged] 

#        try:
#         parser = nltk.RecursiveDescentParser(grammar)
        
#         for tree in parser.parse(tags):
#             s = tree
#             wrong =0
#             print("Correct Grammar!!!!")
#             print("*"*20)
        
#         if wrong ==1:
#             print("Wrong Grammar!!!")
#             print("*"*20)
    
#        except ValueError:
#         print("Sorry! Some words are not covered in the grammar yet :)")

    
# essays_1 = df1['Essay_Clean'].tolist()
# sent_count = df1['Clean_Sent_Count'].tolist()
# grammar_error(essays_1,sent_count[1])


In [93]:
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')

In [92]:
df1 = Data_Essay_01[['Essay', 'Sent_Count']]
df1['Essay'] = df1['Essay'].apply(remove_white_space)   # to avoid whitespace error
df1['Essay']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0       Dear local newspaper, I think effects computer...
1       Dear @CAPS1 @CAPS2, I believe that using compu...
2       Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...
3       Dear Local Newspaper, @CAPS1 I have found that...
4       Dear @LOCATION1, I know having computers has a...
                              ...                        
1778    Dear @CAPS1, @CAPS2 several reasons on way I t...
1779    Do a adults and kids spend to much time on the...
1780    My opinion is that people should have computer...
1781    Dear readers, I think that its good and bad to...
1782    Dear - Local Newspaper I agree thats computers...
Name: Essay, Length: 1783, dtype: object

In [131]:
def grammar_errors(essays):
    matches = tool.check(essays)
    errors = []
    #language_tool_python.utils.correct(text, matches)   # to correct it
    for i in range(0, len(matches)):
      errors.append(matches[i].ruleId)  # or category of the error (Misc, Whitespace, Typography)
    return len(matches), errors

In [140]:
# Data_Essay_01['Grammar_Errors'], Data_Essay_01['Grammar_Error_List'] = zip(*df1_copy['Essay'].map(grammar_errors))
Data_Essay_01['Grammar_Errors'], Data_Essay_01['Grammar_Error_List'] = zip(*df1['Essay'].map(grammar_errors))

In [141]:
Data_Essay_01.head()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Sent_Count,Word_Count,char_count,Avg_Word_Count,Preprocessed_Essay,Grammar_Errors,Grammar_Error_List
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,16,386,1875,3.984456,Dear local newspaper I think effects computers...,16,"[MORFOLOGIK_RULE_EN_US, EN_CONTRACTION_SPELLIN..."
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,20,464,2288,4.030172,Dear I believe that using computers will benef...,25,"[MORFOLOGIK_RULE_EN_US, MORFOLOGIK_RULE_EN_US,..."
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,14,313,1541,4.035144,Dear More and more people use computers but no...,17,"[MORFOLOGIK_RULE_EN_US, CONFUSION_OF_MANS_MEN,..."
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,27,611,3165,4.328969,Dear Local Newspaper I have found that many ex...,29,"[MORFOLOGIK_RULE_EN_US, MORFOLOGIK_RULE_EN_US,..."
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,30,517,2569,4.071567,Dear I know having computers has a positive ef...,17,"[MORFOLOGIK_RULE_EN_US, MORFOLOGIK_RULE_EN_US,..."


In [183]:
out = Data_Essay_01['Grammar_Error_List'].explode().value_counts()
out

MORFOLOGIK_RULE_EN_US                          14260
COMMA_COMPOUND_SENTENCE                         1996
EN_CONTRACTION_SPELLING                         1190
UPPERCASE_SENTENCE_START                         888
SENT_START_CONJUNCTIVE_LINKING_ADVERB_COMMA      620
                                               ...  
AND_SO_ONE                                         1
WEATHER_WHETHER                                    1
FASTLY                                             1
SHELL_COMPOUNDS                                    1
SHUTDOWN                                           1
Name: Grammar_Error_List, Length: 724, dtype: int64

In [184]:
out.to_csv('GrammarErrors.csv')