## **Installing Packages**

In [1]:
!pip install textblob
!pip install sentencepiece  
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Restart the runtime after installing the packages above.

## **Importing Packages**

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import re
import nltk 
from nltk.corpus import stopwords
from textblob import Word
import spacy
  

from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used by the tokenizers, then load the spaCy English pipeline
# that Pos_Tag_Count reads through the module-level name `nlp`.
for nltk_resource in ('stopwords', 'punkt'):
  nltk.download(nltk_resource)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Reading Data From Google Drive**

In [7]:
from google.colab import drive
# Mount Google Drive and read the essay set into the frame every later cell works on.
# NOTE(review): the frame is called Data_Essay_01 but the file read is EssaySet08.csv - confirm this is intended.
drive.mount('/content/drive')
essay_csv_path = "/content/drive/MyDrive/IntelliTech-DataSet/EssaySet08.csv"
Data_Essay_01 = pd.read_csv(essay_csv_path)
Data_Essay_01.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Rater_3 Score,Domain 1 Total Score,Rater 1 Trait 1,Rater 1 Trait 2,Rater 1 Trait 3,Rater 1 Trait 4,...,Rater 2 Trait 3,Rater 2 Trait 4,Rater 2 Trait 5,Rater 2 Trait 6,Rater 3 Trait 1,Rater 3 Trait 2,Rater 3 Trait 3,Rater 3 Trait 4,Rater 3 Trait 5,Rater 3 Trait 6
0,20716,A long time ago when I was in third grade I h...,18.0,16.0,,34.0,4.0,4.0,4.0,4.0,...,4.0,4.0,3.0,3.0,,,,,,
1,20717,Softball has to be one of the single most gre...,21.0,26.0,46.0,46.0,5.0,4.0,5.0,4.0,...,6.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0
2,20718,"Some people like making people laugh, I love ...",15.0,20.0,40.0,40.0,3.0,3.0,3.0,3.0,...,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
3,20719,"""LAUGHTER"" @CAPS1 I hang out with my friends...",12.0,20.0,30.0,30.0,3.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0
4,20721,Well ima tell a story about the time i got @CA...,11.0,15.0,,26.0,3.0,2.0,3.0,3.0,...,3.0,3.0,3.0,3.0,,,,,,


# **Feature Extraction**

## **Essay Pre Processing**

In [8]:
def Remove_NER(Essay):
  """
    Drops every token that begins with '@' (e.g. @CAPS1, @LOCATION1) from an essay.

    The text is split on single space characters only, so tokens separated by
    other whitespace are not split apart and repeated spaces are preserved.

    Args:
      Essay: Essay text of a student

    Returns:
      String
  """
  kept_tokens = []
  for token in Essay.split(' '):
    if token.startswith('@'):
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)

def Remove_Punctuations(sentence):
  """
    Replaces every punctuation character in the text with a single space.

    Characters treated as punctuation: ! ( ) - [ ] { } ; : ' " \\ , < > . / ? @ # $ % ^ & * _ ~
    Any other character (including + = ` |) is left untouched.

    Args:
      sentence: Essay of each student

    Returns:
      String of the same length as the input
  """
  # The backslash is written as an explicit escape; the former '\,' was an
  # invalid escape sequence that newer Python versions warn about.
  punctuations = '!()-[]{};:\'"\\,<>./?@#$%^&*_~'
  # One translation pass instead of growing a string one character at a time.
  to_space = str.maketrans(punctuations, ' ' * len(punctuations))
  return sentence.translate(to_space)

def LowerCase_Words(Essay):
  """
    Deletes every run of ASCII digits from the essay and lower-cases the rest.

    Args:
      Essay: Essay text of a student

    Returns:
      String
  """
  without_digits = re.sub('[0-9]+','', Essay)
  return without_digits.lower()

def Tokenize_Essay(Essay):
    """
      Strips punctuation from an essay, tokenizes it with NLTK and joins the
      tokens back together with single spaces.

      Args:
        Essay: Essay text of a student

      Returns:
        String of space-separated tokens
    """
    tokens = word_tokenize(Remove_Punctuations(Essay))
    return " ".join(tokens)

def Remove_White_Spaces(Essay):
  """
    Collapses every run of whitespace into one space and trims both ends.

    Args:
      Essay: Essay text of a student

    Returns:
      String
  """
  pieces = Essay.split()
  return " ".join(pieces)

Removing NERs and punctuation, then tokenizing (lower casing is not applied in this cell)

In [9]:
# Drop '@' tokens first, then strip punctuation and re-tokenize; the raw 'Essay' column is left untouched.
Data_Essay_01['Preprocessed_Essay'] = (
    Data_Essay_01['Essay']
    .apply(Remove_NER)
    .apply(Tokenize_Essay)
)
Data_Essay_01.head()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Rater_3 Score,Domain 1 Total Score,Rater 1 Trait 1,Rater 1 Trait 2,Rater 1 Trait 3,Rater 1 Trait 4,...,Rater 2 Trait 4,Rater 2 Trait 5,Rater 2 Trait 6,Rater 3 Trait 1,Rater 3 Trait 2,Rater 3 Trait 3,Rater 3 Trait 4,Rater 3 Trait 5,Rater 3 Trait 6,Preprocessed_Essay
0,20716,A long time ago when I was in third grade I h...,18.0,16.0,,34.0,4.0,4.0,4.0,4.0,...,4.0,3.0,3.0,,,,,,,A long time ago when I was in third grade I ha...
1,20717,Softball has to be one of the single most gre...,21.0,26.0,46.0,46.0,5.0,4.0,5.0,4.0,...,6.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,Softball has to be one of the single most grea...
2,20718,"Some people like making people laugh, I love ...",15.0,20.0,40.0,40.0,3.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,Some people like making people laugh I love it...
3,20719,"""LAUGHTER"" @CAPS1 I hang out with my friends...",12.0,20.0,30.0,30.0,3.0,3.0,3.0,3.0,...,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,LAUGHTER I hang out with my friends the one th...
4,20721,Well ima tell a story about the time i got @CA...,11.0,15.0,,26.0,3.0,2.0,3.0,3.0,...,3.0,3.0,3.0,,,,,,,Well ima tell a story about the time i got dow...


## **Basic Count Features**

#### 1. Counting Sentences per Essay

In [None]:
def Sentence_Count(Essay):
    """
    Number of sentences NLTK's sentence tokenizer finds in an essay.

    Args:
      Essay: Essay text of a student

    Returns:
      int
    """
    return len(nltk.sent_tokenize(Essay))
  
# Sentence counts are taken from the raw 'Essay' column, not from 'Preprocessed_Essay'.
Data_Essay_01['Sent_Count'] = Data_Essay_01['Essay'].apply(Sentence_Count)
Data_Essay_01

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,dear local newspaper i think effects computers...,16
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,dear i believe that using computers will benef...,20
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,dear more and more people use computers but no...,14
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,dear local newspaper i have found that many ex...,27
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,dear i know having computers has a positive ef...,30
...,...,...,...,...,...,...,...
1778,1783,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,dear several reasons on way i that advances in...,21
1779,1784,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,do a adults and kids spend to much time on the...,18
1780,1785,My opinion is that people should have computer...,4.0,4.0,8.0,my opinion is that people should have computer...,18
1781,1786,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,dear readers i think that its good and bad to ...,1


#### 2. Counting Words per Essay

**Observation:** These word counts are higher than the original counts because of NLTK tokenization: punctuation marks are treated as separate words.


In [None]:
def Word_Count(Essay):
  """
    Number of tokens NLTK's word tokenizer produces for an essay.

    Args:
      Essay: Essay text of a student

    Returns:
      int
  """
  return len(nltk.word_tokenize(Essay))
 
# Token counts are taken from the raw 'Essay' column; a single random row is shown as a spot check.
Data_Essay_01['Word_Count'] = Data_Essay_01['Essay'].apply(Word_Count)
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count
351,354,"Dear local Newspaper, @CAPS1 the years compute...",4.0,4.0,8.0,dear local newspaper the years computers has b...,20,302


#### 3. Counting Characters per Essay

In [None]:
def Char_Count(Essay):
  """
    Number of characters in an essay, counting spaces and punctuation.

    Args:
      Essay: Essay text of a student

    Returns:
      int
  """
  return sum(1 for _ in Essay)

# Character counts are taken from the raw 'Essay' column; a single random row is shown as a spot check.
Data_Essay_01['Char_Count'] = Data_Essay_01['Essay'].apply(Char_Count)
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count,Char_Count
1094,1097,"Dear fellow citizens, @CAPS1 think if you had ...",5.0,5.0,10.0,dear fellow citizens think if you had to get a...,23,447,2099


#### 4. Average Word Length of Essay

In [None]:
def Avg_Word_Count(Essay):
  """
    Calculates the average token length (in characters) of an essay.

    Tokens come from NLTK's word tokenizer, so punctuation marks that it
    splits off count as tokens too.

    Args:
      Essay: Essay text of a student

    Returns:
      float; 0.0 when the essay yields no tokens
  """
  word_list = nltk.word_tokenize(Essay)
  # An empty or whitespace-only essay produces no tokens; avoid dividing by zero.
  if not word_list:
    return 0.0
  return sum(map(len, word_list)) / len(word_list)

# Despite the column name, the stored value is the mean token length that Avg_Word_Count returns.
Data_Essay_01['Avg_Word_Count'] = Data_Essay_01['Essay'].apply(Avg_Word_Count)
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count,Char_Count,Avg_Word_Count
1114,1117,Statistics show that more and more people ever...,3.0,4.0,7.0,statistics show that more and more people ever...,14,372,1793,3.948925


## **Parts Of Speech Counts**

In [None]:
def Pos_Tag_Count(Essay):
  """
    Tallies selected coarse part-of-speech tags that spaCy assigns to an essay.

    Args:
      Essay: Essay text of a student

    Returns:
      Tuple of six ints, in this order: counts of VERB, NOUN, ADJ, CCONJ,
      ADV and PRON tokens. Tokens with any other tag are ignored.
  """
  # Keys are the spaCy `pos_` values of interest; insertion order mirrors the return order.
  tally = {'VERB': 0, 'NOUN': 0, 'ADJ': 0, 'CCONJ': 0, 'ADV': 0, 'PRON': 0}

  for token in nlp(Essay):
    if token.pos_ in tally:
      tally[token.pos_] += 1

  return (tally['VERB'], tally['NOUN'], tally['ADJ'],
          tally['CCONJ'], tally['ADV'], tally['PRON'])

In [None]:
# Pos_Tag_Count returns (verb, noun, adj, conj, adverb, pNoun) per essay; zip(*...) turns that
# sequence of 6-tuples into six column-wise sequences, assigned in the same order.
Data_Essay_01['Verb_Count'], Data_Essay_01['Noun_Count'], Data_Essay_01['Adj_Count'], Data_Essay_01['Conj_Count'], Data_Essay_01['Adverb_Count'], Data_Essay_01['pNoun_Count']=zip(*Data_Essay_01["Preprocessed_Essay"].map(Pos_Tag_Count))
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count,Char_Count,Avg_Word_Count,Verb_Count,Noun_Count,Adj_Count,Conj_Count,Adverb_Count,pNoun_Count
1065,1068,"Computers are a huge impact on society, but do...",5.0,4.0,9.0,computers are a huge impact on society but do ...,32,566,2670,3.791519,83,90,38,25,34,86


# **Evaluating Writing Attributes**

## **Style**

### **Mechanics**

#### Counting Spelling Mistakes

In [57]:
def Check_Spelling(Sentence):
  """
    Counts the tokens of a text whose top TextBlob spelling suggestion
    differs from the token itself.

    Args:
      Sentence: Essay text; it is tokenized here with word_tokenize

    Returns:
      int
  """
  mistakes = 0
  for token in word_tokenize(Sentence):
    candidate = Word(token)
    # spellcheck() yields (suggestion, confidence) pairs; [0][0] is the first suggestion.
    top_suggestion = candidate.spellcheck()[0][0]
    if candidate != top_suggestion:
      mistakes += 1
  return mistakes

In [58]:
# Rebuild 'Preprocessed_Essay' from the raw text: '@' tokens dropped, punctuation replaced by spaces.
# This overwrites whatever an earlier cell stored in the column.
Data_Essay_01["Preprocessed_Essay"] = (
    Data_Essay_01["Essay"]
    .apply(Remove_NER)
    .apply(Remove_Punctuations)
)

In [None]:
# One integer per essay: the value Check_Spelling returns for that row's 'Preprocessed_Essay' text.
Data_Essay_01["Spelling_Mistakes_Count"]  = Data_Essay_01["Preprocessed_Essay"].map(Check_Spelling)

#### Checking Punctuation Mistakes **(Incomplete)**

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification , pipeline

# Tokenizer for the 'oliverguhr/fullstop-punctuation-multilang-large' checkpoint; passed to the pipeline later.
tokenizer = AutoTokenizer.from_pretrained('oliverguhr/fullstop-punctuation-multilang-large')

In [11]:
# Token-classification model loaded from the same checkpoint name as the tokenizer.
model = AutoModelForTokenClassification.from_pretrained('oliverguhr/fullstop-punctuation-multilang-large')

In [12]:
# 'ner' pipeline over the model/tokenizer above; later cells read each result's 'word' and 'entity' fields.
pun = pipeline('ner' , model = model , tokenizer = tokenizer)

Correcting Spelling Mistakes

In [13]:
def Correct_Spelling(Sentence, Min_Confidence=0.95):
  """
    Replaces misspelled words in a text with TextBlob's top suggestion when
    the suggestion's confidence is high enough.

    Each corrected (word, suggestion) pair is printed as it is applied.

    Args:
      Sentence: Essay text; it is tokenized here with word_tokenize
      Min_Confidence: A suggestion is applied only when its confidence is
        strictly greater than this value (default 0.95)

    Returns:
      String with the accepted corrections applied
  """
  for token in word_tokenize(Sentence):
    word = Word(token)

    # spellcheck() yields (suggestion, confidence) pairs; take the first one.
    suggestion, confidence = word.spellcheck()[0]

    if word == suggestion or confidence <= Min_Confidence or "/" in word:
      continue

    # Strings are immutable, so the result has to be assigned back; the earlier
    # bare `Sentence.replace(...)` call discarded it and returned the input unchanged.
    # The lookarounds restrict the replacement to whole tokens so a short
    # misspelling is not rewritten inside a longer word.
    pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
    Sentence = re.sub(pattern, lambda _match: suggestion, Sentence)
    print(word , suggestion)
  return Sentence

In [None]:
# Try the spelling corrector on one essay (row label 1) and display the returned text.
text = Correct_Spelling(Data_Essay_01['Preprocessed_Essay'][1])
text

Checking Punctuation Mistakes

In [15]:
# Run the punctuation pipeline on the text returned by Correct_Spelling.
output = pun(text)

In [16]:
# Stitch the pipeline output back into text: the '▁' marker becomes a space and the
# predicted label is appended after each piece, with the '0' label dropped.
new_string = ''.join(
  piece['word'].replace('▁' , ' ') + piece['entity'].replace('0', '')
  for piece in output
)

new_string

' Softball has to be one of the single most greatest sports ali.ve;. playing softball in college has always been a goal of mine. I love the dirt that sticks to your face, the sweat dripping from your forehead, and the gallons and gallons of water you poor all over yourself to keep cool in the bli.s.tering. @.CAP.S.2.. Although I love soft,ball, I feel that the memories you acquire from the times you have with your teammates, are the things you remember the most through out your softball career. I have always had great memories through soft-ball- many laughs, tears, and frustration.s. so when I had the opportunity to play on a top notch team, I looked forward to all the many more memories I would have... Hoo,d, @.CAP.S.1. @CAPS2 was my team name, I had played almost four years with this local team. Many of these girls were like sisters to me. they had treated me amazingly through out my years playing with them. I felt like I had my set team, I was going to play with these girls all thou

#### Checking Capitalization Mistakes

In [43]:
import nltk  # also imported in the imports cell at the top of the notebook
# Tagger data downloaded ahead of the pos_tag call in the capitalization check.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [55]:
from nltk.tag import pos_tag
def Check_Captialization(Essay):
  """
    Counts likely capitalization mistakes in an essay.

    A mistake is counted when a token starts with a lowercase letter and
      * it is the first token of the essay, or
      * it directly follows a sentence terminator ('.', '!', '?'), or
      * it directly follows an opening double quote, or
      * the POS tagger labels it as a proper noun (NNP).
    A token matching more than one rule is counted once per rule.

    Only the first character is inspected, so acronyms ("USA") and
    mixed-case names ("McDonald") are not reported as mistakes.

    Args:
      Essay: Essay text; it is tokenized here with word_tokenize

    Returns:
      int
  """
  def starts_lowercase(token):
    # Slicing keeps this safe for empty tokens; digits and symbols are never "lowercase".
    return token[:1].islower()

  words = word_tokenize(Essay)
  count = 0

  # word_tokenize rewrites an opening double quote as '``', so a literal '"' is
  # normally never emitted; it is kept for input that preserves it.
  capital_expected_after = {'.', '!', '?', '``', '"'}

  # The very first word starts a sentence as well.
  if words and starts_lowercase(words[0]):
    count += 1

  # Checking capital letters at the start of every sentence and every quote
  for previous, current in zip(words, words[1:]):
    if previous in capital_expected_after and starts_lowercase(current):
      count += 1

  # Checking that all proper nouns are capitalized
  for word, pos in pos_tag(words):
    if pos == 'NNP' and starts_lowercase(word):
      count += 1

  return count

In [56]:
# Spot check: capitalization count for the essay at row label 9 after dropping '@' tokens.
Check_Captialization(Remove_NER(Data_Essay_01["Essay"][9]))

2

In [47]:
# Display the essay at row label 12 with '@' tokens removed, for manual inspection.
Remove_NER(Data_Essay_01["Essay"][12])

'dear local newspaper i raed ur argument on the computers and i think they are a positive effect on people. the first reson i think they are a good effect is because you can do so much with them like if you live in mane and ur cuzin lives in califan you and him could have a wed chat. the second thing you could do is look up news any were in the world you could be stuck on a plane and it would be vary boring when you can take but ur computer and go on ur computer at work and start doing work. when you said it takes away from exirsis well some people use the computer for that too to chart how fast they run or how meny miles they want and sometimes what they eat. the thrid reson is some peolpe jobs are on the computers or making computers for exmple when you made this artical you didnt use a type writer you used a computer and printed it out if we didnt have computers it would make ur a lot harder. thank you for reading and whe you are thinking adout it agen pleas consiter my thrie resons

### **Grammar Error Detection**

In [None]:
# from nltk.translate.bleu_score import sentence_bleu
# reference = result.text.split()

# candidate = 'Dear local newspaper, @CAPS1 best friend, @LOCATION2, was once a nerd with no hand-eye coordination, @CAPS2, he started to use a computer and now he has better hand-eye coordination than me.'.split()
# print('BLEU score -> {}'.format(sentence_bleu(reference, candidate )))

In [None]:
# Two-column subset: the raw essay text and its sentence count.
df1 = Data_Essay_01[['Essay', 'Sent_Count']]
df1.head()

In [None]:
# Grammar Error via CFG

# def grammar_error(essays_1,sent_count):
#     sentences = nltk.sent_tokenize(essays_1[1])
#     for sent in range(0,sent_count):
#        wrong =1
#        sent_split = sentences[sent].split()  
#        tagged = nltk.pos_tag(sent_split) 
#        tags = [x[1].lower() for x in tagged] 

#        try:
#         parser = nltk.RecursiveDescentParser(grammar)
        
#         for tree in parser.parse(tags):
#             s = tree
#             wrong =0
#             print("Correct Grammar!!!!")
#             print("*"*20)
        
#         if wrong ==1:
#             print("Wrong Grammar!!!")
#             print("*"*20)
    
#        except ValueError:
#         print("Sorry! Some words are not covered in the grammar yet :)")

    
# essays_1 = df1['Essay_Clean'].tolist()
# sent_count = df1['Clean_Sent_Count'].tolist()
# grammar_error(essays_1,sent_count[1])

In [None]:
import language_tool_python
# Checker configured for US English; Grammar_Errors reads this module-level `tool`.
tool = language_tool_python.LanguageTool('en-US')

In [None]:
# Take an explicit copy so the column assignment below writes to an independent frame
# instead of a subset of Data_Essay_01 (which triggers SettingWithCopyWarning).
df1 = Data_Essay_01[['Essay', 'Sent_Count']].copy()
df1['Essay'] = df1['Essay'].apply(Remove_White_Spaces)   # to avoid whitespace error
df1['Essay']

In [None]:
def Grammar_Errors(essays):
    """
    Runs the module-level LanguageTool checker on one essay.

    Args:
      essays: Text of a single essay

    Returns:
      (int, list): the number of matches reported and the ruleId of each
      match, in the order the checker returned them
    """
    found = tool.check(essays)
    rule_ids = [match.ruleId for match in found]
    return len(found), rule_ids

In [None]:
# Grammar_Errors returns (match count, rule ids) per essay; zip(*...) splits those pairs
# into one sequence of counts and one sequence of rule-id lists.
Data_Essay_01['Grammar_Errors'], Data_Essay_01['Grammar_Error_List'] = zip(*df1['Essay'].map(Grammar_Errors))

In [None]:
# Preview the first rows, which now include the grammar columns.
Data_Essay_01.head()

In [None]:
# Flatten the per-essay rule-id lists into one entry per row, then tally how often each rule id occurs.
out = Data_Essay_01['Grammar_Error_List'].explode().value_counts()
out

In [None]:
# Write the rule-id frequency table to GrammarErrors.csv (relative path, so it lands in the working directory).
out.to_csv('GrammarErrors.csv')

### **Lexical Sophistication**

## **Content**

### **Latent Semantic Analysis (LSA)**

Content analysis generally implies only a high-level semantic analysis and comparison with source text and graded essays

## **Semantic**
Semantic metrics assess the correctness of content connotation

## **Semantic Coherence & Consistency**

## **Connectivity**