## **Installing Packages**

In [None]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## **Importing Packages**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import re
import nltk 
from nltk.corpus import stopwords
from textblob import Word
import spacy
  

from nltk.tokenize import word_tokenize
# Fetch the NLTK resources named below (the recorded cell output shows them
# being unpacked under /root/nltk_data).
nltk.download('stopwords')
nltk.download('punkt')
# spaCy pipeline shared by the part-of-speech step (Pos_Tag_Count calls nlp(...)).
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Reading Data From Google Drive**

In [2]:
# Colab-only step: mount Google Drive so the dataset path below resolves.
from google.colab import drive
drive.mount('/content/drive')
# Load essay set 01 into a DataFrame; later cells add feature columns to it.
Data_Essay_01 = pd.read_csv("/content/drive/MyDrive/IntelliTech-DataSet/EssaySet01.csv")
Data_Essay_01.head()

Mounted at /content/drive


Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0


# **Feature Extraction**

## **Essay Pre Processing**

In [3]:
def Remove_NER(Essay):
  """
    Lower-cases an essay and drops every space-separated token that
    begins with '@' (the dataset's entity tags such as '@CAPS1')

    Args:
      Essay: Essay of each student

    Returns:
      String

  """
  kept_tokens = []
  for token in Essay.lower().split(' '):
    # A tag is dropped together with any punctuation glued to it (e.g. '@caps2,').
    if token.startswith('@'):
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)

def Remove_Punctuations(sentence):
  """
    Replaces every punctuation character in the text with a single space

    Args:
      sentence: Essay of each student

    Returns:
      String (same length as the input string)
  """
  # The backslash is written as an explicit escape ('\\'): the previous '\,' was
  # an invalid escape sequence that newer Python versions warn about. The set of
  # characters is unchanged.
  punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
  # str.join builds the result in one pass instead of growing a string per character.
  return "".join(" " if character in punctuations else character for character in sentence)

def LowerCase_Words(Essay):
  """
    Strips every run of digits from an essay and lower-cases what is left

    Args:
      Essay: Essay of each student

    Returns:
      String
  """
  without_digits = re.sub('[0-9]+','', Essay)
  return without_digits.lower()

def Tokenize_Essay(Essay):
    """
    Strips punctuation from an essay, tokenizes it and re-joins the tokens

    Args:
      Essay: Essay of each student

    Returns:
      String with the tokens separated by single spaces
    """
    tokens = word_tokenize(Remove_Punctuations(Essay))
    return " ".join(tokens)

Removing NERs, Punctuations and Lower Casing

In [4]:
# Remove_NER lower-cases and drops the '@' tags; Tokenize_Essay then strips
# punctuation and re-joins the tokens.
Data_Essay_01['Preprocessed_Essay'] = (
    Data_Essay_01['Essay']
    .apply(Remove_NER)
    .apply(Tokenize_Essay)
)
Data_Essay_01.head()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,dear local newspaper i think effects computers...
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,dear i believe that using computers will benef...
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,dear more and more people use computers but no...
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,dear local newspaper i have found that many ex...
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,dear i know having computers has a positive ef...


## **Basic Count Features**

#### 1. Counting Sentences per Essay

In [5]:
def Sentence_Count(Essay):
    """
    Returns how many sentences nltk.sent_tokenize finds in an essay

    Args:
      Essay: Essay of each student

    Returns:
      int
    """
    return len(nltk.sent_tokenize(Essay))
  
# Sentences are counted on the raw 'Essay' column, not on 'Preprocessed_Essay'.
Data_Essay_01['Sent_Count'] = Data_Essay_01['Essay'].apply(Sentence_Count)
Data_Essay_01

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,dear local newspaper i think effects computers...,16
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,dear i believe that using computers will benef...,20
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,dear more and more people use computers but no...,14
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,dear local newspaper i have found that many ex...,27
4,5,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,dear i know having computers has a positive ef...,30
...,...,...,...,...,...,...,...
1778,1783,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,dear several reasons on way i that advances in...,21
1779,1784,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,do a adults and kids spend to much time on the...,18
1780,1785,My opinion is that people should have computer...,4.0,4.0,8.0,my opinion is that people should have computer...,18
1781,1786,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,dear readers i think that its good and bad to ...,1


#### 2. Counting Words per Essay

**Observation:** These word counts are higher than the original counts because of NLTK tokenization: punctuation marks are treated as separate words.


In [6]:
def Word_Count(Essay):
  """
    Returns the number of tokens nltk.word_tokenize produces for an essay

    Args:
      Essay: Essay of each student

    Returns:
      int

  """
  return len(nltk.word_tokenize(Essay))
 
# Tokens are counted on the raw 'Essay' column.
Data_Essay_01['Word_Count'] = Data_Essay_01['Essay'].apply(Word_Count)
# sample() is unseeded here, so the row displayed differs between runs.
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count
351,354,"Dear local Newspaper, @CAPS1 the years compute...",4.0,4.0,8.0,dear local newspaper the years computers has b...,20,302


#### 3. Counting Characters per Essay

In [7]:
def Char_Count(Essay):
  """
    Counts characters in an essay (every character counts, including
    spaces and punctuation)

    Args:
      Essay: Essay of each student

    Returns:
      int

  """
  # len() gives the same count without materialising a throwaway list.
  return len(Essay)

# Characters are counted on the raw 'Essay' column.
Data_Essay_01['Char_Count'] = Data_Essay_01['Essay'].apply(Char_Count)
# sample() is unseeded here, so the row displayed differs between runs.
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count,Char_Count
1094,1097,"Dear fellow citizens, @CAPS1 think if you had ...",5.0,5.0,10.0,dear fellow citizens think if you had to get a...,23,447,2099


#### 4. Average Word Length of Essay

In [8]:
def Avg_Word_Count(Essay):
  """
    Calculates the average word length (characters per token) of an essay

    Every token returned by nltk.word_tokenize is included, so the result
    depends on how that tokenizer splits the text.

    Args:
      Essay: Essay of each student

    Returns:
      float (0.0 when the essay yields no tokens)

  """
  word_list = nltk.word_tokenize(Essay)
  # An essay that yields no tokens (e.g. an empty string) would otherwise raise
  # ZeroDivisionError below and abort the whole .apply() over the column.
  if not word_list:
    return 0.0
  return sum(map(len, word_list)) / len(word_list)

# Avg_Word_Count is computed on the raw 'Essay' column.
Data_Essay_01['Avg_Word_Count'] = Data_Essay_01['Essay'].apply(Avg_Word_Count)
# sample() is unseeded here, so the row displayed differs between runs.
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count,Char_Count,Avg_Word_Count
1114,1117,Statistics show that more and more people ever...,3.0,4.0,7.0,statistics show that more and more people ever...,14,372,1793,3.948925


## **Parts Of Speech Counts**

In [13]:
def Pos_Tag_Count(Essay):
  """
    Tallies selected spaCy coarse part-of-speech tags (token.pos_) over an essay

    Args:
      Essay: Essay of each student

    Returns:
      Tuple of six ints, in this order:
      VERB, NOUN, ADJ, CCONJ, ADV, PRON
      (PRON fills the slot callers store as 'pNoun_Count')

  """
  # One counter per tag of interest; tokens carrying any other tag are ignored.
  tallies = {'VERB': 0, 'NOUN': 0, 'ADJ': 0, 'CCONJ': 0, 'ADV': 0, 'PRON': 0}

  for token in nlp(Essay):
    tag = token.pos_
    if tag in tallies:
      tallies[tag] += 1

  # Callers unpack the tuple positionally, so this order must not change.
  return (tallies['VERB'], tallies['NOUN'], tallies['ADJ'],
          tallies['CCONJ'], tallies['ADV'], tallies['PRON'])

In [12]:
# Pos_Tag_Count returns one 6-tuple per essay; zip(*...) regroups those tuples
# into six per-tag sequences. The target columns are listed in the same order as
# the function's return statement (verb, noun, adj, conj, adverb, pNoun).
Data_Essay_01['Verb_Count'], Data_Essay_01['Noun_Count'], Data_Essay_01['Adj_Count'], Data_Essay_01['Conj_Count'], Data_Essay_01['Adverb_Count'], Data_Essay_01['pNoun_Count']=zip(*Data_Essay_01["Preprocessed_Essay"].map(Pos_Tag_Count))
# sample() is unseeded here, so the row displayed differs between runs.
Data_Essay_01.sample()

Unnamed: 0,ID,Essay,Rater_1 Score,Rater_2 Score,Total Score,Preprocessed_Essay,Sent_Count,Word_Count,Char_Count,Avg_Word_Count,Verb_Count,Noun_Count,Adj_Count,Conj_Count,Adverb_Count,pNoun_Count
1065,1068,"Computers are a huge impact on society, but do...",5.0,4.0,9.0,computers are a huge impact on society but do ...,32,566,2670,3.791519,83,90,38,25,34,86


## **Mechanics**

#### Counting Spelling Mistakes

In [14]:
def Check_Spelling(Sentence):
  """
    Counts the tokens of a text whose top spelling suggestion differs
    from the token itself

    Args:
      Sentence: Text to check; it is tokenized here before checking

    Returns:
      int

  """
  mistakes = 0
  for token in word_tokenize(Sentence):
    candidate = Word(token)
    # spellcheck() yields (suggested_word, confidence) pairs; only the word of
    # the first pair is compared against the token.
    top_suggestion = candidate.spellcheck()[0][0]
    if candidate != top_suggestion:
      mistakes += 1
  return mistakes

In [None]:
# Build the spell-check input from the raw essays WITHOUT reassigning
# Data_Essay_01["Preprocessed_Essay"]: overwriting it here would replace the
# tokenized, punctuation-free text stored by the preprocessing cell and make the
# column's content depend on which cells have been run.
listOfEssays = Data_Essay_01["Essay"].apply(Remove_NER).apply(Remove_Punctuations)

In [None]:
# Check_Spelling returns a single int per essay, so the mapped Series is the
# column itself; wrapping it in zip(*...) tries to iterate over ints and fails.
Data_Essay_01["Spelling_Mistakes_Count"] = listOfEssays.map(Check_Spelling)