# Enriching Student Essays
Using spaCy to enrich the cleaned essays with lemmas, part-of-speech tags, and named entities.

## Setup

In [1]:
#Import os and glob (standard-library file and path utilities)
import glob 
import os

#Import pandas (DataFrames)
import pandas as pd

#Import numpy
import numpy as np

#Import the Natural Language Toolkit (NLTK)
#If NLTK is not installed yet, uncomment the next line and run it once
#!pip install nltk
import nltk

#Import matplotlib for visualizations
import matplotlib.pyplot as plt

#Import spaCy itself, necessary to use its features
#If spaCy is not installed yet, uncomment the next line and run it once
#!pip install spaCy
import spacy
#Load the natural language processing pipeline ("en_core_web_sm" model);
#the resulting `nlp` object is used by the enrichment cells in this notebook
nlp = spacy.load("en_core_web_sm")
#Import the spaCy visualizer
from spacy import displacy

from scipy import stats

import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # For log output; NOTE(review): original comment mentions monitoring gensim, which is not imported here - confirm still needed

In [2]:
#Directory that holds the cleaned essay files
#NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere
DATA_DIR = "/Users/megankane/Desktop/clean_texts"

#Show the directory the kernel is currently working in
path = os.getcwd()
print(path)

#Change working directory.
#os.chdir() returns None, so its result must not be assigned to `path`;
#refresh `path` from os.getcwd() afterwards so it holds the new directory.
os.chdir(DATA_DIR)
path = os.getcwd()

/Users/megankane/Documents/Dissertation_Research


In [6]:
#Read the cleaned student essays (CSV in the current working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer=r'cleaned_essays.csv')

#Preview the first rows
df.head()

Unnamed: 0,ID,Text,Text_Newlines,Lower_Text,NoPunct_Text,Clean_Text,Text_NoStops
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng april fina...,aysha shaukat professor kane eng april final r...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng febru...,amaya whipple professor megan kane eng februar...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,fallt 1 sela fallt professor kane eng-0802 12-...,fallt 1 sela fallt professor kane eng 0802 12 ...,fallt sela fallt professor kane eng final...,fallt sela fallt professor kane eng final refl...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng september...,maya king professor megan kane eng september f...


In [10]:
#Start the enrichment table from an independent copy of the ID and Clean_Text columns
enriched_df = df.loc[:, ['ID', 'Clean_Text']].copy()

## Lemmatization

In [17]:
#Lemmatize every essay in Clean_Text
#The dependency parser and NER are disabled while the docs are processed,
#since only the lemma of each token is read
with nlp.disable_pipes('parser', 'ner'):
    essay_texts = enriched_df.Clean_Text.astype('unicode').values
    #One inner list of lemmas per essay, in the same order as the rows
    lemma_list = [
        [tok.lemma_ for tok in parsed]
        for parsed in nlp.pipe(essay_texts, batch_size=100)
    ]

#Store the lemmas as one space-separated string per essay
enriched_df['Lemma_Text'] = [' '.join(str(lemma) for lemma in lemmas) for lemmas in lemma_list]

#Check lemmas
enriched_df.head()

Unnamed: 0,ID,Clean_Text,POS_Text,Proper_Nouns,NER_Text,NER_Words,Lemma_Text
0,"Score: 92.0, ID: 73.txt",aysha shaukat professor kane eng april fina...,PROPN PROPN PROPN PROPN PROPN SPACE PROPN SPAC...,"aysha, shaukat, professor, kane, eng, april, t...",kane eng april first second three two three...,kane eng april first second three two three...,aysha shaukat professor kane eng april fi...
1,"Score: 93.0, ID: 96.txt",abigail sensenig professor stefan analytical r...,PROPN PROPN PROPN PROPN ADJ NOUN CCONJ VERB SP...,"abigail, sensenig, professor, stefan, septembe...",january daily anna orso s billypenn three phil...,january daily anna orso s billypenn three phil...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",amaya whipple professor megan kane eng febru...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"amaya, whipple, professor, megan, kane, eng, f...",amaya megan kane eng february one megan kan...,amaya megan kane eng february one megan kan...,amaya whipple professor megan kane eng febr...
3,"Score: 86.0, ID: 3.txt",fallt sela fallt professor kane eng final...,PROPN SPACE PROPN PROPN PROPN PROPN PROPN SPAC...,"fallt, sela, fallt, professor, kane, eng, ross...",fallt sela fallt kane eng this year the year ...,fallt sela fallt kane eng this year the year ...,fallt sela fallt professor kane eng fin...
4,"Score: 95.0, ID: 63.txt",maya king professor megan kane eng september...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"maya, king, professor, megan, kane, eng, septe...",maya king megan kane eng september the anna...,maya king megan kane eng september the anna...,maya king professor megan kane eng septembe...


## Part of Speech Tagging

In [12]:
#Tag every essay in Clean_Text with part-of-speech labels (token.pos_)
essay_texts = enriched_df.Clean_Text.astype('unicode').values

#The dependency parser and NER are disabled while the docs are processed,
#since only the POS tag of each token is read
with nlp.disable_pipes('parser', 'ner'):
    #One inner list of tags per essay, in the same order as the rows
    pos_list = [
        [tok.pos_ for tok in parsed]
        for parsed in nlp.pipe(essay_texts, batch_size=100)
    ]

#Store the tags as one space-separated string per essay
enriched_df['POS_Text'] = [' '.join(map(str, tags)) for tags in pos_list]

#Check pos tags
enriched_df.head()


Unnamed: 0,ID,Clean_Text,POS_Text
0,"Score: 92.0, ID: 73.txt",aysha shaukat professor kane eng april fina...,PROPN PROPN PROPN PROPN PROPN SPACE PROPN SPAC...
1,"Score: 93.0, ID: 96.txt",abigail sensenig professor stefan analytical r...,PROPN PROPN PROPN PROPN ADJ NOUN CCONJ VERB SP...
2,"Score: 86.0, ID: 76.txt",amaya whipple professor megan kane eng febru...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...
3,"Score: 86.0, ID: 3.txt",fallt sela fallt professor kane eng final...,PROPN SPACE PROPN PROPN PROPN PROPN PROPN SPAC...
4,"Score: 95.0, ID: 63.txt",maya king professor megan kane eng september...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...


In [14]:
#Collect the proper nouns (tokens whose POS tag is PROPN) found in each essay
propnoun_list = []

# Disable Dependency Parser, and NER since all we want is POS
with nlp.disable_pipes('parser', 'ner'):
    #Iterate through each doc object and keep only the tokens tagged PROPN
    for doc in nlp.pipe(enriched_df.Clean_Text.astype('unicode').values, batch_size=100):
        #Store the token text, not the Token object: a Token keeps a reference
        #to its parent Doc, so holding Tokens in this notebook-level list would
        #keep every parsed essay in memory. The text is exactly what str(token)
        #produced before, so the resulting column is unchanged.
        propnoun_list.append([token.text for token in doc if token.pos_ == 'PROPN'])

#Make the proper nouns a new column in the DataFrame,
#one comma-separated string per essay
enriched_df['Proper_Nouns'] = [', '.join(nouns) for nouns in propnoun_list]

#Check proper noun tags
enriched_df.head()

Unnamed: 0,ID,Clean_Text,POS_Text,Proper_Nouns
0,"Score: 92.0, ID: 73.txt",aysha shaukat professor kane eng april fina...,PROPN PROPN PROPN PROPN PROPN SPACE PROPN SPAC...,"aysha, shaukat, professor, kane, eng, april, t..."
1,"Score: 93.0, ID: 96.txt",abigail sensenig professor stefan analytical r...,PROPN PROPN PROPN PROPN ADJ NOUN CCONJ VERB SP...,"abigail, sensenig, professor, stefan, septembe..."
2,"Score: 86.0, ID: 76.txt",amaya whipple professor megan kane eng febru...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"amaya, whipple, professor, megan, kane, eng, f..."
3,"Score: 86.0, ID: 3.txt",fallt sela fallt professor kane eng final...,PROPN SPACE PROPN PROPN PROPN PROPN PROPN SPAC...,"fallt, sela, fallt, professor, kane, eng, ross..."
4,"Score: 95.0, ID: 63.txt",maya king professor megan kane eng september...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"maya, king, professor, megan, kane, eng, septe..."


## Named Entities

In [15]:
#Get named entities for every essay in Clean_Text
ent_list = []

#Tagger and parser are disabled while the docs are processed; only doc.ents is read
with nlp.disable_pipes('tagger', 'parser'):
    for doc in nlp.pipe(enriched_df.Clean_Text.astype('unicode').values, batch_size=100):
        #Store the entity text, not the Span objects from doc.ents: a Span keeps
        #a reference to its parent Doc, so holding Spans in this notebook-level
        #list would keep every parsed essay in memory. The text is exactly what
        #str(span) produced before, so the resulting column is unchanged.
        ent_list.append([ent.text for ent in doc.ents])

#Put NEs in a new column in dataframe, one space-separated string per essay
#NOTE(review): multi-word entities are joined with the same separator as the
#entities themselves, so entity boundaries are not recoverable from this column
enriched_df['NER_Text'] = [' '.join(ents) for ents in ent_list]

#Check named entities
enriched_df.head()




Unnamed: 0,ID,Clean_Text,POS_Text,Proper_Nouns,NER_Text
0,"Score: 92.0, ID: 73.txt",aysha shaukat professor kane eng april fina...,PROPN PROPN PROPN PROPN PROPN SPACE PROPN SPAC...,"aysha, shaukat, professor, kane, eng, april, t...",kane eng april first second three two three...
1,"Score: 93.0, ID: 96.txt",abigail sensenig professor stefan analytical r...,PROPN PROPN PROPN PROPN ADJ NOUN CCONJ VERB SP...,"abigail, sensenig, professor, stefan, septembe...",january daily anna orso s billypenn three phil...
2,"Score: 86.0, ID: 76.txt",amaya whipple professor megan kane eng febru...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"amaya, whipple, professor, megan, kane, eng, f...",amaya megan kane eng february one megan kan...
3,"Score: 86.0, ID: 3.txt",fallt sela fallt professor kane eng final...,PROPN SPACE PROPN PROPN PROPN PROPN PROPN SPAC...,"fallt, sela, fallt, professor, kane, eng, ross...",fallt sela fallt kane eng this year the year ...
4,"Score: 95.0, ID: 63.txt",maya king professor megan kane eng september...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"maya, king, professor, megan, kane, eng, septe...",maya king megan kane eng september the anna...


In [16]:
#Get Named Entity words
#NOTE(review): this cell computes exactly the same values as the NER_Text column
#(same pipeline settings, same doc.ents, same join), so NER_Words duplicates
#NER_Text. Confirm whether something different was intended here (for example
#entity labels); the behaviour is kept as-is until that is decided.
ent_w_list = []

#Tagger and parser are disabled while the docs are processed; only doc.ents is read
with nlp.disable_pipes('tagger', 'parser'):
    for doc in nlp.pipe(enriched_df.Clean_Text.astype('unicode').values, batch_size=100):
        #Store plain strings instead of Span objects so the parsed Docs are not
        #kept alive by this notebook-level list; str(span) and span.text are
        #the same text, so the resulting column is unchanged.
        ent_w_list.append([ent.text for ent in doc.ents])

#One space-separated string of entity words per essay
enriched_df['NER_Words'] = [' '.join(words) for words in ent_w_list]


#Check named entities
enriched_df.head()



Unnamed: 0,ID,Clean_Text,POS_Text,Proper_Nouns,NER_Text,NER_Words
0,"Score: 92.0, ID: 73.txt",aysha shaukat professor kane eng april fina...,PROPN PROPN PROPN PROPN PROPN SPACE PROPN SPAC...,"aysha, shaukat, professor, kane, eng, april, t...",kane eng april first second three two three...,kane eng april first second three two three...
1,"Score: 93.0, ID: 96.txt",abigail sensenig professor stefan analytical r...,PROPN PROPN PROPN PROPN ADJ NOUN CCONJ VERB SP...,"abigail, sensenig, professor, stefan, septembe...",january daily anna orso s billypenn three phil...,january daily anna orso s billypenn three phil...
2,"Score: 86.0, ID: 76.txt",amaya whipple professor megan kane eng febru...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"amaya, whipple, professor, megan, kane, eng, f...",amaya megan kane eng february one megan kan...,amaya megan kane eng february one megan kan...
3,"Score: 86.0, ID: 3.txt",fallt sela fallt professor kane eng final...,PROPN SPACE PROPN PROPN PROPN PROPN PROPN SPAC...,"fallt, sela, fallt, professor, kane, eng, ross...",fallt sela fallt kane eng this year the year ...,fallt sela fallt kane eng this year the year ...
4,"Score: 95.0, ID: 63.txt",maya king professor megan kane eng september...,PROPN PROPN PROPN PROPN PROPN PROPN SPACE PROP...,"maya, king, professor, megan, kane, eng, septe...",maya king megan kane eng september the anna...,maya king megan kane eng september the anna...


In [18]:
#Write the enriched DataFrame to a CSV file in the current working directory
#(the DataFrame index is written as the first column, as before)
enriched_df.to_csv(path_or_buf='enriched_texts.csv')