# Enriching Student Essays
Uses spaCy to enrich the cleaned student essays with lemmatization, part-of-speech tagging, and named entity recognition.

## Setup

In [None]:
#Import os and glob
import glob 
import os

#Import pandas
import pandas as pd

#Import numpy
import numpy as np

#Imports the Natural Language Toolkit, which is necessary to install NLTK packages and libraries
#!pip install nltk
import nltk

#Import matplotlib for visualizations
import matplotlib.pyplot as plt

#Imports spaCy itself, necessary to use features 
#!pip install spaCy
import spacy
#Load the natural language processing pipeline
nlp = spacy.load("en_core_web_sm")
#Load spaCy visualizer
from spacy import displacy

from scipy import stats

import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim

In [None]:
# Show the current working directory before changing it
print(os.getcwd())

# Change working directory to the folder that holds the cleaned essays.
# os.chdir() returns None, so the target directory is kept in `path` itself
# instead of assigning the call's result to it.
path = "/Users/megankane/Desktop/clean_texts"
os.chdir(path)

In [None]:
# Load the cleaned student essays (CSV in the current working directory) into a dataframe
essays_csv = r'cleaned_essays.csv'
df = pd.read_csv(essays_csv)

# Preview the first rows as the cell output
df.head()

In [None]:
# Build a separate dataframe for enrichment from the ID and cleaned-text columns;
# .copy() makes it independent of df, so adding columns to it leaves df untouched
enrichment_columns = ['ID', 'Clean_Text']
enriched_df = df[enrichment_columns].copy()

## Lemmatization

In [None]:
# Get lemmas: one list of lemma strings per essay
# The cleaned texts are cast to strings before being handed to spaCy
essay_texts = enriched_df.Clean_Text.astype('unicode').values

# Only lemmas are read in this pass, so the dependency parser and NER are disabled
with nlp.disable_pipes('parser', 'ner'):
    lemma_list = [
        [token.lemma_ for token in doc]
        for doc in nlp.pipe(essay_texts, batch_size=100)
    ]

# Put lemmas in a new column, each essay as a single space-separated string
enriched_df['Lemma_Text'] = [' '.join(map(str, lemmas)) for lemmas in lemma_list]

# Check lemmas
enriched_df.head()

## Part of Speech Tagging

In [None]:
# Get part of speech tags: one list of POS tags per essay
# The cleaned texts are cast to strings before being handed to spaCy
pos_input = enriched_df.Clean_Text.astype('unicode').values

# Only POS tags are read in this pass, so the dependency parser and NER are disabled
with nlp.disable_pipes('parser', 'ner'):
    pos_list = [
        [token.pos_ for token in doc]
        for doc in nlp.pipe(pos_input, batch_size=100)
    ]

# Put POS tags in a new column, each essay as a single space-separated string of tags
enriched_df['POS_Text'] = [' '.join(map(str, tags)) for tags in pos_list]

# Check POS tags
enriched_df.head()


In [None]:
# Get a specific subset of part of speech tags: the proper nouns in each essay
# The cleaned texts are cast to strings before being handed to spaCy
propn_input = enriched_df.Clean_Text.astype('unicode').values

# Only POS tags are needed to pick out proper nouns, so the parser and NER are disabled
with nlp.disable_pipes('parser', 'ner'):
    # Keep each token's text rather than the Token object: a Token references its
    # parent Doc, so a list of Tokens would hold every parsed essay in memory
    propnoun_list = [
        [token.text for token in doc if token.pos_ == 'PROPN']
        for doc in nlp.pipe(propn_input, batch_size=100)
    ]

# Make the proper nouns a new column, each essay as a comma-separated string
enriched_df['Proper_Nouns'] = [', '.join(words) for words in propnoun_list]

# Check proper noun tags
enriched_df.head()

## Named Entities

In [None]:
# Get named entity labels: one list of labels (e.g. PERSON, ORG) per essay
# NER_Text holds the label of each entity, parallel to POS_Text holding the tag of
# each token; the entity words themselves are stored separately in NER_Words.
# NOTE(review): this cell used to store the entity text, which made NER_Text an exact
# duplicate of NER_Words -- confirm that labels are what NER_Text was meant to hold.
ner_input = enriched_df.Clean_Text.astype('unicode').values

# The tagger and dependency parser are disabled for the entity pass
with nlp.disable_pipes('tagger', 'parser'):
    # Keep plain label strings rather than Span objects, which reference their Doc
    ent_list = [
        [ent.label_ for ent in doc.ents]
        for doc in nlp.pipe(ner_input, batch_size=100)
    ]

# Put the entity labels in a new column, each essay as a space-separated string
enriched_df['NER_Text'] = [' '.join(labels) for labels in ent_list]

# Check named entities
enriched_df.head()


In [None]:
# Get Named Entity words: the text of every entity spaCy finds in each essay
# The cleaned texts are cast to strings before being handed to spaCy
ner_words_input = enriched_df.Clean_Text.astype('unicode').values

# The tagger and dependency parser are disabled for the entity pass
with nlp.disable_pipes('tagger', 'parser'):
    # Keep each entity's text rather than the Span objects in doc.ents: a Span
    # references its parent Doc, so storing Spans would hold every parsed essay in memory
    ent_w_list = [
        [ent.text for ent in doc.ents]
        for doc in nlp.pipe(ner_words_input, batch_size=100)
    ]

# Put the entity words in a new column, each essay as a space-separated string
# NOTE(review): entities are joined with single spaces, so the boundary between two
# adjacent multi-word entities cannot be recovered from this column
enriched_df['NER_Words'] = [' '.join(words) for words in ent_w_list]

# Check named entities
enriched_df.head()

In [None]:
# Write the enriched dataframe to a CSV file in the current working directory
# index=True is the pandas default, spelled out here: the row index is written as the first column
enriched_df.to_csv('enriched_texts.csv', index=True)