## Setup

In [1]:
#Install os and glob
import glob 
import os

#Install pandas
import pandas as pd

#Install numpy
import numpy as np

#Imports the Natural Language Toolkit, which is necessary to install NLTK packages and libraries
#!pip install nltk
import nltk

#Installs libraries and packages to tokenize text
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from  nltk.text import ConcordanceIndex

#Installs libraries and packages to clean text
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#Import matplotlib for visualizations
import matplotlib.pyplot as plt


#Imports spaCy itself, necessary to use features 
#!pip install spaCy
import spacy
#Load the natural language processing pipeline
nlp = spacy.load("en_core_web_sm")
#Load spaCy visualizer
from spacy import displacy

from scipy import stats

import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim

[nltk_data] Downloading package punkt to /Users/megankane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/megankane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
##Get current working directory 
path = os.getcwd()
print(path)

#Change working directory
path = os.chdir("/Users/megankane/Desktop/clean_texts")

/Users/megankane/Downloads


In [19]:
#Append all txt files to pandas dataframe

#Make list for filenames and texts
filenames = []
data = []
files = [f for f in os.listdir(path) if os.path.isfile(f)]
for f in files:
    if f.endswith('.txt'):
        with open (f, "rb") as myfile:
            filenames.append(myfile.name)
            data.append(myfile.read())
d = {'ID':filenames,'Text':data}
        
essays = pd.DataFrame(d)
essays

Unnamed: 0,ID,Text
0,"Score: 92.0, ID: 73.txt",b'Aysha Shaukat Professor Kane ENG 802 21 Apri...
1,"Score: 93.0, ID: 96.txt",b'Abigail Sensenig Professor Stefan Analytical...
2,"Score: 86.0, ID: 76.txt",b'Amaya Whipple Professor Megan Kane ENG 802 9...
3,"Score: 86.0, ID: 3.txt",b'Fallt 1 Sela Fallt Professor Kane ENG-0802 1...
4,"Score: 95.0, ID: 63.txt",b'Maya King Professor Megan Kane ENG 802 28 Se...
...,...,...
142,"Score: 72.0, ID: 78.txt","b""Ella Campbell SaraGrace Stefan 12/07/2020 An..."
143,"Score: 86.0, ID: 125.txt","b""Anna Coomans Professor Megan Kane Analytical..."
144,"Score: 86.0, ID: 131.txt",b'Dylan Muthersbaugh Analytical Reading & Writ...
145,"Score: 75.0, ID: 83.txt","b""Hailey DiFrancesco Professor SaraGrace Stefa..."


## Basic Cleaning

In [20]:
#Remove encoding characters from Text column (b'\xef\xbb\xbf)
essays['Text'] = essays['Text'].apply(lambda x: x.decode('utf-8', errors='ignore'))
essays['Text'] = essays['Text'].astype(str)

#Remove newline characters and put in new column (will need to split paragraphs later)
essays['Text_Newlines'] = essays['Text']
essays['Text'] = essays['Text'].str.replace(r'\s+|\\r', ' ', regex=True) 
essays['Text'] = essays['Text'].str.replace(r'\s+|\\n', ' ', regex=True) 
essays.head()

Unnamed: 0,ID,Text,Text_Newlines
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...


In [21]:
#Change text type to strings
essays['Text'] = essays['Text'].astype(str)

In [22]:
#Lowercase all words
essays['Lower_Text'] = essays['Text'].str.lower()
essays.head()

Unnamed: 0,ID,Text,Text_Newlines,Lower_Text
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...,aysha shaukat professor kane eng 802 21 april ...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...,amaya whipple professor megan kane eng 802 9 f...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,fallt 1 sela fallt professor kane eng-0802 12-...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...,maya king professor megan kane eng 802 28 sept...
...,...,...,...,...
142,"Score: 72.0, ID: 78.txt",Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,ella campbell saragrace stefan 12/07/2020 anal...
143,"Score: 86.0, ID: 125.txt",Anna Coomans Professor Megan Kane Analytical R...,Anna Coomans Professor Megan Kane Analytical R...,anna coomans professor megan kane analytical r...
144,"Score: 86.0, ID: 131.txt",Dylan Muthersbaugh Analytical Reading & Writin...,Dylan Muthersbaugh Analytical Reading & Writin...,dylan muthersbaugh analytical reading & writin...
145,"Score: 75.0, ID: 83.txt",Hailey DiFrancesco Professor SaraGrace Stefan ...,Hailey DiFrancesco Professor SaraGrace Stefan ...,hailey difrancesco professor saragrace stefan ...


In [23]:
#Remove punctuation
p = re.compile(r'[^\w\s]+')
essays['NoPunct_Text'] = [p.sub(' ', x) for x in essays['Lower_Text'].tolist()]
essays.head()

Unnamed: 0,ID,Text,Text_Newlines,Lower_Text,NoPunct_Text
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng 802 21 april ...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng 802 9 f...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,fallt 1 sela fallt professor kane eng-0802 12-...,fallt 1 sela fallt professor kane eng 0802 12 ...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng 802 28 sept...
...,...,...,...,...,...
142,"Score: 72.0, ID: 78.txt",Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,ella campbell saragrace stefan 12/07/2020 anal...,ella campbell saragrace stefan 12 07 2020 anal...
143,"Score: 86.0, ID: 125.txt",Anna Coomans Professor Megan Kane Analytical R...,Anna Coomans Professor Megan Kane Analytical R...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...
144,"Score: 86.0, ID: 131.txt",Dylan Muthersbaugh Analytical Reading & Writin...,Dylan Muthersbaugh Analytical Reading & Writin...,dylan muthersbaugh analytical reading & writin...,dylan muthersbaugh analytical reading writin...
145,"Score: 75.0, ID: 83.txt",Hailey DiFrancesco Professor SaraGrace Stefan ...,Hailey DiFrancesco Professor SaraGrace Stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...


In [26]:
#Remove extraneous whitespace using regular expressions
essays['NoPunct_Text'] = essays['NoPunct_Text'] .str.replace('  +', ' ', regex=True)
essays.head()

Unnamed: 0,ID,Text,Text_Newlines,Lower_Text,NoPunct_Text
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng 802 21 april ...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng 802 9 f...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,fallt 1 sela fallt professor kane eng-0802 12-...,fallt 1 sela fallt professor kane eng 0802 12 ...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng 802 28 sept...
...,...,...,...,...,...
142,"Score: 72.0, ID: 78.txt",Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,ella campbell saragrace stefan 12/07/2020 anal...,ella campbell saragrace stefan 12 07 2020 anal...
143,"Score: 86.0, ID: 125.txt",Anna Coomans Professor Megan Kane Analytical R...,Anna Coomans Professor Megan Kane Analytical R...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...
144,"Score: 86.0, ID: 131.txt",Dylan Muthersbaugh Analytical Reading & Writin...,Dylan Muthersbaugh Analytical Reading & Writin...,dylan muthersbaugh analytical reading & writin...,dylan muthersbaugh analytical reading writing ...
145,"Score: 75.0, ID: 83.txt",Hailey DiFrancesco Professor SaraGrace Stefan ...,Hailey DiFrancesco Professor SaraGrace Stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...


In [27]:
#Remove numbers and extraneous characters
essays['Clean_Text'] = essays['NoPunct_Text'] .str.replace('\d+', '', regex=True)
essays['Clean_Text'] = essays['Clean_Text'] .str.replace('_', '')
essays


Unnamed: 0,ID,Text,Text_Newlines,Lower_Text,NoPunct_Text,Clean_Text
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng april fina...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng febru...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,fallt 1 sela fallt professor kane eng-0802 12-...,fallt 1 sela fallt professor kane eng 0802 12 ...,fallt sela fallt professor kane eng final...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng september...
...,...,...,...,...,...,...
142,"Score: 72.0, ID: 78.txt",Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,ella campbell saragrace stefan 12/07/2020 anal...,ella campbell saragrace stefan 12 07 2020 anal...,ella campbell saragrace stefan analytical r...
143,"Score: 86.0, ID: 125.txt",Anna Coomans Professor Megan Kane Analytical R...,Anna Coomans Professor Megan Kane Analytical R...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...
144,"Score: 86.0, ID: 131.txt",Dylan Muthersbaugh Analytical Reading & Writin...,Dylan Muthersbaugh Analytical Reading & Writin...,dylan muthersbaugh analytical reading & writin...,dylan muthersbaugh analytical reading writing ...,dylan muthersbaugh analytical reading writing ...
145,"Score: 75.0, ID: 83.txt",Hailey DiFrancesco Professor SaraGrace Stefan ...,Hailey DiFrancesco Professor SaraGrace Stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...


In [28]:
#Remove stopwords
stop_words = set(stopwords.words("english"))
essays['Text_NoStops'] = essays['Clean_Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
essays

Unnamed: 0,ID,Text,Text_Newlines,Lower_Text,NoPunct_Text,Clean_Text,Text_NoStops
0,"Score: 92.0, ID: 73.txt",Aysha Shaukat Professor Kane ENG 802 21 April ...,Aysha Shaukat Professor Kane ENG 802 21 April ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng 802 21 april ...,aysha shaukat professor kane eng april fina...,aysha shaukat professor kane eng april final r...
1,"Score: 93.0, ID: 96.txt",Abigail Sensenig Professor Stefan Analytical R...,Abigail Sensenig Professor Stefan Analytical R...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...,abigail sensenig professor stefan analytical r...
2,"Score: 86.0, ID: 76.txt",Amaya Whipple Professor Megan Kane ENG 802 9 F...,Amaya Whipple Professor Megan Kane ENG 802 9 F...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng 802 9 f...,amaya whipple professor megan kane eng febru...,amaya whipple professor megan kane eng februar...
3,"Score: 86.0, ID: 3.txt",Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,fallt 1 sela fallt professor kane eng-0802 12-...,fallt 1 sela fallt professor kane eng 0802 12 ...,fallt sela fallt professor kane eng final...,fallt sela fallt professor kane eng final refl...
4,"Score: 95.0, ID: 63.txt",Maya King Professor Megan Kane ENG 802 28 Sept...,Maya King Professor Megan Kane ENG 802 28 Sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng 802 28 sept...,maya king professor megan kane eng september...,maya king professor megan kane eng september f...
...,...,...,...,...,...,...,...
142,"Score: 72.0, ID: 78.txt",Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,Ella Campbell SaraGrace Stefan 12/07/2020 Anal...,ella campbell saragrace stefan 12/07/2020 anal...,ella campbell saragrace stefan 12 07 2020 anal...,ella campbell saragrace stefan analytical r...,ella campbell saragrace stefan analytical read...
143,"Score: 86.0, ID: 125.txt",Anna Coomans Professor Megan Kane Analytical R...,Anna Coomans Professor Megan Kane Analytical R...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...,anna coomans professor megan kane analytical r...
144,"Score: 86.0, ID: 131.txt",Dylan Muthersbaugh Analytical Reading & Writin...,Dylan Muthersbaugh Analytical Reading & Writin...,dylan muthersbaugh analytical reading & writin...,dylan muthersbaugh analytical reading writing ...,dylan muthersbaugh analytical reading writing ...,dylan muthersbaugh analytical reading writing ...
145,"Score: 75.0, ID: 83.txt",Hailey DiFrancesco Professor SaraGrace Stefan ...,Hailey DiFrancesco Professor SaraGrace Stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...,hailey difrancesco professor saragrace stefan ...


In [30]:
#Save cleaned dataframe to working directory
essays.to_csv('cleaned_essays.csv', index=False)