## Setup

In [None]:
#Import os and glob (standard-library modules for paths and file listing)
import glob 
import os

#Import pandas (dataframes)
import pandas as pd

#Import numpy
import numpy as np

#Import the Natural Language Toolkit (NLTK); uncomment the pip line below if NLTK is not installed yet
#!pip install nltk
import nltk

#Download the 'punkt' tokenizer data and import the tokenizing tools
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from  nltk.text import ConcordanceIndex

#Download the stopword lists and import the tools used to clean text
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
#Show the directory the notebook is currently running in
print(os.getcwd())

#Change working directory to the folder that holds the cleaned essay files.
#os.chdir() returns None, so the folder is stored in `path` first and then
#passed to chdir (assigning the call's result would leave `path` as None).
path = "/Users/megankane/Desktop/clean_texts"
os.chdir(path)

In [None]:
#Append all txt files to pandas dataframe

#Folder to scan. `path` is None when it holds the return value of os.chdir(),
#so fall back to the current working directory explicitly in that case.
text_dir = path if path else os.getcwd()

#List regular files in sorted order so the dataframe rows are reproducible
#(os.listdir returns entries in arbitrary order). Paths are joined with the
#folder so the check does not depend on what the working directory is.
files = sorted(f for f in os.listdir(text_dir) if os.path.isfile(os.path.join(text_dir, f)))

#Make list for filenames and texts
filenames = []
data = []
for f in files:
    if f.endswith('.txt'):
        #Read raw bytes; they are decoded to str in the basic-cleaning step
        with open(os.path.join(text_dir, f), "rb") as myfile:
            filenames.append(f)
            data.append(myfile.read())

#One row per file: ID is the file name, Text is the file's raw content
d = {'ID': filenames, 'Text': data}

essays = pd.DataFrame(d)
essays

## Basic Cleaning

In [None]:
#Decode the raw bytes in the Text column to str.
#'utf-8-sig' also strips a leading byte-order mark (b'\xef\xbb\xbf'); plain
#'utf-8' would decode it to a '\ufeff' character and leave it in the text.
def _to_text(raw):
    #Values that are already str (e.g. when this cell is re-run) pass through unchanged
    if isinstance(raw, bytes):
        return raw.decode('utf-8-sig', errors='ignore')
    return raw

essays['Text'] = essays['Text'].apply(_to_text)
essays['Text'] = essays['Text'].astype(str)

#Keep a copy that still has its line breaks (will need to split paragraphs later)
essays['Text_Newlines'] = essays['Text']

#Replace whitespace runs (including real newlines) and literal "\r" / "\n"
#character sequences with a single space
essays['Text'] = essays['Text'].str.replace(r'\s+|\\r', ' ', regex=True)
essays['Text'] = essays['Text'].str.replace(r'\s+|\\n', ' ', regex=True)
essays.head()

In [None]:
#Make sure every value in the Text column is a string
text_as_str = essays['Text'].astype(str)
essays['Text'] = text_as_str

In [None]:
#Store a lowercased copy of the text in its own column
lowered = essays['Text'].str.lower()
essays['Lower_Text'] = lowered
essays.head()

In [None]:
#Remove punctuation
import re  #required by re.compile below; the visible setup cell does not import it

#Every run of characters that are neither word characters nor whitespace becomes one space
p = re.compile(r'[^\w\s]+')
essays['NoPunct_Text'] = [p.sub(' ', x) for x in essays['Lower_Text'].tolist()]
essays.head()

In [None]:
#Collapse every run of two or more spaces into a single space (regular expression)
collapsed = essays['NoPunct_Text'].str.replace('  +', ' ', regex=True)
essays['NoPunct_Text'] = collapsed
essays.head()

In [None]:
#Remove numbers and extraneous characters
#Raw string so '\d' reaches the regex engine without an invalid-escape warning
essays['Clean_Text'] = essays['NoPunct_Text'].str.replace(r'\d+', '', regex=True)
#Underscores count as word characters, so the punctuation step leaves them in;
#remove them as a literal (regex=False makes this independent of the pandas default)
essays['Clean_Text'] = essays['Clean_Text'].str.replace('_', '', regex=False)
essays


In [None]:
#Remove stopwords
stop_words = set(stopwords.words("english"))

def _drop_stopwords(text):
    #Split on whitespace, keep the tokens that are not English stopwords, re-join with single spaces
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)

essays['Text_NoStops'] = essays['Clean_Text'].apply(_drop_stopwords)
essays

In [None]:
#Write the cleaned dataframe to a CSV file in the working directory (row index omitted)
output_file = 'cleaned_essays.csv'
essays.to_csv(output_file, index=False)