# Cleaning and Segmenting Student Essays

Perform basic cleaning measures (lowercasing, punctuation removal, whitespace removal, stopword removal) and segment full texts into paragraphs and sentences for further analysis. 

## Setup

In [None]:
#Import os and glob (file-system helpers from the standard library)
import glob 
import os

#Import pandas
import pandas as pd

#Import numpy
import numpy as np

#Import the Natural Language Toolkit (uncomment the pip line below if NLTK is not installed yet)
#!pip install nltk
import nltk

#Import regex
import re

#Download the 'punkt' tokenizer data and import tokenizing tools
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from  nltk.text import ConcordanceIndex

#Download the stopword lists and import text-cleaning tools
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
#Show the directory the notebook started in
print(os.getcwd())

#Change working directory to the folder that holds the essay .txt files.
#os.chdir() returns None, so the folder name is kept in `path` explicitly
#instead of assigning the call's result.
path = "/Users/megankane/Desktop/Texts"
os.chdir(path)

In [None]:
#Read every .txt file in the folder into a pandas dataframe

#Parallel lists: one filename and one raw (bytes) text per essay
filenames = []
data = []
for entry in os.listdir(path):
    #Skip sub-directories and anything that is not a .txt file
    if not (os.path.isfile(entry) and entry.endswith('.txt')):
        continue
    #Binary mode: the bytes are decoded in a later cell
    with open(entry, "rb") as handle:
        filenames.append(handle.name)
        data.append(handle.read())

#One row per essay: ID is the file name, Text is the raw content
essays = pd.DataFrame({'ID': filenames, 'Text': data})
essays

## Clean Full Essays

In [None]:
#Decode the raw bytes in the Text column into str.
#'utf-8-sig' strips a leading byte-order mark (b'\xef\xbb\xbf) when one is present;
#plain 'utf-8' would keep it in the text as the invisible character '\ufeff'.
#Undecodable bytes are still skipped (errors='ignore').
essays['Text'] = essays['Text'].apply(lambda raw: raw.decode('utf-8-sig', errors='ignore'))
essays['Text'] = essays['Text'].astype(str)

essays.head()

In [None]:
#Keep an untouched copy with the line breaks (needed later to split paragraphs)
essays['Text_Newlines'] = essays['Text']

#Flatten Text in two passes, each replacing whitespace runs with one space:
#the first pass also replaces a literal backslash-r, the second a literal backslash-n
for pattern in (r'\s+|\\r', r'\s+|\\n'):
    essays['Text'] = essays['Text'].str.replace(pattern, ' ', regex=True)
essays

In [None]:
#Add a lowercased copy of every essay as Lower_Text
lowered = essays['Text'].str.lower()
essays['Lower_Text'] = lowered
essays.head()

In [None]:
#Replace each run of punctuation (anything that is neither a word character
#nor whitespace) with a single space
p = re.compile(r'[^\w\s]+')
essays['NoPunct_Text'] = essays['Lower_Text'].map(lambda text: p.sub(' ', text))
essays.head()

In [None]:
#Collapse runs of two or more spaces into one space
collapsed = essays['NoPunct_Text'].str.replace('  +', ' ', regex=True)
essays['NoPunct_Text'] = collapsed
essays.head()

In [None]:
#Remove numbers and underscores.
#The digit pattern is a raw string: '\d' in a normal string is an invalid
#escape sequence and raises a warning on recent Python versions.
cleaned = essays['NoPunct_Text'].str.replace(r'\d+', '', regex=True)
#The underscore is a literal character, so no regex is needed
cleaned = cleaned.str.replace('_', '', regex=False)
#Deleting a token that stood between two spaces leaves a double space behind,
#so collapse space runs again and trim the ends
essays['Clean_Text'] = cleaned.str.replace(r' {2,}', ' ', regex=True).str.strip()
essays

In [None]:
#Remove stopwords: keep only the words not in NLTK's English stopword list
stop_words = set(stopwords.words("english"))


def _without_stopwords(text):
    #Split on whitespace, filter, and re-join with single spaces
    kept = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept)


essays['Text_NoStops'] = essays['Clean_Text'].apply(_without_stopwords)
essays

In [None]:
#Write the cleaned essays to a CSV in the working directory (row index omitted)
output_csv = 'cleaned_full_essays.csv'
essays.to_csv(output_csv, index=False)

## Paragraph Segmentation and Cleaning
Segment full texts into paragraphs by splitting on each newline character and performing basic cleaning procedures.

In [None]:
#Paragraph splitting only needs the essay ID and the text that still has its line breaks
paragraphs_df = essays.loc[:, ['ID', 'Text_Newlines']].copy()

#Preview the new dataframe
paragraphs_df.head()

In [None]:
#Count number of paragraphs in each text.
#A paragraph is taken to be a line holding at least one non-whitespace character.
#Counting '\n' characters instead is off by one when the text has no trailing
#newline and counts the blank lines between paragraphs as paragraphs.
paragraph_counts = paragraphs_df['Text_Newlines'].str.count(r'(?m)^.*\S.*$')

#Append paragraph counts to dataframe
paragraphs_df["Paragraph_Counts"] = paragraph_counts
paragraphs_df

In [None]:
#Make a new cell each time a new line starts: one row per essay, one column per line.
#The files were read in binary mode, so Windows line endings are still '\r\n';
#matching the optional '\r' keeps it from trailing on every paragraph.
new = paragraphs_df["Text_Newlines"].str.split(r'\r?\n', expand = True).set_index(paragraphs_df['ID'])

#Flatten the dataframe so each paragraph is on its own row, labelled by essay ID
#and the paragraph's position within the essay.
#dropna() removes the empty padding cells of shorter essays explicitly
#(older pandas drops them inside stack(); newer versions may keep them).
paragraphs_df = new.stack().dropna().reset_index()
paragraphs_df.columns = ["ID", "Paragraph", "Text"]
paragraphs_df

In [None]:
##Filter out paragraphs with fewer than 10 words (headers, titles, stray lines)
word_counts = paragraphs_df['Text'].str.split().str.len()
paragraphs_df = paragraphs_df[~word_counts.lt(10)]

## Filter out paragraphs containing URL or citation markers (Works Cited citations).
## regex=False makes each marker a literal substring. Treated as regular expressions,
## the dots in "www.", ".com/" and "Vol." match any character, so a paragraph that
## merely contains "Volume" or "Volunteers" would be dropped.
citation_markers = ["http://", "https://", "://www", "www.", ".com/", "Vol.", "doi:"]
for marker in citation_markers:
    paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains(marker, regex=False)]

#Detach from the unfiltered frame so that adding columns afterwards does not
#raise SettingWithCopyWarning
paragraphs_df = paragraphs_df.copy()

paragraphs_df

In [None]:
#Perform other basic cleaning procedures

#Work on an independent copy so the column assignments below never write into a filtered view
paragraphs_df = paragraphs_df.copy()

#Lowercase all words
paragraphs_df['Lower_Text'] = paragraphs_df['Text'].str.lower()

#Remove punctuation: each run of non-word, non-whitespace characters becomes one space
p = re.compile(r'[^\w\s]+')
paragraphs_df['NoPunct_Text'] = [p.sub(' ', x) for x in paragraphs_df['Lower_Text'].tolist()]

#Remove extraneous whitespace using regular expressions
paragraphs_df['NoPunct_Text'] = paragraphs_df['NoPunct_Text'].str.replace('  +', ' ', regex=True)

#Remove numbers and underscores.
#The digit pattern is a raw string: '\d' in a normal string is an invalid escape sequence.
cleaned = paragraphs_df['NoPunct_Text'].str.replace(r'\d+', '', regex=True)
cleaned = cleaned.str.replace('_', '', regex=False)
#Deleting a token that stood between two spaces leaves a double space behind,
#so collapse space runs again and trim the ends
paragraphs_df['Clean_Text'] = cleaned.str.replace(r' {2,}', ' ', regex=True).str.strip()

#Remove stopwords
stop_words = set(stopwords.words("english"))
paragraphs_df['Text_NoStops'] = paragraphs_df['Clean_Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
paragraphs_df

In [None]:
#Build a combined key "<ID>_<Paragraph>" for every row
id_part = paragraphs_df['ID'].astype(str)
paragraph_part = paragraphs_df['Paragraph'].astype(str)
paragraphs_df['ID_Paragraph'] = id_part + '_' + paragraph_part

#Save the key and the original paragraph text to csv
paragraphs_df_download = paragraphs_df.loc[:, ['ID_Paragraph', 'Text']].copy()
paragraphs_df_download.to_csv('all_paragraphs.csv')

In [None]:
#Save each cleaned paragraph as its own txt file

#Cleaned paragraph texts and their matching file names, in row order
paragraphs = [str(text) for text in paragraphs_df['Clean_Text']]
filenames = [str(name) for name in paragraphs_df['ID_Paragraph']]

#Make the output directory; exist_ok lets the cell be re-run without an error
os.makedirs("paragraphs_df", exist_ok=True)

#Write texts to files.
#`with` closes every file even if a write fails, and the explicit encoding keeps
#non-ASCII characters from failing on platforms whose default is not UTF-8.
for name, text in zip(filenames, paragraphs):
    with open(os.path.join("paragraphs_df", name + '.txt'), 'w', encoding='utf-8') as f:
        f.write(text)
  

## Sentence Segmentation and Cleaning
Segment paragraphs into sentences by splitting on sentence-ending punctuation (`.`, `!`, `?`) and performing basic cleaning procedures.

In [None]:
#Start the sentence-level dataframe from a copy of the paragraph dataframe
sentences_df = paragraphs_df.copy()

#Count the runs of sentence-ending punctuation (. ! ?) in each paragraph
#and store the counts as a new column
sentences_df["Sentence_Counts"] = sentences_df['Text'].str.count(r'[.!?]+')
sentence_counts = sentences_df["Sentence_Counts"]
sentences_df.head()

In [None]:
#Make a new cell each time a new sentence starts (split at runs of . ! or ?).
#Rows are keyed by ID_Paragraph rather than the bare essay ID: sentence numbers
#restart at 0 in every paragraph, so keying by essay alone gives sentences from
#different paragraphs the same ID and their output files overwrite one another.
new = sentences_df['Text'].str.split(r'[.!?]+', expand = True).set_index(sentences_df['ID_Paragraph'])

#Flatten the dataframe so each sentence is on its own row. "ID" now holds
#"<essay ID>_<paragraph>" and "Sentence" the position within that paragraph.
#dropna() removes the empty padding cells of shorter paragraphs explicitly.
sentences_df = new.stack().dropna().reset_index()
sentences_df.columns = ["ID", "Sentence", "Text"]

#Drop blank fragments, e.g. the empty string left after a paragraph's final period
sentences_df = sentences_df[sentences_df['Text'].str.strip().ne('')].reset_index(drop=True)

sentences_df

In [None]:
#Perform basic cleaning procedures

#Lowercase all words
sentences_df['Lower_Text'] = sentences_df['Text'].str.lower()

#Remove punctuation: each run of non-word, non-whitespace characters becomes one space
p = re.compile(r'[^\w\s]+')
sentences_df['NoPunct_Text'] = [p.sub(' ', x) for x in sentences_df['Lower_Text'].tolist()]

#Remove extraneous whitespace using regular expressions
sentences_df['NoPunct_Text'] = sentences_df['NoPunct_Text'].str.replace('  +', ' ', regex=True)

#Remove numbers and underscores.
#The digit pattern is a raw string: '\d' in a normal string is an invalid escape sequence.
cleaned = sentences_df['NoPunct_Text'].str.replace(r'\d+', '', regex=True)
cleaned = cleaned.str.replace('_', '', regex=False)
#Deleting a token that stood between two spaces leaves a double space behind,
#so collapse space runs again and trim the ends
sentences_df['Clean_Text'] = cleaned.str.replace(r' {2,}', ' ', regex=True).str.strip()

#Remove stopwords
stop_words = set(stopwords.words("english"))
sentences_df['Text_NoStops'] = sentences_df['Clean_Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
sentences_df

In [None]:
#Make column with ID and sentence number.
#The column keeps the name 'ID_Paragraph' because the file-writing cell looks it up by that name.
sentences_df['ID_Paragraph'] = sentences_df['ID'].astype(str) + '_' + sentences_df['Sentence'].astype(str)

#Download sentences to csv.
#Export from sentences_df: selecting from paragraphs_df here would write the
#paragraph rows into all_sentences.csv.
sentences_df_download = sentences_df[['ID_Paragraph', 'Text']].copy()
sentences_df_download.to_csv('all_sentences.csv')

In [None]:
#Save each cleaned sentence as its own txt file

#Cleaned sentence texts and their matching file names, in row order
sentences = [str(text) for text in sentences_df['Clean_Text']]
filenames = [str(name) for name in sentences_df['ID_Paragraph']]

#Make the output directory; exist_ok lets the cell be re-run without an error
os.makedirs("sentences_df", exist_ok=True)

#Write texts to files.
#`with` closes every file even if a write fails, and the explicit encoding keeps
#non-ASCII characters from failing on platforms whose default is not UTF-8.
for name, text in zip(filenames, sentences):
    with open(os.path.join("sentences_df", name + '.txt'), 'w', encoding='utf-8') as f:
        f.write(text)
  