<a href="https://colab.research.google.com/github/mkane968/Digital-Text-Analysis-for-WPA/blob/main/Digital_Text_Analysis_for_WPA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Digital Text Analysis for WPA
A pipeline for examining student essays computationally for the purposes of writing program assessment.

Updated February 2023

Questions? Contact megan.kane@temple.edu

## Install Packages and Upload Files

In [None]:
#Import os and glob
import glob 
import os

#Import pandas
import pandas as pd

#Import regular expressions
import re

In [None]:
#Upload dataframe with papers and scores
#The file is read later as 'essays_and_scores.csv', so the uploaded file needs that name
#Pipeline for associating & cleaning essays and grades: https://github.com/mkane968/Text-Mining-with-Student-Papers/blob/main/notebooks/Text%20Mining%20Student%20Essays%2012-2022%20(Jupyter%20Notebook).ipynb
#The files module is also used further down to download results
from google.colab import files
#NOTE(review): files.upload() presumably opens Colab's file picker and returns the uploaded files - confirm against the Colab docs
uploaded = files.upload()

## Segment Texts into Paragraphs

In [None]:
#Load the uploaded csv, using its first column as the index
paragraphs_df = pd.read_csv('essays_and_scores.csv', index_col=0)

#Build a single label per essay that carries both its score and its ID
score_labels = 'Score: ' + paragraphs_df['Score'].astype(str)
id_labels = 'ID: ' + paragraphs_df['ID'].astype(str)
paragraphs_df['Score_ID'] = score_labels + ', ' + id_labels

#Preview the first rows to confirm the new column
paragraphs_df.head()

In [None]:
#Count the newline characters in each text (used here as the paragraph count)
essay_texts = paragraphs_df['Text_Newlines']
paragraph_counts = essay_texts.str.count(r'\n')

#Store the counts alongside each essay and display the result
paragraphs_df["Paragraph_Counts"] = paragraph_counts
paragraphs_df

In [None]:
#Split each text on newlines so every paragraph gets its own cell, with rows labeled by Score_ID
new = paragraphs_df["Text_Newlines"].str.split(r'\n', expand = True).set_index(paragraphs_df['Score_ID'])

#Flatten dataframe so each paragraph is on its own row, designated by Score_ID and paragraph number
paragraphs_df = new.stack().reset_index()
paragraphs_df.columns = ["Score_ID", "Paragraph", "Text"]

#Split score and ID back to own columns
#n=1 splits only at the first ", " so an ID that itself contains ", " stays in one piece
paragraphs_df[['Score','ID']] = paragraphs_df['Score_ID'].str.split(", ", n=1, expand=True)

#Remove the exact label prefixes. lstrip('Score: ') / lstrip('ID: ') would strip any leading
#characters found in that set (e.g. turn an ID of "D12" into "12"), not just the prefix.
paragraphs_df['Score'] = paragraphs_df['Score'].str.replace(r'^Score: ', '', regex=True)
paragraphs_df['ID'] = paragraphs_df['ID'].str.replace(r'^ID: ', '', regex=True)

#Build one label per paragraph combining score, ID, and paragraph number
paragraphs_df['Score_ID_Paragraph'] = 'Score:_' + paragraphs_df['Score'].astype(str) + '_ID:_' + paragraphs_df['ID'].astype(str) + '_Paragraph:_' + paragraphs_df['Paragraph'].astype(str)
paragraphs_df

In [None]:
##Clean paragraphs
##Filter out paragraphs with fewer than 10 words (headers)
#ge(10) also drops any row whose text is missing, since a missing word count is never >= 10
word_counts = paragraphs_df['Text'].str.split().str.len()
paragraphs_df = paragraphs_df[word_counts.ge(10)]

##Filter out paragraphs containing URL or citation markers (Works Cited citations)
#regex=False matches each marker literally. Read as a regex, the "." in "www.", ".com/" and
#"Vol." matches any character, so "Vol." would also drop paragraphs containing "Volunteer".
citation_markers = ["http://", "https://", "://www", "www.", ".com/", "Vol.", "doi:"]
for marker in citation_markers:
    paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains(marker, regex=False)]

paragraphs_df

In [None]:
#Copy just the paragraph label and the paragraph text into a new dataframe
export_columns = ['Score_ID_Paragraph', 'Text']
paragraphs_full = paragraphs_df.loc[:, export_columns].copy()

#Save every remaining paragraph to csv, then download that file
paragraphs_full.to_csv('paragraphs_full.csv')
files.download('paragraphs_full.csv')

## Keep Only Paragraphs with Rhetorical Terminology

In [None]:
##Start from a copy holding only the paragraph label and text
rhetorical_keywords_paragraphs_df = paragraphs_df.loc[:, ['Score_ID_Paragraph', 'Text']].copy()

#Count how often each rhetorical term appears in every paragraph
#(the counts are case-sensitive and also match the term inside longer words)
keyword_text = rhetorical_keywords_paragraphs_df['Text']
pathos_counts = keyword_text.str.count('pathos')
ethos_counts = keyword_text.str.count('ethos')
logos_counts = keyword_text.str.count('logos')
audience_counts = keyword_text.str.count('audience')
context_counts = keyword_text.str.count('context')
purpose_counts = keyword_text.str.count('purpose')
author_counts = keyword_text.str.count('author')
exigency_counts = keyword_text.str.count('exigency')
appeal_counts = keyword_text.str.count('appeal')

#Attach every count series as its own column, in this order
count_columns = {
    'Pathos_Counts': pathos_counts,
    'Ethos_Counts': ethos_counts,
    'Logos_Counts': logos_counts,
    'Audience_Counts': audience_counts,
    'Context_Counts': context_counts,
    'Purpose_Counts': purpose_counts,
    'Author_Counts': author_counts,
    'Exigency_Counts': exigency_counts,
    'Appeal_Counts': appeal_counts,
}
for column_name, term_counts in count_columns.items():
    rhetorical_keywords_paragraphs_df[column_name] = term_counts

#Total the term counts across all count columns for each paragraph
rhetorical_terms = list(count_columns)
rhetorical_keywords_paragraphs_df['Sum_Terms'] = rhetorical_keywords_paragraphs_df[rhetorical_terms].sum(axis=1)

#Display the dataframe
rhetorical_keywords_paragraphs_df

In [None]:
#Keep only the paragraphs that use at least one rhetorical term
uses_rhetorical_terms = rhetorical_keywords_paragraphs_df['Sum_Terms'] > 0
rhetorical_keywords_paragraphs_df = rhetorical_keywords_paragraphs_df.loc[uses_rhetorical_terms]

rhetorical_keywords_paragraphs_df

In [None]:
#Drop the count columns, leaving the paragraph label and the paragraph text
label_and_text = ['Score_ID_Paragraph', 'Text']
rhetorical_paras = rhetorical_keywords_paragraphs_df.loc[:, label_and_text].copy()
rhetorical_paras

In [None]:
#Save the rhetorical terminology paragraphs (with their labels) to csv, then download the file
rhetorical_csv = 'rhetorical_paras.csv'
rhetorical_paras.to_csv(rhetorical_csv)
files.download(rhetorical_csv)

In [None]:
#Download zip file with rhetorical terminology paragraphs named by score
#Add each text to a new list called paragraphs
rhetorical_paragraphs = []
for row in rhetorical_paras['Text'].items():
    row_string = (str(row[1]))
    rhetorical_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in rhetorical_paras['Score_ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir rhetorical_paragraphs

#Write texts to files
n = 0
for item in rhetorical_paragraphs:
  f = open("rhetorical_paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r rhetorical_paragraphs.zip rhetorical_paragraphs

#Download file to zip folder to run through DocuScope
files.download('rhetorical_paragraphs.zip')


## Keep Only Paragraphs with Citation Markers

In [None]:
#Pattern capturing the text between an opening and a closing parenthesis
#https://stackoverflow.com/questions/24696715/regex-for-match-parentheses-in-python
parentheticals = r'(?<=\().*?(?=\))'

#Work on a copy holding only the paragraph label and text
citation_df = paragraphs_df[['Score_ID_Paragraph', 'Text']].copy()

#For every paragraph, list the parenthetical matches and how many were found
parenthetical_pattern = re.compile(parentheticals)
parenthetical_matches = [parenthetical_pattern.findall(paragraph) for paragraph in citation_df['Text']]
parenthetical_counts = [len(found) for found in parenthetical_matches]

#Store the matches and their counts as new columns
citation_df["Parentheticals"] = parenthetical_matches
citation_df['Parenthetical_Counts'] = parenthetical_counts

citation_df


In [None]:
#Keep only the paragraphs that contain at least one parenthetical
has_parenthetical = citation_df['Parenthetical_Counts'] > 0
citation_df_no_blanks = citation_df.loc[has_parenthetical]
citation_df_no_blanks

In [None]:
#Drop the parenthetical columns, leaving the paragraph label and the paragraph text
citation_columns = ['Score_ID_Paragraph', 'Text']
citation_paras = citation_df_no_blanks.loc[:, citation_columns].copy()
citation_paras

In [None]:
#Save the citation paragraphs (with their labels) to csv, then download the file
citation_csv = 'citation_paras.csv'
citation_paras.to_csv(citation_csv)
files.download(citation_csv)

In [None]:
#Download zip file with rhetorical terminology paragraphs named by score
#Add each text to a new list called paragraphs
citation_paragraphs = []
for row in citation_paras['Text'].items():
    row_string = (str(row[1]))
    citation_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in citation_paras['Score_ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir citation_paragraphs

#Write texts to files
n = 0
for item in citation_paragraphs:
  f = open("citation_paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r citation_paragraphs.zip citation_paragraphs

#Download file to zip folder to run through DocuScope
files.download('citation_paragraphs.zip')
