<a href="https://colab.research.google.com/github/mkane968/Digital-Text-Analysis-for-WPA/blob/main/Digital_Text_Analysis_for_WPA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Digital Text Analysis for WPA
A pipeline for examining student essays computationally for the purposes of writing program assessment.

Updated February 2023

Questions? Contact megan.kane@temple.edu

## Install Packages and Upload Files

In [53]:
#Install os and glob
import glob 
import os

#Install pandas
import pandas as pd

#Install regular expressions
import re

In [2]:
#Upload dataframe with papers and scores
#Pipeline for associating & cleaning essays and grades: https://github.com/mkane968/Text-Mining-with-Student-Papers/blob/main/notebooks/Text%20Mining%20Student%20Essays%2012-2022%20(Jupyter%20Notebook).ipynb
from google.colab import files
uploaded = files.upload()

Saving essays_and_scores.csv to essays_and_scores.csv


## Segment Texts into Paragraphs

In [59]:
#Transform csv to dataframe
paragraphs_df = pd.read_csv('essays_and_scores.csv', index_col=0)

#Add ID and score in one column
paragraphs_df['Score_ID'] = 'Score: ' + paragraphs_df['Score'].astype(str) + ', ID: ' + paragraphs_df['ID'].astype(str)

#Check new df
paragraphs_df.head()

Unnamed: 0,ID,Score,Text_Newlines,Score_ID
0,91,41.0,...,"Score: 41.0, ID: 91"
1,87,45.0,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,"Score: 45.0, ID: 87"
2,90,47.0,\n\nSophie Jung ...,"Score: 47.0, ID: 90"
3,81,57.0,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,"Score: 57.0, ID: 81"
4,59,66.0,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,"Score: 66.0, ID: 59"


In [60]:
#Count number of paragraphs in each text
paragraph_counts = paragraphs_df['Text_Newlines'].str.count(r'\n')
paragraph_counts

#Append paragraphs counts to dataframe
paragraphs_df["Paragraph_Counts"] = paragraph_counts
paragraphs_df

Unnamed: 0,ID,Score,Text_Newlines,Score_ID,Paragraph_Counts
0,91,41.0,...,"Score: 41.0, ID: 91",117
1,87,45.0,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,"Score: 45.0, ID: 87",199
2,90,47.0,\n\nSophie Jung ...,"Score: 47.0, ID: 90",111
3,81,57.0,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,"Score: 57.0, ID: 81",85
4,59,66.0,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,"Score: 66.0, ID: 59",110
...,...,...,...,...,...
142,117,98.0,"Stephanie Robbins\nSaraGrace Stefan\nENG 0802,...","Score: 98.0, ID: 117",89
143,111,98.0,Paul Kushnirsky\nProfessor Stefan \nEnglish 08...,"Score: 98.0, ID: 111",164
144,101,99.0,Olivia Blake\nProfessor SaraGrace Stefan\nENG ...,"Score: 99.0, ID: 101",55
145,106,99.0,\n\nTemple University\n\n\n\nAnalytical Readin...,"Score: 99.0, ID: 106",149


In [61]:
#Make new cell each time new paragraph starts 
new = paragraphs_df["Text_Newlines"].str.split(r'\n', expand = True).set_index(paragraphs_df['Score_ID'])

#Flatten dataframe so each chapter is on own row, designated by book and chapter 
paragraphs_df = new.stack().reset_index()
paragraphs_df.columns = ["Score_ID", "Paragraph", "Text"]

#Split score and ID back to own columns
paragraphs_df[['Score','ID']] = paragraphs_df.Score_ID.str.split(", ",expand=True)
paragraphs_df['Score'] = paragraphs_df['Score'].map(lambda x: x.lstrip('Score: '))
paragraphs_df['ID'] = paragraphs_df['ID'].map(lambda x: x.lstrip('ID: '))
paragraphs_df['Score_ID_Paragraph'] = 'Score:_' + paragraphs_df['Score'].astype(str) + '_ID:_' + paragraphs_df['ID'].astype(str) + '_Paragraph:_' + paragraphs_df['Paragraph'].astype(str)
paragraphs_df

Unnamed: 0,Score_ID,Paragraph,Text,Score,ID,Score_ID_Paragraph
0,"Score: 41.0, ID: 91",0,...,41.0,91,Score:_41.0_ID:_91_Paragraph:_0
1,"Score: 41.0, ID: 91",1,...,41.0,91,Score:_41.0_ID:_91_Paragraph:_1
2,"Score: 41.0, ID: 91",2,"Dear Portfolio Committee,",41.0,91,Score:_41.0_ID:_91_Paragraph:_2
3,"Score: 41.0, ID: 91",3,I ...,41.0,91,Score:_41.0_ID:_91_Paragraph:_3
4,"Score: 41.0, ID: 91",4,,41.0,91,Score:_41.0_ID:_91_Paragraph:_4
...,...,...,...,...,...,...
21624,"Score: 99.0, ID: 110",154,"Space.” Feminist Studies, vol. 44, no. 3, 2018...",99.0,110,Score:_99.0_ID:_110_Paragraph:_154
21625,"Score: 99.0, ID: 110",155,"Wearing, Stephen, et al. “‘Poor Children on Ti...",99.0,110,Score:_99.0_ID:_110_Paragraph:_155
21626,"Score: 99.0, ID: 110",156,Feminist Political Economy of Volunteer Touris...,99.0,110,Score:_99.0_ID:_110_Paragraph:_156
21627,"Score: 99.0, ID: 110",157,,99.0,110,Score:_99.0_ID:_110_Paragraph:_157


In [62]:
##Clean paragraphs
##Filter out paragraphs with 5 or less words (headers)
paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.split().str.len().lt(10)]

## Filter out paragraphs containing "http://", "doi:" , "https://" and "://www" (Works Cited citations)
paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("http://")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("https://")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("://www")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("www.")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains(".com/")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("Vol.")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("doi:")]

paragraphs_df

Unnamed: 0,Score_ID,Paragraph,Text,Score,ID,Score_ID_Paragraph
3,"Score: 41.0, ID: 91",3,I ...,41.0,91,Score:_41.0_ID:_91_Paragraph:_3
18,"Score: 41.0, ID: 91",18,When will this come to an end? Ho...,41.0,91,Score:_41.0_ID:_91_Paragraph:_18
20,"Score: 41.0, ID: 91",20,Gay introduced his article with a...,41.0,91,Score:_41.0_ID:_91_Paragraph:_20
22,"Score: 41.0, ID: 91",22,“Any time you meet the cops and d...,41.0,91,Score:_41.0_ID:_91_Paragraph:_22
24,"Score: 41.0, ID: 91",24,"As a black, they are having a mark on their ow...",41.0,91,Score:_41.0_ID:_91_Paragraph:_24
...,...,...,...,...,...,...
21617,"Score: 99.0, ID: 110",147,"Dunne, Sarah Anne. “Lena Dunham’s Apology to A...",99.0,110,Score:_99.0_ID:_110_Paragraph:_147
21619,"Score: 99.0, ID: 110",149,"Lang, Cady. “How the Karen Meme Confronts Hist...",99.0,110,Score:_99.0_ID:_110_Paragraph:_149
21621,"Score: 99.0, ID: 110",151,"Li, Shirley. “When a TV Adaptation Does What t...",99.0,110,Score:_99.0_ID:_110_Paragraph:_151
21623,"Score: 99.0, ID: 110",153,"Mann, Justin Louis. “What's Your Emergency?: W...",99.0,110,Score:_99.0_ID:_110_Paragraph:_153


In [69]:
#Keep only score/id/paragraph and texts
paragraphs_full = paragraphs_df[['Score_ID_Paragraph', 'Text']].copy()

#Download dataframe with all paragraphs
paragraphs_full.to_csv('paragraphs_full.csv') 
files.download('paragraphs_full.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Keep Only Paragraphs with Rhetorical Terminology

In [45]:
##Set up new dataframe for keyword frequency counts
rhetorical_keywords_paragraphs_df = paragraphs_df[['Score_ID_Paragraph', 'Text']].copy()

#Count number of occurences of rhetorical terms in each paper
pathos_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('pathos')
ethos_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('ethos')
logos_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('logos')
audience_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('audience')
context_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('context')
purpose_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('purpose')
author_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('author')
exigency_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('exigency')
appeal_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('appeal')

#Append each count to the dataframe
rhetorical_keywords_paragraphs_df['Pathos_Counts'] = pathos_counts
rhetorical_keywords_paragraphs_df["Ethos_Counts"] = ethos_counts
rhetorical_keywords_paragraphs_df["Logos_Counts"] = logos_counts
rhetorical_keywords_paragraphs_df["Audience_Counts"] = audience_counts
rhetorical_keywords_paragraphs_df["Context_Counts"] = context_counts
rhetorical_keywords_paragraphs_df["Purpose_Counts"] = purpose_counts
rhetorical_keywords_paragraphs_df["Author_Counts"] = author_counts
rhetorical_keywords_paragraphs_df["Exigency_Counts"] = exigency_counts
rhetorical_keywords_paragraphs_df["Appeal_Counts"] = appeal_counts


#Get sum of all term usages
rhetorical_terms = ['Pathos_Counts', 'Ethos_Counts', 'Logos_Counts', 'Audience_Counts', 'Context_Counts', 'Purpose_Counts', 'Author_Counts', 'Exigency_Counts', 'Appeal_Counts']
rhetorical_keywords_paragraphs_df['Sum_Terms'] = rhetorical_keywords_paragraphs_df[rhetorical_terms].sum(axis=1)

#Check dataframe
rhetorical_keywords_paragraphs_df

Unnamed: 0,Score_ID_Paragraph,Text,Pathos_Counts,Ethos_Counts,Logos_Counts,Audience_Counts,Context_Counts,Purpose_Counts,Author_Counts,Exigency_Counts,Appeal_Counts,Sum_Terms
3,Score:_41.0_ID:_91_Paragraph:_3,I ...,0,0,0,0,0,0,0,0,0,0
18,Score:_41.0_ID:_91_Paragraph:_18,When will this come to an end? Ho...,0,0,0,0,0,0,0,0,0,0
20,Score:_41.0_ID:_91_Paragraph:_20,Gay introduced his article with a...,0,0,0,0,0,0,0,0,0,0
22,Score:_41.0_ID:_91_Paragraph:_22,“Any time you meet the cops and d...,0,0,0,0,0,0,0,0,0,0
24,Score:_41.0_ID:_91_Paragraph:_24,"As a black, they are having a mark on their ow...",0,0,0,0,0,0,3,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
21617,Score:_99.0_ID:_110_Paragraph:_147,"Dunne, Sarah Anne. “Lena Dunham’s Apology to A...",0,0,0,0,0,0,0,0,0,0
21619,Score:_99.0_ID:_110_Paragraph:_149,"Lang, Cady. “How the Karen Meme Confronts Hist...",0,0,0,0,0,0,0,0,0,0
21621,Score:_99.0_ID:_110_Paragraph:_151,"Li, Shirley. “When a TV Adaptation Does What t...",0,0,0,0,0,0,0,0,0,0
21623,Score:_99.0_ID:_110_Paragraph:_153,"Mann, Justin Louis. “What's Your Emergency?: W...",0,0,0,0,0,0,0,0,0,0


In [46]:
#Remove all rows with no rhetorical terms
rhetorical_keywords_paragraphs_df = rhetorical_keywords_paragraphs_df[rhetorical_keywords_paragraphs_df.Sum_Terms > 0]

rhetorical_keywords_paragraphs_df

Unnamed: 0,Score_ID_Paragraph,Text,Pathos_Counts,Ethos_Counts,Logos_Counts,Audience_Counts,Context_Counts,Purpose_Counts,Author_Counts,Exigency_Counts,Appeal_Counts,Sum_Terms
24,Score:_41.0_ID:_91_Paragraph:_24,"As a black, they are having a mark on their ow...",0,0,0,0,0,0,3,0,0,3
87,Score:_41.0_ID:_91_Paragraph:_87,In the book “Video surveillance: Power and Pri...,0,0,0,0,0,0,4,0,0,4
88,Score:_41.0_ID:_91_Paragraph:_88,In the book “Crime Security and Su...,0,0,0,0,0,0,1,0,0,1
90,Score:_41.0_ID:_91_Paragraph:_90,In the book “Surveillance As Social...,0,0,0,0,0,0,2,0,0,2
91,Score:_41.0_ID:_91_Paragraph:_91,In the book “ Surveillance In The T...,0,0,0,0,0,0,2,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
21539,Score:_99.0_ID:_110_Paragraph:_69,"In the mid-1700s, Jeremy Bentham created a “so...",0,0,0,0,0,0,1,0,0,1
21540,Score:_99.0_ID:_110_Paragraph:_70,sociological effect is that the prisoners are ...,0,0,0,0,0,0,2,0,0,2
21541,Score:_99.0_ID:_110_Paragraph:_71,"In the 1970s, Michel Foucalt expanded on the p...",0,0,0,0,0,0,1,0,0,1
21544,Score:_99.0_ID:_110_Paragraph:_74,\tAlthough most studies focus on those subject...,0,0,0,2,0,0,0,0,0,2


In [50]:
#Keep only score/id/paragraph and text column
rhetorical_paras = rhetorical_keywords_paragraphs_df[['Score_ID_Paragraph', 'Text']].copy()
rhetorical_paras

Unnamed: 0,Score_ID_Paragraph,Text
24,Score:_41.0_ID:_91_Paragraph:_24,"As a black, they are having a mark on their ow..."
87,Score:_41.0_ID:_91_Paragraph:_87,In the book “Video surveillance: Power and Pri...
88,Score:_41.0_ID:_91_Paragraph:_88,In the book “Crime Security and Su...
90,Score:_41.0_ID:_91_Paragraph:_90,In the book “Surveillance As Social...
91,Score:_41.0_ID:_91_Paragraph:_91,In the book “ Surveillance In The T...
...,...,...
21539,Score:_99.0_ID:_110_Paragraph:_69,"In the mid-1700s, Jeremy Bentham created a “so..."
21540,Score:_99.0_ID:_110_Paragraph:_70,sociological effect is that the prisoners are ...
21541,Score:_99.0_ID:_110_Paragraph:_71,"In the 1970s, Michel Foucalt expanded on the p..."
21544,Score:_99.0_ID:_110_Paragraph:_74,\tAlthough most studies focus on those subject...


In [None]:
#Download dataframe with scores and rhetorical terminology paragraphs
rhetorical_paras.to_csv('rhetorical_paras.csv') 
files.download('rhetorical_paras.csv')

In [None]:
#Download zip file with rhetorical terminology paragraphs named by score
#Add each text to a new list called paragraphs
rhetorical_paragraphs = []
for row in rhetorical_paras['Text'].items():
    row_string = (str(row[1]))
    rhetorical_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in rhetorical_paras['Score_ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir rhetorical_paragraphs

#Write texts to files
n = 0
for item in rhetorical_paragraphs:
  f = open("rhetorical_paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r rhetorical_paragraphs.zip rhetorical_paragraphs

#Download file to zip folder to run through DocuScope
files.download('rhetorical_paragraphs.zip')


## Keep Only Paragraphs with Citation Markers

In [64]:
#Get any text inside parentheticals and count of parentheticals and append to dataframe
#https://stackoverflow.com/questions/24696715/regex-for-match-parentheses-in-python
parentheticals = r'(?<=\().*?(?=\))'

#Add new list for parenthetical citations
parenthetical_matches = []
parenthetical_counts = []

#Find all occurences of parenthetical citations in each paragraph of each text
citation_df = paragraphs_df[['Score_ID_Paragraph', 'Text']].copy()
for text in citation_df['Text']:
  matches = re.findall(parentheticals, text)
  parenthetical_matches.append(matches)
  parenthetical_counts.append(len(matches))

#Make new column counting all appearances of parentheticals
citation_df["Parentheticals"] = parenthetical_matches
citation_df['Parenthetical_Counts'] = parenthetical_counts

citation_df


Unnamed: 0,Score_ID_Paragraph,Text,Parentheticals,Parenthetical_Counts
3,Score:_41.0_ID:_91_Paragraph:_3,I ...,[],0
18,Score:_41.0_ID:_91_Paragraph:_18,When will this come to an end? Ho...,[],0
20,Score:_41.0_ID:_91_Paragraph:_20,Gay introduced his article with a...,[],0
22,Score:_41.0_ID:_91_Paragraph:_22,“Any time you meet the cops and d...,[],0
24,Score:_41.0_ID:_91_Paragraph:_24,"As a black, they are having a mark on their ow...",[],0
...,...,...,...,...
21617,Score:_99.0_ID:_110_Paragraph:_147,"Dunne, Sarah Anne. “Lena Dunham’s Apology to A...",[],0
21619,Score:_99.0_ID:_110_Paragraph:_149,"Lang, Cady. “How the Karen Meme Confronts Hist...",[],0
21621,Score:_99.0_ID:_110_Paragraph:_151,"Li, Shirley. “When a TV Adaptation Does What t...",[],0
21623,Score:_99.0_ID:_110_Paragraph:_153,"Mann, Justin Louis. “What's Your Emergency?: W...",[],0


In [65]:
#Remove all rows with no parenthetical terms
citation_df_no_blanks = citation_df[citation_df.Parenthetical_Counts > 0]
citation_df_no_blanks

Unnamed: 0,Score_ID_Paragraph,Text,Parentheticals,Parenthetical_Counts
26,Score:_41.0_ID:_91_Paragraph:_26,Gay also discussed about his honey bees that h...,[ So the bees could haul its honey back inside],1
148,Score:_45.0_ID:_87_Paragraph:_30,Titchkosky argues that these statements of jus...,[p. 46],1
150,Score:_45.0_ID:_87_Paragraph:_32,Those responsible for the building say that ot...,[44],1
151,Score:_45.0_ID:_87_Paragraph:_33,The very inclusion of these stories helps to ...,[p.42],1
152,Score:_45.0_ID:_87_Paragraph:_34,The use of these five anecdotes has a purpose ...,[45],1
...,...,...,...,...
21609,Score:_99.0_ID:_110_Paragraph:_139,\tBeing openly racist is not the only way that...,"[Dunne, Dunne, Dunne]",3
21610,Score:_99.0_ID:_110_Paragraph:_140,This scandal is just one occasion in which a w...,[Dunne],1
21611,Score:_99.0_ID:_110_Paragraph:_141,\tNeocolonialism is another form of white wome...,"[Wearing, Wearing, Wearing]",3
21612,Score:_99.0_ID:_110_Paragraph:_142,\tTo further exemplify how dangerous white wom...,"[Li, Li, Li]",3


In [66]:
#Keep only score/id/paragraph and text column
citation_paras = citation_df_no_blanks[['Score_ID_Paragraph', 'Text']].copy()
citation_paras

Unnamed: 0,Score_ID_Paragraph,Text
26,Score:_41.0_ID:_91_Paragraph:_26,Gay also discussed about his honey bees that h...
148,Score:_45.0_ID:_87_Paragraph:_30,Titchkosky argues that these statements of jus...
150,Score:_45.0_ID:_87_Paragraph:_32,Those responsible for the building say that ot...
151,Score:_45.0_ID:_87_Paragraph:_33,The very inclusion of these stories helps to ...
152,Score:_45.0_ID:_87_Paragraph:_34,The use of these five anecdotes has a purpose ...
...,...,...
21609,Score:_99.0_ID:_110_Paragraph:_139,\tBeing openly racist is not the only way that...
21610,Score:_99.0_ID:_110_Paragraph:_140,This scandal is just one occasion in which a w...
21611,Score:_99.0_ID:_110_Paragraph:_141,\tNeocolonialism is another form of white wome...
21612,Score:_99.0_ID:_110_Paragraph:_142,\tTo further exemplify how dangerous white wom...


In [71]:
#Download dataframe with scores and citation paragraphs
citation_paras.to_csv('citation_paras.csv') 
files.download('citation_paras.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [72]:
#Download zip file with rhetorical terminology paragraphs named by score
#Add each text to a new list called paragraphs
citation_paragraphs = []
for row in citation_paras['Text'].items():
    row_string = (str(row[1]))
    citation_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in citation_paras['Score_ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir citation_paragraphs

#Write texts to files
n = 0
for item in citation_paragraphs:
  f = open("citation_paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r citation_paragraphs.zip citation_paragraphs

#Download file to zip folder to run through DocuScope
files.download('citation_paragraphs.zip')


  adding: citation_paragraphs/ (stored 0%)
  adding: citation_paragraphs/Score:_82.0_ID:_114_Paragraph:_85.txt (deflated 41%)
  adding: citation_paragraphs/Score:_92.0_ID:_29_Paragraph:_78.txt (deflated 43%)
  adding: citation_paragraphs/Score:_89.0_ID:_74_Paragraph:_67.txt (deflated 52%)
  adding: citation_paragraphs/Score:_94.0_ID:_137_Paragraph:_40.txt (deflated 54%)
  adding: citation_paragraphs/Score:_86.0_ID:_65_Paragraph:_59.txt (deflated 55%)
  adding: citation_paragraphs/Score:_89.0_ID:_0_Paragraph:_64.txt (deflated 50%)
  adding: citation_paragraphs/Score:_86.0_ID:_145_Paragraph:_43.txt (deflated 51%)
  adding: citation_paragraphs/Score:_90.0_ID:_138_Paragraph:_117.txt (deflated 37%)
  adding: citation_paragraphs/Score:_47.0_ID:_90_Paragraph:_50.txt (deflated 43%)
  adding: citation_paragraphs/Score:_84.0_ID:_6_Paragraph:_34.txt (deflated 52%)
  adding: citation_paragraphs/Score:_87.0_ID:_1_Paragraph:_77.txt (deflated 52%)
  adding: citation_paragraphs/Score:_70.0_ID:_98_Para

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>