<a href="https://colab.research.google.com/github/mkane968/Digital-Text-Analysis-for-WPA/blob/main/Digital_Text_Analysis_for_WPA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Digital Text Analysis for WPA
A pipeline for examining student essays computationally for the purposes of writing program assessment.

Updated February 2023

Questions? Contact megan.kane@temple.edu

## Install Packages and Upload Files

In [None]:
#Import os and glob
import glob 
import os

#Import pandas
import pandas as pd

#Import regular expressions
import re

#Import files to upload text and csv files to drive
from google.colab import files

## Segment Texts into Paragraphs

In [None]:
#Upload dataframe with papers and scores (the csv is read as 'essays_and_scores.csv' in the following cell)
#Pipeline for associating & cleaning essays and grades: https://github.com/mkane968/Text-Mining-with-Student-Papers/blob/main/notebooks/Text%20Mining%20Student%20Essays%2012-2022%20(Jupyter%20Notebook).ipynb
#The value returned by the upload call is kept in `uploaded`
uploaded = files.upload()

In [None]:
#Load the uploaded csv into a dataframe (the first csv column becomes the index)
paragraphs_df = pd.read_csv('essays_and_scores.csv', index_col=0)

#Build one combined "Score: <score>, ID: <id>" label per essay
score_label = 'Score: ' + paragraphs_df['Score'].astype(str)
id_label = ', ID: ' + paragraphs_df['ID'].astype(str)
paragraphs_df['Score_ID'] = score_label + id_label

#Preview the first rows of the new df
paragraphs_df.head()

In [None]:
#Tally the newline characters in each text; this tally is used as the paragraph count
newline_column = paragraphs_df['Text_Newlines']
paragraph_counts = newline_column.str.count(r'\n')

#Store the tally alongside each essay and display the dataframe
paragraphs_df["Paragraph_Counts"] = paragraph_counts
paragraphs_df

In [None]:
#Split each essay at its newline characters so every paragraph gets its own cell, indexed by Score_ID
new = paragraphs_df["Text_Newlines"].str.split(r'\n', expand = True).set_index(paragraphs_df['Score_ID'])

#Flatten dataframe so each paragraph is on its own row, designated by essay (Score_ID) and paragraph number
paragraphs_df = new.stack().reset_index()
paragraphs_df.columns = ["Score_ID", "Paragraph", "Text"]

#Split score and ID back to own columns
#n=1 splits only at the first ", " so an ID that itself contains ", " stays in one piece
score_and_id = paragraphs_df['Score_ID'].str.split(", ", n=1, expand=True)
#Remove the literal "Score: " / "ID: " prefixes. lstrip('ID: ') would instead strip any leading run of
#the characters I, D, ':' and space, and so could eat the start of an ID such as "D12"
paragraphs_df['Score'] = score_and_id[0].str.replace(r'^Score: ', '', regex=True)
paragraphs_df['ID'] = score_and_id[1].str.replace(r'^ID: ', '', regex=True)

#Build a single label per paragraph (later used as the text file name)
paragraphs_df['Score_ID_Paragraph'] = 'Score:_' + paragraphs_df['Score'].astype(str) + '_ID:_' + paragraphs_df['ID'].astype(str) + '_Paragraph:_' + paragraphs_df['Paragraph'].astype(str)
paragraphs_df

In [None]:
##Clean paragraphs
##Filter out paragraphs with fewer than 10 words (headers and other short fragments)
word_counts = paragraphs_df['Text'].str.split().str.len()
paragraphs_df = paragraphs_df[~word_counts.lt(10)]

##Filter out paragraphs containing URL or citation markers (Works Cited citations)
#regex=False makes every marker a literal substring. Read as a regex, the "." in "Vol.", "www." or
#".com/" matches any character, so "Vol." would also drop paragraphs containing e.g. "Volume"
citation_markers = ["http://", "https://", "://www", "www.", ".com/", "Vol.", "doi:"]
for marker in citation_markers:
    paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains(marker, regex=False)]

paragraphs_df

In [None]:
#Build a two-column copy holding only the score/id/paragraph label and the paragraph text
paragraphs_full = paragraphs_df.loc[:, ['Score_ID_Paragraph', 'Text']].copy()

#Save all paragraphs to csv, then download that csv
full_csv_name = 'paragraphs_full.csv'
paragraphs_full.to_csv(full_csv_name)
files.download(full_csv_name)

## Keep Only Paragraphs with Rhetorical Terminology

In [None]:
##Start a fresh dataframe (score, label, text) to hold the keyword frequency counts
rhetorical_keywords_paragraphs_df = paragraphs_df[['Score','Score_ID_Paragraph', 'Text']].copy()

#Map each count column to the rhetorical term it tallies
#Counting is case-sensitive and matches the term anywhere in the text, including inside longer words
keyword_columns = {
    'Pathos_Counts': 'pathos',
    'Ethos_Counts': 'ethos',
    'Logos_Counts': 'logos',
    'Audience_Counts': 'audience',
    'Context_Counts': 'context',
    'Purpose_Counts': 'purpose',
    'Author_Counts': 'author',
    'Exigency_Counts': 'exigency',
    'Appeal_Counts': 'appeal',
}

#Count each term in every paragraph and append the counts as a new column
for count_column, keyword in keyword_columns.items():
    rhetorical_keywords_paragraphs_df[count_column] = rhetorical_keywords_paragraphs_df['Text'].str.count(keyword)

#Add up all term counts per paragraph
rhetorical_terms = list(keyword_columns)
rhetorical_keywords_paragraphs_df['Sum_Terms'] = rhetorical_keywords_paragraphs_df[rhetorical_terms].sum(axis=1)

#Display the dataframe
rhetorical_keywords_paragraphs_df

In [None]:
#Keep only the paragraphs that use at least one rhetorical term
has_rhetorical_term = rhetorical_keywords_paragraphs_df['Sum_Terms'] > 0
rhetorical_keywords_paragraphs_df = rhetorical_keywords_paragraphs_df[has_rhetorical_term]

rhetorical_keywords_paragraphs_df

In [None]:
#Copy just the score/id/paragraph label and the text of each remaining paragraph
rhetorical_paras = rhetorical_keywords_paragraphs_df.loc[:, ['Score_ID_Paragraph', 'Text']].copy()
rhetorical_paras

In [None]:
#Download dataframe with scores and rhetorical terminology paragraphs
#First write the dataframe to rhetorical_paras.csv, then download that file
rhetorical_paras.to_csv('rhetorical_paras.csv') 
files.download('rhetorical_paras.csv')

In [None]:
#Download zip file with rhetorical terminology paragraphs named by score
#Add each text to a new list called paragraphs
rhetorical_paragraphs = []
for row in rhetorical_paras['Text'].items():
    row_string = (str(row[1]))
    rhetorical_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in rhetorical_paras['Score_ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir rhetorical_paragraphs

#Write texts to files
n = 0
for item in rhetorical_paragraphs:
  f = open("rhetorical_paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r rhetorical_paragraphs.zip rhetorical_paragraphs

#Download file to zip folder to run through DocuScope
files.download('rhetorical_paragraphs.zip')


## Keep Only Paragraphs with Citation Markers

In [None]:
#Capture the text inside each pair of parentheses and count the parentheticals in every paragraph
#https://stackoverflow.com/questions/24696715/regex-for-match-parentheses-in-python
parentheticals = r'(?<=\().*?(?=\))'

#Work on a copy holding only the score, label and text of each paragraph
citation_df = paragraphs_df[['Score','Score_ID_Paragraph', 'Text']].copy()

#One list of matches per paragraph, plus the length of each list
parenthetical_matches = [re.findall(parentheticals, text) for text in citation_df['Text']]
parenthetical_counts = [len(matches) for matches in parenthetical_matches]

#Attach the matches and their counts as new columns
citation_df["Parentheticals"] = parenthetical_matches
citation_df['Parenthetical_Counts'] = parenthetical_counts

citation_df


In [None]:
#Keep only the paragraphs that contain at least one parenthetical
has_parenthetical = citation_df['Parenthetical_Counts'] > 0
citation_df_no_blanks = citation_df[has_parenthetical]
citation_df_no_blanks

In [None]:
#Copy just the score/id/paragraph label and the text of each citation paragraph
citation_paras = citation_df_no_blanks.loc[:, ['Score_ID_Paragraph', 'Text']].copy()
citation_paras

In [None]:
#Download dataframe with scores and citation paragraphs
#First write the dataframe to citation_paras.csv, then download that file
citation_paras.to_csv('citation_paras.csv') 
files.download('citation_paras.csv')

In [None]:
#Download zip file with rhetorical terminology paragraphs named by score
#Add each text to a new list called paragraphs
citation_paragraphs = []
for row in citation_paras['Text'].items():
    row_string = (str(row[1]))
    citation_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in citation_paras['Score_ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir citation_paragraphs

#Write texts to files
n = 0
for item in citation_paragraphs:
  f = open("citation_paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r citation_paragraphs.zip citation_paragraphs

#Download file to zip folder to run through DocuScope
files.download('citation_paragraphs.zip')


## Regression Analyses with Rhetorical and Citation Terms

In [None]:
#Bring down rhetorical paras dataframe with term counts
#Work on a copy so the columns added for the regressions do not alter rhetorical_keywords_paragraphs_df
rhet_calculations_df = rhetorical_keywords_paragraphs_df.copy()
rhet_calculations_df.head()

In [None]:
##Add term counts from Voyant
#Count number of times "feel", "understand", "element" and "rhetorical" appear in each paragraph
feel_counts = rhet_calculations_df['Text'].str.count('feel')
understand_counts = rhet_calculations_df['Text'].str.count('understand')
element_counts = rhet_calculations_df['Text'].str.count('element')
rhetorical_counts = rhet_calculations_df['Text'].str.count('rhetorical')


#Append each count to the dataframe, each column receiving the count of its own term
rhet_calculations_df['Feel_Counts'] = feel_counts
rhet_calculations_df["Understand_Counts"] = understand_counts
rhet_calculations_df["Element_Counts"] = element_counts
rhet_calculations_df["Rhetorical_Counts"] = rhetorical_counts


#Get sum of all term usages (the original rhetorical terms plus the four Voyant terms)
rhetorical_terms = ['Pathos_Counts', 'Ethos_Counts', 'Logos_Counts', 'Audience_Counts', 'Context_Counts', 'Purpose_Counts', 'Author_Counts', 'Exigency_Counts', 'Appeal_Counts', 'Feel_Counts', 'Understand_Counts', 'Element_Counts', 'Rhetorical_Counts']
rhet_calculations_df['Sum_Terms'] = rhet_calculations_df[rhetorical_terms].sum(axis=1)

rhet_calculations_df.head()

In [None]:
#Chart the Sum_Terms values of the rhetorical paragraphs, binned by score
import plotly.express as px

fig = px.histogram(
    rhet_calculations_df,
    x="Score",
    y='Sum_Terms',
    barmode='group',
)
chart_title = 'Usage of Rhetorical Terms in Each Paragraph'
fig.update_layout(title_text=chart_title)
fig.show()

In [None]:
#Test whether the total number of rhetorical terms in a paragraph tracks the paper score
#Author's note on the original data: r = .08, i.e. little relationship between the amount of
#rhetorical terms used and grade (at least between A and B range essays)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

#Independent variable (x) is the score, dependent variable (y) is the summed term count
score_values = rhet_calculations_df['Score']
term_totals = rhet_calculations_df['Sum_Terms']
x = np.array(score_values).astype(float)
y = np.array(term_totals).astype(float)

#Fit once; the result unpacks to the five key values and also exposes them as attributes
result = stats.linregress(x, y)
slope, intercept, r, p, std_err = result

print(f"R-squared for Number of Rhetorical Terms Used in Each Paragraph: {result.rvalue**2:}")

#Scatter the observations and overlay the fitted line
#NOTE(review): the fitted line gives the expected term count for a score, although its legend label reads 'Predicted Score'
fitted_values = result.intercept + result.slope*x
plt.plot(x, y, 'o', label='Student Essay Data', color = 'b')
plt.plot(x, fitted_values, 'r', label='Predicted Score')
plt.xlabel("Paper Score")
plt.ylabel("Count of Rhetorical Terms Used")
plt.legend()
plt.show()


In [None]:
#Bring down citation paras dataframe with parenthetical counts
#Work on a copy so the columns added for the regressions do not alter citation_df_no_blanks
citation_calculations_df = citation_df_no_blanks.copy()
citation_calculations_df.head()

In [None]:
#Test whether the number of parenthetical citations in a paragraph tracks the paper score
#NOTE(review): the r = .08 finding noted elsewhere in this notebook describes rhetorical terms; record this regression's own result here
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

#Independent variable (x) is the score, dependent variable (y) is the parenthetical count
score_values = citation_calculations_df['Score']
citation_totals = citation_calculations_df['Parenthetical_Counts']
x = np.array(score_values).astype(float)
y = np.array(citation_totals).astype(float)

#Fit once; the result unpacks to the five key values and also exposes them as attributes
result = stats.linregress(x, y)
slope, intercept, r, p, std_err = result

print(f"R-squared for Number of Parenthetical Citations Used in Each Paragraph: {result.rvalue**2:}")

#Scatter the observations and overlay the fitted line
fitted_values = result.intercept + result.slope*x
plt.plot(x, y, 'o', label='Student Essay Data', color = 'g')
plt.plot(x, fitted_values, 'r', label='Predicted Score')
plt.xlabel("Paper Score")
plt.ylabel("Count of Parenthetical Citations Used")
plt.legend()
plt.show()


In [None]:
#Count occurrences of the substrings "article" and "articles" in each citation paragraph
#Because "article" also matches inside "articles", Article_Counts includes every plural occurrence
citation_texts = citation_calculations_df['Text']
article_counts = citation_texts.str.count('article')
articles_counts = citation_texts.str.count('articles')

#Store both counts as new columns and display the dataframe
citation_calculations_df['Article_Counts'] = article_counts
citation_calculations_df["Articles_Counts"] = articles_counts
citation_calculations_df

In [None]:
#Chart the "articles" and "article" counts of the citation paragraphs side by side, binned by score
import plotly.express as px

fig = px.histogram(
    citation_calculations_df,
    x="Score",
    y=["Articles_Counts", "Article_Counts"],
    barmode='group',
)
chart_title = 'Usage of "Article" and "Articles" in Each Paragraph'
fig.update_layout(title_text=chart_title)
fig.show()

In [None]:
#Test whether the number of times "articles" appears in a paragraph tracks the paper score
#NOTE(review): the r = .08 finding noted elsewhere in this notebook describes rhetorical terms; record this regression's own result here
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

#Independent variable (x) is the score, dependent variable (y) is the "articles" count
score_values = citation_calculations_df['Score']
articles_totals = citation_calculations_df['Articles_Counts']
x = np.array(score_values).astype(float)
y = np.array(articles_totals).astype(float)

#Fit once; the result unpacks to the five key values and also exposes them as attributes
result = stats.linregress(x, y)
slope, intercept, r, p, std_err = result

print(f"R-squared for Number of Times 'Articles' Is Used in Each Paragraph: {result.rvalue**2:}")

#Scatter the observations and overlay the fitted line
fitted_values = result.intercept + result.slope*x
plt.plot(x, y, 'o', label='Student Essay Data', color = 'y')
plt.plot(x, fitted_values, 'r', label='Predicted Score')
plt.xlabel("Paper Score")
plt.ylabel("Count of Term 'Articles' Used")
plt.legend()
plt.show()


## Calculate DocuScope LAT Frequencies

In [None]:
#Upload citation paragraphs data from DocuScope (read as 'CLUSTER_N_citation_paragraphs 2.csv' in the following cell)
#The value returned by the upload call is kept in `uploaded`
uploaded = files.upload()

In [None]:
#Load the DocuScope csv for the citation paragraphs into a dataframe
citation_docuscope_data = pd.read_csv('CLUSTER_N_citation_paragraphs 2.csv')

#Split each Filename on "_" into six pieces so the score can be used on its own
filename_part_names = ['ScoreTitle','Score', 'IDTitle', 'ID', 'ParagraphTitle', 'Paragraph']
filename_parts = citation_docuscope_data.Filename.str.split("_",expand=True)
citation_docuscope_data[filename_part_names] = filename_parts

#Strip any leading 'S', 'c', 'o', 'r', 'e', ':' or space characters from the score piece
citation_docuscope_data['Score'] = citation_docuscope_data['Score'].map(lambda x: x.lstrip('Score: '))

#Discard every filename piece except the score
unused_parts = ['ScoreTitle','IDTitle', 'ID', 'ParagraphTitle', 'Paragraph']
citation_docuscope_data.drop(columns=unused_parts, inplace=True)

citation_docuscope_data

In [None]:
#Get average of each numeric column, rounded to 5 decimals
#numeric_only=True skips the text columns (Filename, Score), which pandas 2.x refuses to average
#NOTE(review): iloc[2:] drops the first two averaged columns - presumably non-LAT summary columns; confirm against the DocuScope export
citation_docuscope_avgs = (
    citation_docuscope_data.mean(numeric_only=True)
    .round(5)
    .to_frame()
    .iloc[2:]
    .reset_index()
    .rename(columns={'index': "LATs", 0: "Average"})
    #Sort from most to least frequent LATs
    .sort_values(by=['Average'], ascending=False)
)
citation_docuscope_avgs.head(20)

In [None]:
#Chart five LAT columns of the citation paragraphs side by side, binned by score
#The five columns are hard-coded; NOTE(review): confirm they still match the top of the averages table
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go
import plotly.express as px

lat_columns = ["InformationGeneral", "Narrative", "Negative", "Description", "InformationExposition"]
fig = px.histogram(
    citation_docuscope_data,
    x="Score",
    y=lat_columns,
    barmode='group',
)
chart_title = 'Counts of Five Most Frequent LATs (on Average) in Each Citation Paragraph'
fig.update_layout(title_text=chart_title)
fig.show()

In [None]:
#Test whether the AcademicTerms value of a citation paragraph tracks the paper score
#NOTE(review): the r = .08 finding noted elsewhere in this notebook describes rhetorical terms; record this regression's own result here
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

#Independent variable (x) is the score, dependent variable (y) is the AcademicTerms column
score_values = citation_docuscope_data['Score']
academic_term_values = citation_docuscope_data['AcademicTerms']
x = np.array(score_values).astype(float)
y = np.array(academic_term_values).astype(float)

#Fit once; the result unpacks to the five key values and also exposes them as attributes
result = stats.linregress(x, y)
slope, intercept, r, p, std_err = result

print(f"R-squared for Number of Academic Terms in Each Citation Paragraph: {result.rvalue**2:}")

#Scatter the observations and overlay the fitted line
fitted_values = result.intercept + result.slope*x
plt.plot(x, y, 'o', label='Student Essay Data', color = 'g')
plt.plot(x, fitted_values, 'r', label='Predicted Score')
plt.xlabel("Paper Score")
plt.ylabel("Academic Term Use Count")
plt.legend()
plt.show()


In [None]:
#Upload rhetorical paragraph data from DocuScope (read as 'CLUSTER_N_rhetorical_paragraphs.csv' in the following cell)
#The value returned by the upload call is kept in `uploaded`
uploaded = files.upload()

In [None]:
#Load the DocuScope csv for the rhetorical paragraphs into a dataframe
rhetorical_docuscope_data = pd.read_csv('CLUSTER_N_rhetorical_paragraphs.csv')

#Split each Filename on "_" into six pieces so the score can be used on its own
filename_part_names = ['ScoreTitle','Score', 'IDTitle', 'ID', 'ParagraphTitle', 'Paragraph']
filename_parts = rhetorical_docuscope_data.Filename.str.split("_",expand=True)
rhetorical_docuscope_data[filename_part_names] = filename_parts

#Strip any leading 'S', 'c', 'o', 'r', 'e', ':' or space characters from the score piece
rhetorical_docuscope_data['Score'] = rhetorical_docuscope_data['Score'].map(lambda x: x.lstrip('Score: '))

#Discard every filename piece except the score
unused_parts = ['ScoreTitle','IDTitle', 'ID', 'ParagraphTitle', 'Paragraph']
rhetorical_docuscope_data.drop(columns=unused_parts, inplace=True)

rhetorical_docuscope_data

In [None]:
#Get average of each numeric column, rounded to 5 decimals
#numeric_only=True skips the text columns (Filename, Score), which pandas 2.x refuses to average
#NOTE(review): iloc[2:] drops the first two averaged columns - presumably non-LAT summary columns; confirm against the DocuScope export
rhetorical_docuscope_avgs = (
    rhetorical_docuscope_data.mean(numeric_only=True)
    .round(5)
    .to_frame()
    .iloc[2:]
    .reset_index()
    .rename(columns={'index': "LATs", 0: "Average"})
    #Sort from most to least frequent LATs
    .sort_values(by=['Average'], ascending=False)
)
rhetorical_docuscope_avgs.head(20)

In [None]:
#Chart five LAT columns of the rhetorical paragraphs side by side, binned by score
#The five columns are hard-coded; NOTE(review): confirm they still match the top of the averages table
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go
import plotly.express as px

lat_columns = ["InformationGeneral", "Narrative", "Negative", "Description", "InformationExposition"]
fig = px.histogram(
    rhetorical_docuscope_data,
    x="Score",
    y=lat_columns,
    barmode='group',
)
chart_title = 'Counts of Five Most Frequent LATs (on Average) in Each Rhetorical Paragraph'
fig.update_layout(title_text=chart_title)
fig.show()

In [None]:
#Average each LAT per score value (citations); despite the sum_ prefix these frames hold means
#numeric_only=True skips the text Filename column, which pandas 2.x refuses to average
sum_citations = citation_docuscope_data.groupby("Score").mean(numeric_only=True)
sum_citations.reset_index(inplace=True)

#Average each LAT per score value (rhetorical analysis)
sum_rhetorical = rhetorical_docuscope_data.groupby("Score").mean(numeric_only=True)
sum_rhetorical.reset_index(inplace=True)

#Label each frame with its paragraph type, then combine the two into one dataframe
sum_citations['Paragraph_Type'] = 'Citation'
sum_rhetorical['Paragraph_Type'] = 'Rhetorical_Analysis'

frames = [sum_citations, sum_rhetorical]

result = pd.concat(frames)

#Chart average use of language type across both paragraph types
import plotly.express as px

fig = px.line(result, x="Score", y="AcademicTerms", color='Paragraph_Type')
fig.update_layout(title_text='Average Use of Academic Terms Language Across Paragraphs')
fig.show()

In [None]:
#Average LATs per score value (citations)
#Uses the citation DocuScope dataframe loaded above; no dataframe named docuscope_data is defined in this notebook
#numeric_only=True skips the text Filename column, which pandas 2.x refuses to average
avg_citations = citation_docuscope_data.groupby("Score").mean(numeric_only=True)
avg_citations.reset_index(inplace=True)

#Average LATs per score value (rhetorical analysis)
#Uses the rhetorical DocuScope dataframe loaded above; no dataframe named docuscope_data2 is defined in this notebook
avg_rhetorical = rhetorical_docuscope_data.groupby("Score").mean(numeric_only=True)
avg_rhetorical.reset_index(inplace=True)

#Label each frame with its move, then combine the two into one dataframe
avg_citations['Move'] = 'Citation'
avg_rhetorical['Move'] = 'Rhetorical_Analysis'

frames = [avg_citations, avg_rhetorical]

result = pd.concat(frames)

#Chart average use of the "Citation" LAT column across the two paragraph types
#NOTE(review): assumes the DocuScope export contains a column named "Citation" - confirm
import plotly.express as px

fig = px.line(result, x="Score", y="Citation", color='Move')
fig.update_layout(title_text='Counts of Citation Language (on Average) in Each Paragraph')
fig.show()

In [None]:
#Tag each averages frame with the move it represents
for averages_frame, move_label in ((avg_citations, 'Citation'), (avg_rhetorical, 'Rhetorical_Analysis')):
    averages_frame['Move'] = move_label

#Stack the two tagged frames into one dataframe and display it
frames = [avg_citations, avg_rhetorical]
result = pd.concat(frames)
result

In [None]:
#Chart average use of the AcademicTerms LAT per score across the two paragraph types
import plotly.express as px

fig = px.line(result, x="Score", y="AcademicTerms", color='Move')
#The title names the column that is actually plotted (AcademicTerms)
#NOTE(review): if Metadiscourse was the intended measure, change y instead of the title
fig.update_layout(title_text='Counts of Academic Terms Language (on Average) in Each Paragraph')
fig.show()

In [None]:
#Chart the AcademicTerms value of every citation and rhetorical paragraph against its score
#Create bar graph
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

#One trace per paragraph type, read from the two DocuScope dataframes loaded above
#(no dataframes named docuscope_data / docuscope_data2 are defined in this notebook);
#distinct trace names keep the two series distinguishable in the legend
fig = go.Figure(data=[
    go.Bar(name='Citation Paragraphs', x=citation_docuscope_data["Score"], y=citation_docuscope_data["AcademicTerms"]),
    go.Bar(name='Rhetorical Analysis Paragraphs', x=rhetorical_docuscope_data["Score"], y=rhetorical_docuscope_data["AcademicTerms"]),

])

#Set the chart title (covers both paragraph types shown)
fig.update_layout(title_text='Counts of Academic Term Language in Each Citation and Rhetorical Analysis Paragraph')
fig.show()