<a href="https://colab.research.google.com/github/mkane968/Text-Mining-with-Student-Papers/blob/main/Text_Mining_Student_Papers_12_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Text Mining Student Papers: A Computational Exploration


## 1. Install Packages

In [7]:
#Mount Google Drive
from google.colab import drive
from google.colab import files

#Install glob
import glob 

#Install pandas
import pandas as pd

#Install numpy
import numpy as np

#Imports the Natural Language Toolkit, which is necessary to install NLTK packages and libraries
#!pip install nltk
import nltk

#Installs libraries and packages to tokenize text
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from  nltk.text import ConcordanceIndex

#Installs libraries and packages to clean text
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#Import matplotlib for visualizations
import matplotlib.pyplot as plt


#Imports spaCy itself, necessary to use features 
#!pip install spaCy
import spacy
#Load the natural language processing pipeline
nlp = spacy.load("en_core_web_sm")
#Load spaCy visualizer
from spacy import displacy

from scipy import stats

import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Import Student Essays and Metadata

###Import Student Essays and Add to DataFrame

In [8]:
#Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Add files to upload from local machine
uploaded = files.upload()

In [10]:
#Put essays into dataframe
essays = pd.DataFrame.from_dict(uploaded, orient='index')

#Reset index and add column names to make wrangling easier
essays = essays.reset_index()
essays.columns = ["ID", "Text"]

#Remove encoding characters from Text column (b'\xef\xbb\xbf)
essays['Text'] = essays['Text'].apply(lambda x: x.decode('utf-8'))

#Remove newline characters and put in new column 
essays['Text_Newlines'] = essays['Text']
essays['Text'] = essays['Text'].str.replace(r'\s+|\\r', ' ', regex=True) 
essays['Text'] = essays['Text'].str.replace(r'\s+|\\n', ' ', regex=True) 
essays.head()

Unnamed: 0,ID,Text,Text_Newlines
0,zhangnicky_194583_13865387_FINAL PORTFOLIO 080...,Nicky Zhang Professor Stefan Analysis Reading ...,Nicky Zhang\nProfessor Stefan \nAnalysis Readi...
1,zengyongmei_157660_11534127_Portfolio.txt,Yongmei Zeng Sara Grace Stefan Analytical Read...,Yongmei Zeng\nSara Grace Stefan\nAnalytical Re...
2,zavorskipauld_142981_11496625_ARaW.txt,Paul Zavorski Professor Megan Kane English 080...,Paul Zavorski\nProfessor Megan Kane\nEnglish 0...
3,yuknekkathryn_189403_13865230_Final Portfolio ...,Yuknek 1 Kathryn Yuknek Professor Kane ENG 802...,Yuknek 1\n\n\nKathryn Yuknek\n\n\nProfessor Ka...
4,yellaniashrita_185528_11531910_Final Reflectio...,Ashrita Yellani Professor Kane English 0802 De...,Ashrita Yellani\nProfessor Kane\nEnglish 0802\...


### Add column without identifying information from each paper ID (instructor/student names) 

In [11]:
#Remove identifying information from ID
#Remove any occurences of "LATE_" from dataset (otherwise will skew ID cleaning)
essays['ID'] = essays['ID'].str.replace(r'LATE_', '', regex=True) 

#Split book on first underscore (_) in ID, keep only text in between first and second underscore (ID number)
start = essays["ID"].str.split("_", expand = True)
essays['ID'] = start[1]
essays['ID'] = essays['ID'].astype(int)
essays

Unnamed: 0,ID,Text,Text_Newlines
0,194583,Nicky Zhang Professor Stefan Analysis Reading ...,Nicky Zhang\nProfessor Stefan \nAnalysis Readi...
1,157660,Yongmei Zeng Sara Grace Stefan Analytical Read...,Yongmei Zeng\nSara Grace Stefan\nAnalytical Re...
2,142981,Paul Zavorski Professor Megan Kane English 080...,Paul Zavorski\nProfessor Megan Kane\nEnglish 0...
3,189403,Yuknek 1 Kathryn Yuknek Professor Kane ENG 802...,Yuknek 1\n\n\nKathryn Yuknek\n\n\nProfessor Ka...
4,185528,Ashrita Yellani Professor Kane English 0802 De...,Ashrita Yellani\nProfessor Kane\nEnglish 0802\...
...,...,...,...
142,195145,Olivia Bedell Professor Megan Kane ENG 802 14 ...,Olivia Bedell\nProfessor Megan Kane\nENG 802\n...
143,181150,"Michael Arena Professor stefan English 0802, S...",Michael Arena \nProfessor stefan\nEnglish 0802...
144,191168,Imani Alleyne SaraGrace H Stefan English 802 5...,Imani Alleyne\n\nSaraGrace H Stefan\n\nEnglish...
145,232002,Raven Ahenkora Professor Megan Kane English 08...,Raven Ahenkora\nProfessor Megan Kane\nEnglish ...


In [12]:
len(essays)

147

### Import grades and additional metadata to second dataframe





In [13]:
#Upload csvs with essay metadata
uploaded_grades = files.upload()

Saving 2022-12-03T2146_Grades-LA-ENG-0802-010-4683-202103.csv to 2022-12-03T2146_Grades-LA-ENG-0802-010-4683-202103.csv
Saving 2022-12-03T2145_Grades-LA-ENG-0802-039-3375-202036.csv to 2022-12-03T2145_Grades-LA-ENG-0802-039-3375-202036.csv
Saving 2022-11-28T1332_Grades-LA-ENG-0802-010-3350-202036.csv to 2022-11-28T1332_Grades-LA-ENG-0802-010-3350-202036.csv
Saving 2022-11-28T1331_Grades-LA-ENG-0802-012-3352-202136.csv to 2022-11-28T1331_Grades-LA-ENG-0802-012-3352-202136.csv
Saving 2022-11-28T1326_Grades-LA-ENG-0802-011-4684-202103.csv to 2022-11-28T1326_Grades-LA-ENG-0802-011-4684-202103.csv
Saving 2022-09-13T0945_Grades-LA-ENG-0802-062-37264-202203.csv to 2022-09-13T0945_Grades-LA-ENG-0802-062-37264-202203.csv
Saving 2022-09-13T0943_Grades-LA-ENG-0802-711-10742-202220.csv to 2022-09-13T0943_Grades-LA-ENG-0802-711-10742-202220.csv


In [14]:
#Link to path where csv files are stored in drive
local_path = r'/content'

#Create variable to store all csvs in path
filenames = glob.glob(local_path + "/*.csv")

#Create df list for all csvs
dfs = [pd.read_csv(filename) for filename in filenames]

len(filenames)

7

In [15]:
# Concatenate all data into one DataFrame
metadata = pd.concat(dfs, ignore_index=True)

#Change data to string (for further cleaning)
metadata.astype(str)

metadata.head()

Unnamed: 0,Student,ID,SIS User ID,SIS Login ID,Integration ID,Section,Final Portfolio (1313717),Assignments Current Score,Assignments Unposted Current Score,Assignments Final Score,...,Final Portfolio (Score),Final Portfolio (1676963),Attendance and Weekly Assignments Current Score,Attendance and Weekly Assignments Unposted Current Score,Attendance and Weekly Assignments Final Score,Attendance and Weekly Assignments Unposted Final Score,Portfolio Preparation Current Score,Portfolio Preparation Unposted Current Score,Portfolio Preparation Final Score,Portfolio Preparation Unposted Final Score
0,Points Possible,,,,,,100.0,(read only),(read only),(read only),...,,,,,,,,,,
1,"Bedell, Olivia",195145.0,tul65082,tul65082,915858403.0,Section: 012,89.0,,,0,...,,,,,,,,,,
2,"Caniglia, Dominick",190318.0,tul57468,tul57468,915850582.0,Section: 012,87.0,,,0,...,,,,,,,,,,
3,"Dugan, Connor",194255.0,tul08747,tul08747,915801444.0,Section: 012,88.0,,,0,...,,,,,,,,,,
4,"Fallt, Sela",216363.0,tun33888,tun33888,915908778.0,Section: 012,86.0,,,0,...,,,,,,,,,,


In [16]:
#Drop header rows(Points Possible) and test student rows (Student, Test)
metadata = metadata[metadata['Student'].str.contains('Points Possible|Student, Test')==False]
metadata.head()

Unnamed: 0,Student,ID,SIS User ID,SIS Login ID,Integration ID,Section,Final Portfolio (1313717),Assignments Current Score,Assignments Unposted Current Score,Assignments Final Score,...,Final Portfolio (Score),Final Portfolio (1676963),Attendance and Weekly Assignments Current Score,Attendance and Weekly Assignments Unposted Current Score,Attendance and Weekly Assignments Final Score,Attendance and Weekly Assignments Unposted Final Score,Portfolio Preparation Current Score,Portfolio Preparation Unposted Current Score,Portfolio Preparation Final Score,Portfolio Preparation Unposted Final Score
1,"Bedell, Olivia",195145.0,tul65082,tul65082,915858403.0,Section: 012,89.0,,,0,...,,,,,,,,,,
2,"Caniglia, Dominick",190318.0,tul57468,tul57468,915850582.0,Section: 012,87.0,,,0,...,,,,,,,,,,
3,"Dugan, Connor",194255.0,tul08747,tul08747,915801444.0,Section: 012,88.0,,,0,...,,,,,,,,,,
4,"Fallt, Sela",216363.0,tun33888,tun33888,915908778.0,Section: 012,86.0,,,0,...,,,,,,,,,,
5,"Fritz, Jake",175469.0,tul58736,tul58736,915851930.0,Section: 012,92.0,,,0,...,,,,,,,,,,


In [17]:
#Keep only relevant metadata (ID, Section, Final Portfolio Scores)
clean_metadata = metadata[['ID'] + ['Section'] + list(metadata.loc[:, metadata.columns.str.startswith('Final Portfolio (')])]


#Change columns to float as needed (check with df.dtypes())
clean_metadata["Final Portfolio (1Score)"] = pd.to_numeric(clean_metadata["Final Portfolio (1Score)"], downcast="float")
clean_metadata["Final Portfolio (Score)"] = pd.to_numeric(clean_metadata["Final Portfolio (Score)"], downcast="float")
#Want other metadata? Check the columns
#Get all column names 
#for col in metadata.columns:
   # print(col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_metadata["Final Portfolio (1Score)"] = pd.to_numeric(clean_metadata["Final Portfolio (1Score)"], downcast="float")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_metadata["Final Portfolio (Score)"] = pd.to_numeric(clean_metadata["Final Portfolio (Score)"], downcast="float")


In [18]:
#Replace all NaN values with 0 
clean_metadata = clean_metadata.replace(np.nan, 0)
clean_metadata

Unnamed: 0,ID,Section,Final Portfolio (1313717),Final Portfolio (1Score),Final Portfolio (1059452),Final Portfolio (1689777),Final Portfolio (878160),Final Portfolio (Score),Final Portfolio (1676963)
1,195145.0,Section: 012,89.0,0.0,0.0,0.0,0.0,0.0,0.0
2,190318.0,Section: 012,87.0,0.0,0.0,0.0,0.0,0.0,0.0
3,194255.0,Section: 012,88.0,0.0,0.0,0.0,0.0,0.0,0.0
4,216363.0,Section: 012,86.0,0.0,0.0,0.0,0.0,0.0,0.0
5,175469.0,Section: 012,92.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
171,217053.0,Section: 062,0.0,0.0,0.0,0.0,0.0,0.0,90.0
172,192683.0,Section: 062,0.0,0.0,0.0,0.0,0.0,0.0,93.0
173,233407.0,Section: 062,0.0,0.0,0.0,0.0,0.0,0.0,88.0
174,226726.0,Section: 062,0.0,0.0,0.0,0.0,0.0,0.0,92.0


In [19]:
#Create new final portfolio column with all values
#Add values of each column together; values except correct grade will be zero
score_counts = clean_metadata.columns[2:]
clean_metadata['Portfolio_Score'] = clean_metadata[score_counts].sum(axis=1)
clean_metadata['Portfolio_Score']

1      89.0
2      87.0
3      88.0
4      86.0
5      92.0
       ... 
171    90.0
172    93.0
173    88.0
174    92.0
175    89.0
Name: Portfolio_Score, Length: 164, dtype: float64

In [20]:
#Drop grade columns for individual classes
clean_metadata = clean_metadata[['ID', 'Section', "Portfolio_Score"]]

#Round scores to nearest integer
clean_metadata.Portfolio_Score = clean_metadata.Portfolio_Score.round()
clean_metadata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,ID,Section,Portfolio_Score
1,195145.0,Section: 012,89.0
2,190318.0,Section: 012,87.0
3,194255.0,Section: 012,88.0
4,216363.0,Section: 012,86.0
5,175469.0,Section: 012,92.0
...,...,...,...
171,217053.0,Section: 062,90.0
172,192683.0,Section: 062,93.0
173,233407.0,Section: 062,88.0
174,226726.0,Section: 062,92.0


In [21]:
#Drop decimal from ID (inconsistent with ID in essay dataframe)
clean_metadata['ID'] = clean_metadata['ID'].astype(int)

#Check cleaned DF one more time
clean_metadata.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_metadata['ID'] = clean_metadata['ID'].astype(int)


Unnamed: 0,ID,Section,Portfolio_Score
1,195145,Section: 012,89.0
2,190318,Section: 012,87.0
3,194255,Section: 012,88.0
4,216363,Section: 012,86.0
5,175469,Section: 012,92.0


### Merge essays and grade metadata into one dataframe

In [22]:
#Merge metadata and cleaned essays into new dataframe
#Will only keep rows where both essay and metadata are present
essays_grades_master = clean_metadata.merge(essays,on='ID')

#Print dataframe
essays_grades_master

Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines
0,195145,Section: 012,89.0,Olivia Bedell Professor Megan Kane ENG 802 14 ...,Olivia Bedell\nProfessor Megan Kane\nENG 802\n...
1,190318,Section: 012,87.0,Dominick Caniglia Professor Kane Analytical Re...,Dominick Caniglia\nProfessor Kane\nAnalytical ...
2,194255,Section: 012,88.0,Connor Dugan Professor Megan Kane ENG 802 6 De...,Connor Dugan \nProfessor Megan Kane\nENG 802\n...
3,216363,Section: 012,86.0,Fallt 1 Sela Fallt Professor Kane ENG-0802 12-...,Fallt 1\n\nSela Fallt\n\nProfessor Kane\n\nENG...
4,175469,Section: 012,92.0,Fritz 1 Jake Fritz Professor Megan Kane ENG-08...,Fritz 1\n\n\n\nJake Fritz\n\n\nProfessor Megan...
...,...,...,...,...,...
142,217053,Section: 062,90.0,"Marion Schroder Apr 25, 2022 Professor Megan K...","Marion Schroder\nApr 25, 2022\n\nProfessor Meg..."
143,192683,Section: 062,93.0,Siah Thach Professor Megan Kane ENG 802 25 Apr...,Siah Thach\nProfessor Megan Kane\nENG 802\n25 ...
144,233407,Section: 062,88.0,Metshet Tilahun Professor Kane ENG 0802 April ...,Metshet Tilahun\nProfessor Kane\nENG 0802\nApr...
145,226726,Section: 062,92.0,Sydney Tomman Professor Megan Kane ENG 802 25 ...,Sydney Tomman \nProfessor Megan Kane\nENG 802\...


In [23]:
#Sort dataframe by grades
essays_grades_master.sort_values(by=['Portfolio_Score'], inplace = True)
essays_grades_master.head()

Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines
117,185594,Section: 039,41.0,Vivek Chowdary Kotapati 12/07/2020. Dear Port...,...
113,151066,Section: 039,45.0,Temple University Final Writing Portfolio Emme...,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...
116,195858,Section: 039,47.0,"Sophie Jung December 7, 2020 SaraGrace Stefan...",\n\nSophie Jung ...
107,192375,Section: 039,57.0,Olivia Davino Professor SaraGrace Stefan Analy...,Olivia Davino\nProfessor SaraGrace Stefan\nAna...
48,199773,Section: 011,66.0,Liam A. Hart Megan Kane Analytical Reading and...,Liam A. Hart\nMegan Kane\nAnalytical Reading a...


In [None]:
#Save new df to csv and download
essays_grades_master.to_csv('essays_grades_master.csv') 
files.download('essays_grades_master.csv')

In [None]:
#Save and download de-identified essays for future analysis
#Add each text to a new list called paragraph_context
deidentified_texts = []
for row in essays_grades_master['Text'].items():
    row_string = (str(row[1]))
    deidentified_texts.append(row_string)

#Add filenames to list
filenames = []
for row in essays_grades_master['ID'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir deidentified_texts

#Write texts to files
n = 0
for item in deidentified_texts:
  f = open("deidentified_texts/" + filenames[n] + '.txt','w')
  n= n+1
  f.write(item)
  f.close()

#Zip text files in folder
!zip -r deidentified_texts.zip deidentified_texts

#Download file to zip folder to run through DocuScope
files.download('deidentified_texts.zip')

## 3. Clean Data

### Basic Cleaning with NLTK
####Lowercasing, Punctuation Removal, and Stopword Removal

In [25]:
#Rename dataframe
clean_essay_grades_df = essays_grades_master
clean_essay_grades_df.rename(columns = {"Text_NoHeaders": "Text"}, inplace = True)

#Lowercase all words
clean_essay_grades_df['Lower_Text'] = clean_essay_grades_df['Text'].str.lower()

#Remove punctuation and replace with no space (except periods and hyphens)
clean_essay_grades_df['NoPunct_Text'] = clean_essay_grades_df['Lower_Text'].str.replace(r'[^\w\-\.\'\s]+', '', regex = True)

#Remove periods and replace with space (to prevent incorrect compounds)
clean_essay_grades_df['NoPunct_Text'] = clean_essay_grades_df['NoPunct_Text'].str.replace(r'[^\w\-\'\s]+', ' ', regex = True)

#Remove stopwords
stop_words = set(stopwords.words("english"))
clean_essay_grades_df['NoStops_Text'] = clean_essay_grades_df['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

#Check output
clean_essay_grades_df.head()

Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines,Lower_Text,NoPunct_Text,NoStops_Text
117,185594,Section: 039,41.0,Vivek Chowdary Kotapati 12/07/2020. Dear Port...,...,vivek chowdary kotapati 12/07/2020. dear port...,vivek chowdary kotapati 12072020 dear portfo...,Vivek Chowdary Kotapati 12/07/2020. Dear Portf...
113,151066,Section: 039,45.0,Temple University Final Writing Portfolio Emme...,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,temple university final writing portfolio emme...,temple university final writing portfolio emme...,Temple University Final Writing Portfolio Emme...
116,195858,Section: 039,47.0,"Sophie Jung December 7, 2020 SaraGrace Stefan...",\n\nSophie Jung ...,"sophie jung december 7, 2020 saragrace stefan...",sophie jung december 7 2020 saragrace stefan ...,"Sophie Jung December 7, 2020 SaraGrace Stefan ..."
107,192375,Section: 039,57.0,Olivia Davino Professor SaraGrace Stefan Analy...,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,olivia davino professor saragrace stefan analy...,olivia davino professor saragrace stefan analy...,Olivia Davino Professor SaraGrace Stefan Analy...
48,199773,Section: 011,66.0,Liam A. Hart Megan Kane Analytical Reading and...,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,liam a. hart megan kane analytical reading and...,liam a hart megan kane analytical reading and...,Liam A. Hart Megan Kane Analytical Reading Wri...


##4. Text Enrichment

Lemmatization, Part-of-Speech Tagging, and Named Entity Recognition with SpaCy

LAT Tagging with DocuScope

In [32]:
#Get lemmas
lemma_list = []

# Disable Dependency Parser, and NER since all we want is lemmatizer 
with nlp.disable_pipes('parser', 'ner'):
  #Iterate through each doc object and getlemma, append lemma to list
  for doc in nlp.pipe(clean_essay_grades_df.NoPunct_Text.astype('unicode').values, batch_size=100):
    word_list = []
    for token in doc:
        word_list.append(token.lemma_)
        
    lemma_list.append(word_list)

#Put lemmas in a new column in dataframe
clean_essay_grades_df['Lemma_Text'] = lemma_list
clean_essay_grades_df['Lemma_Text'] = [' '.join(map(str, l)) for l in clean_essay_grades_df['Lemma_Text']]

#Check lemmas
clean_essay_grades_df.head()


Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines,Lower_Text,NoPunct_Text,NoStops_Text,Lemma_Text
117,185594,Section: 039,41.0,Vivek Chowdary Kotapati 12/07/2020. Dear Port...,...,vivek chowdary kotapati 12/07/2020. dear port...,vivek chowdary kotapati 12072020 dear portfo...,Vivek Chowdary Kotapati 12/07/2020. Dear Portf...,vivek chowdary kotapati 12072020 dear port...
113,151066,Section: 039,45.0,Temple University Final Writing Portfolio Emme...,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,temple university final writing portfolio emme...,temple university final writing portfolio emme...,Temple University Final Writing Portfolio Emme...,temple university final writing portfolio emme...
116,195858,Section: 039,47.0,"Sophie Jung December 7, 2020 SaraGrace Stefan...",\n\nSophie Jung ...,"sophie jung december 7, 2020 saragrace stefan...",sophie jung december 7 2020 saragrace stefan ...,"Sophie Jung December 7, 2020 SaraGrace Stefan ...",sophie jung december 7 2020 saragrace stefan...
107,192375,Section: 039,57.0,Olivia Davino Professor SaraGrace Stefan Analy...,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,olivia davino professor saragrace stefan analy...,olivia davino professor saragrace stefan analy...,Olivia Davino Professor SaraGrace Stefan Analy...,olivia davino professor saragrace stefan analy...
48,199773,Section: 011,66.0,Liam A. Hart Megan Kane Analytical Reading and...,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,liam a. hart megan kane analytical reading and...,liam a hart megan kane analytical reading and...,Liam A. Hart Megan Kane Analytical Reading Wri...,liam a hart megan kane analytical reading an...


In [33]:
#Get part of speech tags
pos_list = []

# Disable Dependency Parser, and NER since all we want is POS 
with nlp.disable_pipes('parser', 'ner'):
  #Iterate through each doc object and tag POS, append POS to list
  for doc in nlp.pipe(clean_essay_grades_df.NoPunct_Text.astype('unicode').values, batch_size=100):
    word_list = []
    for token in doc:
        word_list.append(token.pos_)
        
    pos_list.append(word_list)

#Put POS in a new column in dataframe
clean_essay_grades_df['POS_Text'] = pos_list
clean_essay_grades_df['POS_Text'] = [' '.join(map(str, l)) for l in clean_essay_grades_df['POS_Text']]

#Check pos tags
clean_essay_grades_df.head()


Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines,Lower_Text,NoPunct_Text,NoStops_Text,Lemma_Text,POS_Text
117,185594,Section: 039,41.0,Vivek Chowdary Kotapati 12/07/2020. Dear Port...,...,vivek chowdary kotapati 12/07/2020. dear port...,vivek chowdary kotapati 12072020 dear portfo...,Vivek Chowdary Kotapati 12/07/2020. Dear Portf...,vivek chowdary kotapati 12072020 dear port...,SPACE PROPN PROPN PROPN NUM SPACE PROPN PROPN ...
113,151066,Section: 039,45.0,Temple University Final Writing Portfolio Emme...,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,temple university final writing portfolio emme...,temple university final writing portfolio emme...,Temple University Final Writing Portfolio Emme...,temple university final writing portfolio emme...,PROPN PROPN ADJ NOUN NOUN VERB NOUN ADJ NOUN C...
116,195858,Section: 039,47.0,"Sophie Jung December 7, 2020 SaraGrace Stefan...",\n\nSophie Jung ...,"sophie jung december 7, 2020 saragrace stefan...",sophie jung december 7 2020 saragrace stefan ...,"Sophie Jung December 7, 2020 SaraGrace Stefan ...",sophie jung december 7 2020 saragrace stefan...,SPACE PROPN PROPN PROPN NUM NUM NOUN PROPN NOU...
107,192375,Section: 039,57.0,Olivia Davino Professor SaraGrace Stefan Analy...,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,olivia davino professor saragrace stefan analy...,olivia davino professor saragrace stefan analy...,Olivia Davino Professor SaraGrace Stefan Analy...,olivia davino professor saragrace stefan analy...,PROPN PROPN NOUN VERB PROPN ADJ NOUN CCONJ VER...
48,199773,Section: 011,66.0,Liam A. Hart Megan Kane Analytical Reading and...,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,liam a. hart megan kane analytical reading and...,liam a hart megan kane analytical reading and...,Liam A. Hart Megan Kane Analytical Reading Wri...,liam a hart megan kane analytical reading an...,PROPN DET SPACE PROPN PROPN PROPN ADJ NOUN CCO...


In [34]:
#Get named entities
ent_list = []

with nlp.disable_pipes('tagger', 'parser'):
    for doc in nlp.pipe(clean_essay_grades_df.NoPunct_Text.astype('unicode').values, batch_size=100):
        ent_list.append(doc.ents)

#Put NEs in a new column in dataframe
clean_essay_grades_df['NER_Text'] = ent_list
clean_essay_grades_df['NER_Text'] = [' '.join(map(str, l)) for l in clean_essay_grades_df['NER_Text']]

#Check named entities
clean_essay_grades_df.head()




Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines,Lower_Text,NoPunct_Text,NoStops_Text,Lemma_Text,POS_Text,NER_Text
117,185594,Section: 039,41.0,Vivek Chowdary Kotapati 12/07/2020. Dear Port...,...,vivek chowdary kotapati 12/07/2020. dear port...,vivek chowdary kotapati 12072020 dear portfo...,Vivek Chowdary Kotapati 12/07/2020. Dear Portf...,vivek chowdary kotapati 12072020 dear port...,SPACE PROPN PROPN PROPN NUM SPACE PROPN PROPN ...,12072020 english 802 kotapati 0802 sec 039 15 ...
113,151066,Section: 039,45.0,Temple University Final Writing Portfolio Emme...,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,temple university final writing portfolio emme...,temple university final writing portfolio emme...,Temple University Final Writing Portfolio Emme...,temple university final writing portfolio emme...,PROPN PROPN ADJ NOUN NOUN VERB NOUN ADJ NOUN C...,section 39 7 december 2020 eng802 15 september...
116,195858,Section: 039,47.0,"Sophie Jung December 7, 2020 SaraGrace Stefan...",\n\nSophie Jung ...,"sophie jung december 7, 2020 saragrace stefan...",sophie jung december 7 2020 saragrace stefan ...,"Sophie Jung December 7, 2020 SaraGrace Stefan ...",sophie jung december 7 2020 saragrace stefan...,SPACE PROPN PROPN PROPN NUM NUM NOUN PROPN NOU...,december 7 2020 section 39 four three 10 at le...
107,192375,Section: 039,57.0,Olivia Davino Professor SaraGrace Stefan Analy...,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,olivia davino professor saragrace stefan analy...,olivia davino professor saragrace stefan analy...,Olivia Davino Professor SaraGrace Stefan Analy...,olivia davino professor saragrace stefan analy...,PROPN PROPN NOUN VERB PROPN ADJ NOUN CCONJ VER...,olivia davino 0802 15 september 2020 today dec...
48,199773,Section: 011,66.0,Liam A. Hart Megan Kane Analytical Reading and...,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,liam a. hart megan kane analytical reading and...,liam a hart megan kane analytical reading and...,Liam A. Hart Megan Kane Analytical Reading Wri...,liam a hart megan kane analytical reading an...,PROPN DET SPACE PROPN PROPN PROPN ADJ NOUN CCO...,kane 04262021 up to 6 only three three my fres...


In [None]:
#Download cleaned df
#clean_essay_grades_df.to_csv('essays_grades_enriched.csv') 
#files.download('essays_grades_enriched.csv')

Upload DocuScope LATs per text and append to dataframe based on ID

In [None]:
#Add files to upload from local machine
uploaded_dfs = files.upload()

In [None]:
#Upload each LAT csv into its own dataframe
import pandas as pd
list_of_dfs = []

for file in uploaded_dfs:
    df = pd.read_csv(file, header=None, engine='python', error_bad_lines=False)
    df = df.rename(columns={0: "Token", 1: "Clean_Token", 2: "Type", 3: "LAT_name", 4:"Count"})
    display(df.head())
    list_of_dfs.append(df)

In [75]:
#Append shortened name of LATs in each dataframe to master list
results_list = []

#Split lats in each dataframe
for df in list_of_dfs:
  df = df.drop(columns={"Token", "Clean_Token", "Type", "Count"})
  df = df['LAT_name'].str.extract('([A-Z][a-z]*)([A-Z][a-z]*)', expand=True)
  df.rename(columns={0 : "First", 1 : "Second"}, inplace = True)
  df['LATs'] = df['First']  + '' + df['Second'] 
  #display(df.head())
  #Append each column to list and then to list of lists
  l = df['LATs'].to_list()
  result = ' '.join(str(item) for item in l)
  result = result.replace('nan', '')
  results_list.append(result)


In [76]:
result_df = pd.DataFrame(results_list, columns = ['LATs'])
 
# print dataframe
result_df

Unnamed: 0,LATs
0,CharacterNames CharacterTypes CharacterNames ...
1,CharacterNames CharacterTypes AcademicDimensio...
2,CharacterTypes CharacterNames NarrativeDur...
3,DescriptObjects CharacterNames CharacterTypes ...
4,CharacterNames CharacterTypes CharacterNames ...
...,...
142,CharacterNames ReasonAnalyze NarrativeDuratio...
143,CharacterNames CharacterTypes CharacterNames ...
144,InformationTopics InformationTopics NarrativeS...
145,CharacterNames CharacterTypes CharacterNames ...


In [77]:
#Add column with IDs 
filenames = []
for name in uploaded_dfs.keys():
  filenames.append(name)
files = pd.Series(filenames)
result_df['ID'] = files.values

# shift column 'Range' to first position
first_column = result_df.pop('ID')
  
# insert column using insert(position,column_name,
# first_column) function
result_df.insert(0, 'ID', first_column)

#Remove _tokens.csv from file names 
result_df['ID'] = result_df['ID'].str.replace(r'_tokens.csv', '', regex=True)
result_df['ID'] = result_df['ID'].astype(int) 

In [78]:
#Merge LATs with enriched dataframe
enriched_essays_df = clean_essay_grades_df.merge(result_df,on='ID')
enriched_essays_df

Unnamed: 0,ID,Section,Portfolio_Score,Text,Text_Newlines,Lower_Text,NoPunct_Text,NoStops_Text,Lemma_Text,POS_Text,NER_Text,LATs
0,185594,Section: 039,41.0,Vivek Chowdary Kotapati 12/07/2020. Dear Port...,...,vivek chowdary kotapati 12/07/2020. dear port...,vivek chowdary kotapati 12072020 dear portfo...,Vivek Chowdary Kotapati 12/07/2020. Dear Portf...,vivek chowdary kotapati 12072020 dear port...,SPACE PROPN PROPN PROPN NUM SPACE PROPN PROPN ...,12072020 english 802 kotapati 0802 sec 039 15 ...,CharacterNames PositiveEmotion PublicCommi...
1,151066,Section: 039,45.0,Temple University Final Writing Portfolio Emme...,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,temple university final writing portfolio emme...,temple university final writing portfolio emme...,Temple University Final Writing Portfolio Emme...,temple university final writing portfolio emme...,PROPN PROPN ADJ NOUN NOUN VERB NOUN ADJ NOUN C...,section 39 7 december 2020 eng802 15 september...,InformationTopics InformationTopics NarrativeS...
2,195858,Section: 039,47.0,"Sophie Jung December 7, 2020 SaraGrace Stefan...",\n\nSophie Jung ...,"sophie jung december 7, 2020 saragrace stefan...",sophie jung december 7 2020 saragrace stefan ...,"Sophie Jung December 7, 2020 SaraGrace Stefan ...",sophie jung december 7 2020 saragrace stefan...,SPACE PROPN PROPN PROPN NUM NUM NOUN PROPN NOU...,december 7 2020 section 39 four three 10 at le...,CharacterNames CharacterNames NarrativeDuratio...
3,192375,Section: 039,57.0,Olivia Davino Professor SaraGrace Stefan Analy...,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,olivia davino professor saragrace stefan analy...,olivia davino professor saragrace stefan analy...,Olivia Davino Professor SaraGrace Stefan Analy...,olivia davino professor saragrace stefan analy...,PROPN PROPN NOUN VERB PROPN ADJ NOUN CCONJ VER...,olivia davino 0802 15 september 2020 today dec...,CharacterNames CharacterTypes CharacterNames...
4,199773,Section: 011,66.0,Liam A. Hart Megan Kane Analytical Reading and...,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,liam a. hart megan kane analytical reading and...,liam a hart megan kane analytical reading and...,Liam A. Hart Megan Kane Analytical Reading Wri...,liam a hart megan kane analytical reading an...,PROPN DET SPACE PROPN PROPN PROPN ADJ NOUN CCO...,kane 04262021 up to 6 only three three my fres...,CharacterNames OrphanedDimension OrphanedDimen...
...,...,...,...,...,...,...,...,...,...,...,...,...
142,194101,Section: 010,98.0,"Paul Kushnirsky Professor Stefan English 0802,...",Paul Kushnirsky\nProfessor Stefan \nEnglish 08...,"paul kushnirsky professor stefan english 0802,...",paul kushnirsky professor stefan english 0802 ...,"Paul Kushnirsky Professor Stefan English 0802,...",paul kushnirsky professor stefan english 0802 ...,PROPN PROPN PROPN PROPN PROPN PROPN NOUN NUM N...,paul kushnirsky stefan english 0802 section 01...,CharacterNames CharacterTypes CharacterNames ...
143,156676,Section: 010,98.0,Noah Palmer Professor Megan Kane ENG 082 4 Dec...,Noah Palmer\nProfessor Megan Kane\nENG 082\n4 ...,noah palmer professor megan kane eng 082 4 dec...,noah palmer professor megan kane eng 082 4 dec...,Noah Palmer Professor Megan Kane ENG 082 4 Dec...,noah palmer professor megan kane eng 082 4 dec...,PROPN PROPN PROPN PROPN PROPN PROPN NUM NUM PR...,megan kane 082 december 2020 english three fir...,DescriptObjects CharacterTypes CharacterTypes ...
144,190309,Section: 010,99.0,Emma Jensen Professor SaraGrace Stefan Analyti...,Emma Jensen\nProfessor SaraGrace Stefan\nAnaly...,emma jensen professor saragrace stefan analyti...,emma jensen professor saragrace stefan analyti...,Emma Jensen Professor SaraGrace Stefan Analyti...,emma jensen professor saragrace stefan analyti...,PROPN PROPN PROPN PROPN PROPN ADJ NOUN CCONJ V...,emma jensen saragrace stefan analytical sectio...,CharacterNames CharacterNames CharacterTypes ...
145,190146,Section: 010,99.0,Temple University Analytical Reading and Writ...,\n\nTemple University\n\n\n\nAnalytical Readin...,temple university analytical reading and writ...,temple university analytical reading and writ...,Temple University Analytical Reading Writing F...,temple university analytical reading and wri...,SPACE PROPN PROPN ADJ NOUN CCONJ VERB ADJ NOUN...,sarah 0802 10 april 26 section 10 06 april 202...,InformationTopics InformationTopics ReasonAnal...


## 5. Paragraph Segmentation

In [26]:
#We only need one newlines version here
paragraphs_df = clean_essay_grades_df[['Portfolio_Score','ID', 'Text_Newlines']].copy()

#Add ID and score in one column
paragraphs_df['Score_ID'] = 'Score: ' + paragraphs_df['Portfolio_Score'].astype(str) + ', ID: ' + paragraphs_df['ID'].astype(str)

#Check new df
paragraphs_df.head()


Unnamed: 0,Portfolio_Score,ID,Text_Newlines,Score_ID
117,41.0,185594,...,"Score: 41.0, ID: 185594"
113,45.0,151066,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,"Score: 45.0, ID: 151066"
116,47.0,195858,\n\nSophie Jung ...,"Score: 47.0, ID: 195858"
107,57.0,192375,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,"Score: 57.0, ID: 192375"
48,66.0,199773,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,"Score: 66.0, ID: 199773"


In [27]:
#Count number of paragraphs in each text
paragraph_counts = paragraphs_df['Text_Newlines'].str.count(r'\n')
paragraph_counts

#Append paragraphs counts to dataframe
paragraphs_df["Paragraph_Counts"] = paragraph_counts
paragraphs_df

Unnamed: 0,Portfolio_Score,ID,Text_Newlines,Score_ID,Paragraph_Counts
117,41.0,185594,...,"Score: 41.0, ID: 185594",117
113,45.0,151066,Temple University\n\n\n\n\n\n\n\n\n\nFinal Wri...,"Score: 45.0, ID: 151066",199
116,47.0,195858,\n\nSophie Jung ...,"Score: 47.0, ID: 195858",111
107,57.0,192375,Olivia Davino\nProfessor SaraGrace Stefan\nAna...,"Score: 57.0, ID: 192375",85
48,66.0,199773,Liam A. Hart\nMegan Kane\nAnalytical Reading a...,"Score: 66.0, ID: 199773",110
...,...,...,...,...,...
32,98.0,194101,Paul Kushnirsky\nProfessor Stefan \nEnglish 08...,"Score: 98.0, ID: 194101",164
92,98.0,156676,Noah Palmer\nProfessor Megan Kane\nENG 082\n4 ...,"Score: 98.0, ID: 156676",135
31,99.0,190309,Emma Jensen\nProfessor SaraGrace Stefan\nAnaly...,"Score: 99.0, ID: 190309",158
27,99.0,190146,\n\nTemple University\n\n\n\nAnalytical Readin...,"Score: 99.0, ID: 190146",149


In [28]:
#Make new cell each time new paragraph starts 
new = paragraphs_df["Text_Newlines"].str.split(r'\n', expand = True).set_index(paragraphs_df['Score_ID'])

#Flatten dataframe so each chapter is on own row, designated by book and chapter 
paragraphs_df = new.stack().reset_index()
paragraphs_df.columns = ["Score_ID", "Paragraph", "Text"]

#Split score and ID back to own columns
paragraphs_df[['Score','ID']] = paragraphs_df.Score_ID.str.split(", ",expand=True)
paragraphs_df['Score'] = paragraphs_df['Score'].map(lambda x: x.lstrip('Score: '))
paragraphs_df['ID'] = paragraphs_df['ID'].map(lambda x: x.lstrip('ID: '))
paragraphs_df['ID_Paragraph'] = paragraphs_df['ID'].astype(str) + '_' + paragraphs_df['Paragraph'].astype(str)
paragraphs_df

Unnamed: 0,Score_ID,Paragraph,Text,Score,ID,ID_Paragraph
0,"Score: 41.0, ID: 185594",0,...,41.0,185594,185594_0
1,"Score: 41.0, ID: 185594",1,...,41.0,185594,185594_1
2,"Score: 41.0, ID: 185594",2,"Dear Portfolio Committee,",41.0,185594,185594_2
3,"Score: 41.0, ID: 185594",3,I ...,41.0,185594,185594_3
4,"Score: 41.0, ID: 185594",4,,41.0,185594,185594_4
...,...,...,...,...,...,...
21624,"Score: 99.0, ID: 191777",51,"Simmons, Kimberly Eison. “Race and Racialized ...",99.0,191777,191777_51
21625,"Score: 99.0, ID: 191777",52,,99.0,191777,191777_52
21626,"Score: 99.0, ID: 191777",53,,99.0,191777,191777_53
21627,"Score: 99.0, ID: 191777",54,,99.0,191777,191777_54


In [29]:
##Clean paragraphs
##Filter out paragraphs with 5 or less words (headers)
paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.split().str.len().lt(10)]

## Filter out paragraphs containing "http://", "doi:" , "https://" and "://www" (Works Cited citations)
paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("http://")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("https://")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("://www")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("www.")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains(".com/")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("Vol.")]

paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains("doi:")]

paragraphs_df

Unnamed: 0,Score_ID,Paragraph,Text,Score,ID,ID_Paragraph
3,"Score: 41.0, ID: 185594",3,I ...,41.0,185594,185594_3
18,"Score: 41.0, ID: 185594",18,When will this come to an end? Ho...,41.0,185594,185594_18
20,"Score: 41.0, ID: 185594",20,Gay introduced his article with a...,41.0,185594,185594_20
22,"Score: 41.0, ID: 185594",22,“Any time you meet the cops and d...,41.0,185594,185594_22
24,"Score: 41.0, ID: 185594",24,"As a black, they are having a mark on their ow...",41.0,185594,185594_24
...,...,...,...,...,...,...
21614,"Score: 99.0, ID: 191777",41,"In terms of his adulthood, Kuti was exposed to...",99.0,191777,191777_41
21615,"Score: 99.0, ID: 191777",42,As Kuti and his malicious exposure of differen...,99.0,191777,191777_42
21616,"Score: 99.0, ID: 191777",43,\tThis next section will be detailing some of ...,99.0,191777,191777_43
21617,"Score: 99.0, ID: 191777",44,\tFela Kuti’s music and actions led to positiv...,99.0,191777,191777_44


In [30]:
#Save new df to csv and download to clean further
#paragraphs_df.to_csv('paragraphs.csv') 
#files.download('paragraphs.csv')

In [None]:
#Download each paragraph as a txt file
#Add each text to a new list called paragraphs
paragraphs = []
for row in paragraphs_df['Text'].items():
    row_string = (str(row[1]))
    paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in paragraphs_df['ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir paragraphs

#Write texts to files
n = 0
for item in paragraphs:
  f = open("paragraphs/" + filenames[n] +  '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r paragraphs.zip paragraphs

#Download file to zip folder to run through DocuScope
files.download('paragraphs.zip')

## 6. Identify Keywords in Context

### Outcome 1: Extracting Rhetorical Analysis Terms and Context

In [None]:
##Set up new dataframe for keyword frequency counts
rhetorical_keywords_paragraphs_df = paragraphs_df.copy()

#Count number of occurences of rhetorical terms in each paper
pathos_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('pathos')
ethos_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('ethos')
logos_counts = rhetorical_keywords_paragraphs_df['Text'].str.count('logos')

#Append each count to the dataframe
rhetorical_keywords_paragraphs_df['Pathos_Counts'] = pathos_counts
rhetorical_keywords_paragraphs_df["Ethos_Counts"] = ethos_counts
rhetorical_keywords_paragraphs_df["Logos_Counts"] = logos_counts

#Get summ of all term usages
rhetorical_terms = ['Pathos_Counts', 'Ethos_Counts', 'Logos_Counts']
rhetorical_keywords_paragraphs_df['Sum_Terms'] = rhetorical_keywords_paragraphs_df[rhetorical_terms].sum(axis=1)

#Split score and ID back to own columns
rhetorical_keywords_paragraphs_df[['Score','ID']] = rhetorical_keywords_paragraphs_df.Score_ID.str.split(", ",expand=True)
rhetorical_keywords_paragraphs_df['Score'] = rhetorical_keywords_paragraphs_df['Score'].map(lambda x: x.lstrip('Score: '))
rhetorical_keywords_paragraphs_df['ID'] = rhetorical_keywords_paragraphs_df['Score'].map(lambda x: x.lstrip('ID: '))

rhetorical_keywords_paragraphs_df

In [None]:
#Remove all rows with no rhetorical terms
rhetorical_keywords_paragraphs_df_no_blanks = rhetorical_keywords_paragraphs_df[rhetorical_keywords_paragraphs_df.Sum_Terms > 0]
rhetorical_keywords_paragraphs_df_no_blanks

In [None]:
#Save new df to csv and download
rhetorical_keywords_paragraphs_df_no_blanks.to_csv('rhetorical_keywords_paragraphs_df_no_blanks.csv') 
files.download('rhetorical_keywords_paragraphs_df_no_blanks.csv')

In [None]:
#Download each rhetorical analysis paragraph as a txt file
#Add each text to a new list called rhetorical_paragraphs
rhetorical_paragraphs = []
for row in rhetorical_keywords_paragraphs_df_no_blanks['Text'].items():
    row_string = (str(row[1]))
    rhetorical_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in rhetorical_keywords_paragraphs_df_no_blanks['ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir rhetorical_paragraphs

#Write texts to files
n = 0
for item in rhetorical_paragraphs:
  f = open("rhetorical_paragraphs/" + filenames[n] + '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r rhetorical_paragraphs.zip rhetorical_paragraphs

#Download file to zip folder to run through DocuScope
files.download('rhetorical_paragraphs.zip')

At the end of this text mining, we have two new data sets to analyze: 


*   `rhetorical_keywords_paragraphs_df_no_blanks.csv`: A CSV file containing each paragraph where rhetorical terminology was used, along with relevant metadata (can be used for close-reading, frequency and regression analysis, PCA)
*  `rhetorical_paragraphs.zip`: A zip file containing plain txt versions of each paragraph where rhetorical terminology was used (can be used for close-reading, DocuScope analysis, topic modeling, and/or other types of corpus analysis)


We can also go back and extract other terms, such as synonyms, which may aid later comparative analysis. 


In [None]:
##Set up new dataframe for synonym frequency counts in paragraphs
rhetorical_synonym_df = rhetorical_keywords_paragraphs_df_no_blanks.copy()

In [None]:
rhetorical_synonym_df['Pathos_Synonyms'] = rhetorical_synonym_df['Text'].str.count('experience|feel|stories|story|understand|compassion|passion|anecdote|sad|anger|sympathy|sympathetic|empathy|pity|fear*')
rhetorical_synonym_df['Logos_Synonyms'] = rhetorical_synonym_df['Text'].str.count('logic|logical|reason|reasoning|statistic|statistics|fact|facts|common sense|evidence')
rhetorical_synonym_df['Ethos_Synonyms'] = rhetorical_synonym_df['Text'].str.count('credible|credibility|authority|ethic|ethical|reliable|fair')
rhetorical_synonym_df['Rhetorical_Vocab'] = rhetorical_synonym_df['Text'].str.count('audience|reader|context|situation|rhetorical|element|device|appeal|effective|argue|argument')
rhetorical_synonym_df


In [None]:
#Get summ of all term usages
pathos_terms = ['Pathos_Counts', 'Pathos_Synonyms']
rhetorical_synonym_df['Sum_Pathos_Terms'] = rhetorical_synonym_df[pathos_terms].sum(axis=1)

logos_terms = ['Logos_Counts', 'Logos_Synonyms']
rhetorical_synonym_df['Sum_Logos_Terms'] = rhetorical_synonym_df[logos_terms].sum(axis=1)

ethos_terms = ['Ethos_Counts', 'Ethos_Synonyms']
rhetorical_synonym_df['Sum_Ethos_Terms'] = rhetorical_synonym_df[ethos_terms].sum(axis=1)
rhetorical_synonym_df


#Get sum of all term usages
all_terms = ['Sum_Pathos_Terms', 'Sum_Ethos_Terms', 'Sum_Ethos_Terms', 'Rhetorical_Vocab']
rhetorical_synonym_df['Sum_All_Terms'] = rhetorical_synonym_df[all_terms].sum(axis=1)
rhetorical_synonym_df

### Outcome 2: Extracting Citation Practices and Context



In [None]:
#Get any text inside parentheticals and count of parentheticals and append to dataframe
#https://stackoverflow.com/questions/24696715/regex-for-match-parentheses-in-python
parentheticals = r'(?<=\().*?(?=\))'

#Add new list for parenthetical citations
parenthetical_matches = []
parenthetical_counts = []

#Find all occurences of parenthetical citations in each paragraph of each text
citation_df = paragraphs_df.copy()
for text in citation_df['Text']:
  matches = re.findall(parentheticals, text)
  parenthetical_matches.append(matches)
  parenthetical_counts.append(len(matches))

#Make new column counting all appearances of parentheticals
citation_df["Parentheticals"] = parenthetical_matches
citation_df['Parenthetical_Counts'] = parenthetical_counts

citation_df


In [None]:
#Remove all rows with no parenthetical terms
citation_df_no_blanks = citation_df[citation_df.Parenthetical_Counts > 0]
citation_df_no_blanks

In [None]:
#Save new df to csv and download
citation_df.to_csv('citation_df_no_blanks.csv') 
files.download('citation_df_no_blanks.csv')

In [None]:
#Download each paragraph as a txt file
#Add each text to a new list called paragraphs
citation_paragraphs = []
for row in citation_df_no_blanks['Text'].items():
    row_string = (str(row[1]))
    citation_paragraphs.append(row_string)

#Add filenames to list
filenames = []
for row in citation_df_no_blanks['ID_Paragraph'].items():
    row_string = (str(row[1]))
    filenames.append(row_string)

filenames[1]

#Make new directory to store text files
!mkdir citation_paragraphs

#Write texts to files
n = 0
for item in citation_paragraphs:
  f = open("citation_paragraphs/" + filenames[n] + '.txt','w')
  n= n+1
  f.write(item)
  f.close()
  
#Zip text files in folder
!zip -r citation_paragraphs.zip citation_paragraphs

#Download file to zip folder to run through DocuScope
files.download('citation_paragraphs.zip')

At the end of this text mining, we have two new data sets to analyze: 

*   `citation_df.csv`: A CSV file containing each paragraph where rhetorical terminology was used, along with relevant metadata (can be used for close-reading, frequency and regression analysis, PCA)
*  `citation_paragraphs.zip`: A zip file containing plain txt versions of each paragraph where rhetorical terminology was used (can be used for close-reading, DocuScope analysis, topic modeling, and/or other types of corpus analysis)



## 7. Analyze Keywords in Context
This section uses frequency plots and regression analysis to determine whether rhetorical analysis term usage and/or citation practice usage are good indicators of score. 

### Rhetorical Terms Regression Analysis

In [None]:
#We need the metadata and text with newlines here; we'll also take the nostops text for further count analysis
rhetorical_keywords_df_full_texts = clean_essay_grades_df[['ID', 'Section', 'Portfolio_Score', 'Text_Newlines', 'NoStops_Text']].copy()

#Add ID and score in one column
rhetorical_keywords_df_full_texts['Score_ID'] = 'Score: ' + rhetorical_keywords_df_full_texts['Portfolio_Score'].astype(str) + ', ID:' + rhetorical_keywords_df_full_texts['ID'].astype(str)

#Check new df
rhetorical_keywords_df_full_texts.head()


In [None]:
#Count usage of each term in each essay
pathos_counts = rhetorical_keywords_df_full_texts['NoStops_Text'].str.count('pathos')
ethos_counts = rhetorical_keywords_df_full_texts['NoStops_Text'].str.count('ethos')
logos_counts = rhetorical_keywords_df_full_texts['NoStops_Text'].str.count('logos')

#Append each count to the dataframe
rhetorical_keywords_df_full_texts['Pathos_Counts'] = pathos_counts
rhetorical_keywords_df_full_texts["Ethos_Counts"] = ethos_counts
rhetorical_keywords_df_full_texts["Logos_Counts"] = logos_counts

#Get summ of all term usages
rhetorical_terms = ['Pathos_Counts', 'Ethos_Counts', 'Logos_Counts']
rhetorical_keywords_df_full_texts['Sum_Terms'] = rhetorical_keywords_df_full_texts[rhetorical_terms].sum(axis=1)

rhetorical_keywords_df_full_texts

In [None]:
#Chart number of times each term was used in each essay 
#Create bar graph
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Bar(name='Pathos Counts', x=rhetorical_keywords_df_full_texts["Score_ID"], y=rhetorical_keywords_df_full_texts["Pathos_Counts"]),
    go.Bar(name='Ethos Counts', x=rhetorical_keywords_df_full_texts["Score_ID"], y=rhetorical_keywords_df_full_texts["Ethos_Counts"]),
    go.Bar(name='Logos Counts', x=rhetorical_keywords_df_full_texts["Score_ID"], y=rhetorical_keywords_df_full_texts["Logos_Counts"]),
    go.Bar(name='All Term Counts', x=rhetorical_keywords_df_full_texts["Score_ID"], y=rhetorical_keywords_df_full_texts["Sum_Terms"]),

])

# Change the bar mode
fig.update_layout(title_text='Counts of Each Rhetorical Term in Each Essay')
fig.update_layout(barmode='stack')
fig.show()

In [None]:
#Check if amount of all term usage is indicative of grade
#Based on results, there is little relationship between amount of rhetorical terms used and grade...at least between A and B range essays)

#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_keywords_df_full_texts['Portfolio_Score'])
y = np.array(rhetorical_keywords_df_full_texts['Sum_Terms'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Sum Counts By Score")
plt.xlabel("Score")
plt.ylabel("Sum Counts")
plt.show()

print("R value for Total Rhetorical Terms is " + str(r))

In [None]:
#Check if amount of usages of pathos is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_keywords_df_full_texts['Portfolio_Score'])
y = np.array(rhetorical_keywords_df_full_texts['Pathos_Counts'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Pathos Counts By Score")
plt.xlabel("Score")
plt.ylabel("Pathos Counts")
plt.show()

print("R value for Pathos is " + str(r))


#Check if amount of usages of logos is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_keywords_df_full_texts['Portfolio_Score'])
y = np.array(rhetorical_keywords_df_full_texts['Logos_Counts'])


#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

print("R value for Logos is " + str(r))


#Check if amount of usages of ethos is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_keywords_df_full_texts['Portfolio_Score'])
y = np.array(rhetorical_keywords_df_full_texts['Ethos_Counts'])


#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

print("R value for Ethos is " + str(r))

In [None]:
#Plot # paragraphs in which terms were used vs. essay grade
##In other words, do more successful writers use terms in multiple paragrpahs (indicating more coherence)?

#Count number of paragraphs where terms used and append to new dataframe
new_Series = rhetorical_keywords_paragraphs_df_no_blanks['Score_ID'].value_counts(ascending=True)
df3 = pd.DataFrame(new_Series).reset_index()
df3

df3.rename(columns={"index": "Score_ID", "Score_ID": "Paragraph_Counts"}, errors="raise", inplace=True)
df3[['ID','Score']] = df3.Score_ID.str.split(", ",expand=True)

df3

#Plot paragraph counts per paper
fig = go.Figure(data=[
    go.Bar(name='Paragraph Counts', x=df3["Score_ID"], y=df3["Paragraph_Counts"]),

])
# Change the bar mode
fig.update_layout(title_text='Number of Paragraphs Where Rhetorical Terms Were Used')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
fig.show()

In [None]:
df3[['Score','ID']] = df3.Score_ID.str.split(", ",expand=True)
df3['Score'] = df3['Score'].map(lambda x: x.lstrip('Score: '))
df3 = df3[['Score','Paragraph_Counts']].copy()
df3 = df3.apply(pd.to_numeric)
df3

In [None]:
#Check if amount of paragraph term usage is indicative of grade
#Based on results (r = .08, there is little relationship between amount of rhetorical terms used and grade...at least between A and B range essays)
from scipy import stats

#Check if amount of usages of all terms per paragraph is indicative of grade
#Create arrays of independent (x) and dependent (y) variables


x = np.array(df3['Score'])
y = np.array(df3['Paragraph_Counts'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Paragraph Counts By Score")
plt.xlabel("Score")
plt.ylabel("Paragrah Counts")
plt.show()

print("R value for Terms per Paragraph is " + str(r))

We'll do the same thing with the synonyms to see if this makes a difference. 

In [None]:
#Get counts of synonym term usages in full texts
full_text_rhetorical_synonym_df = rhetorical_keywords_df_full_texts.copy()
full_text_rhetorical_synonym_df['Pathos_Synonyms'] = full_text_rhetorical_synonym_df['NoStops_Text'].str.count('experience|feel|stories|story|understand|compassion|passion|anecdote|sad|anger|sympathy|sympathetic|empathy|pity|fear*')
full_text_rhetorical_synonym_df['Logos_Synonyms'] = full_text_rhetorical_synonym_df['NoStops_Text'].str.count('logic|reason|reasoning|statistic|fact|data|common sense|evidence')
full_text_rhetorical_synonym_df['Ethos_Synonyms'] = full_text_rhetorical_synonym_df['NoStops_Text'].str.count('credible|credibility|authority|ethic|ethical|reliable')
full_text_rhetorical_synonym_df['Rhetorical_Vocab'] = full_text_rhetorical_synonym_df['NoStops_Text'].str.count('audience|reader|context|rhetorical|element|device|appeal|effective')

#Get summ of each type of term usages
pathos_terms = ['Pathos_Counts', 'Pathos_Synonyms']
full_text_rhetorical_synonym_df['Sum_Pathos_Terms'] = full_text_rhetorical_synonym_df[pathos_terms].sum(axis=1)

logos_terms = ['Logos_Counts', 'Logos_Synonyms']
full_text_rhetorical_synonym_df['Sum_Logos_Terms'] = full_text_rhetorical_synonym_df[logos_terms].sum(axis=1)

ethos_terms = ['Ethos_Counts', 'Ethos_Synonyms']
full_text_rhetorical_synonym_df['Sum_Ethos_Terms'] = full_text_rhetorical_synonym_df[ethos_terms].sum(axis=1)


#Get sum of all term usages
all_terms = ['Sum_Pathos_Terms', 'Sum_Ethos_Terms', 'Sum_Ethos_Terms', 'Rhetorical_Vocab']
full_text_rhetorical_synonym_df['Sum_All_Terms'] = full_text_rhetorical_synonym_df[all_terms].sum(axis=1)
full_text_rhetorical_synonym_df.head()

In [None]:
#Chart number of times each type of term was used in each essay 
#Create bar graph
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Bar(name='Sum_Pathos_Terms', x=rhetorical_synonym_df["Score_ID"], y=rhetorical_synonym_df["Sum_Pathos_Terms"]),
    go.Bar(name='Sum_Ethos_Terms', x=rhetorical_synonym_df["Score_ID"], y=rhetorical_synonym_df["Sum_Ethos_Terms"]),
    go.Bar(name='Sum_Logos_Terms', x=rhetorical_synonym_df["Score_ID"], y=rhetorical_synonym_df["Sum_Logos_Terms"]),
    go.Bar(name='Sum_Rhetorical_Vocab', x=rhetorical_synonym_df["Score_ID"], y=rhetorical_synonym_df["Rhetorical_Vocab"])

])


# Change the bar mode
fig.update_layout(title_text='Counts of Each Type of Rhetorical Term in Each Essay')
fig.update_layout(barmode='stack')
fig.show()



In [None]:
#Create new df for numerical values for regression calculations
rhetorical_regression_df = rhetorical_synonym_df[['Score','Sum_Pathos_Terms','Sum_Ethos_Terms',	'Sum_Logos_Terms','Sum_All_Terms', 'Rhetorical_Vocab']].copy()
rhetorical_regression_df = rhetorical_regression_df.apply(pd.to_numeric) 
rhetorical_regression_df

In [None]:
#Check if amount of usages of pathos is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_regression_df['Score'])
y = np.array(rhetorical_regression_df['Sum_Pathos_Terms'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Pathos Counts By Score")
plt.xlabel("Score")
plt.ylabel("Pathos Counts")
plt.show()

print("R value for Pathos is " + str(r))


#Check if amount of usages of logos is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_regression_df['Score'])
y = np.array(rhetorical_regression_df['Sum_Logos_Terms'])


#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

print("R value for Logos is " + str(r))


#Check if amount of usages of ethos is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_regression_df['Score'])
y = np.array(rhetorical_regression_df['Sum_Ethos_Terms'])


#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

print("R value for Ethos is " + str(r))


#Check if amount of rhetorical term usages is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_regression_df['Score'])
y = np.array(rhetorical_regression_df['Rhetorical_Vocab'])


#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

print("R value for Rhetorical Vocab is " + str(r))



In [None]:
#Check if amount of all term usages is indicative of grade
#Create arrays of independent (x) and dependent (y) variables
x = np.array(rhetorical_regression_df['Score'])
y = np.array(rhetorical_regression_df['Sum_All_Terms'])


#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

print("R value for All Terms is " + str(r))

### Citation Practice Regression Analysis


In [None]:
#Using FULL TEXTS Get any text inside parentheticals and count of parentheticals and append to dataframe
#https://stackoverflow.com/questions/24696715/regex-for-match-parentheses-in-python
parentheticals = r'(?<=\().*?(?=\))'

parenthetical_matches = []
parenthetical_counts = []

citation_df_full_texts = clean_essay_grades_df[['ID', 'Section', 'Portfolio_Score','Text']].copy()
for text in citation_df_full_texts['Text']:
  matches = re.findall(parentheticals, text)
  parenthetical_matches.append(matches)
  parenthetical_counts.append(len(matches))

citation_df_full_texts["Parentheticals"] = parenthetical_matches
citation_df_full_texts['Parenthetical_Counts'] = parenthetical_counts
citation_df_full_texts

In [None]:
#Add ID and score in one column
citation_df_full_texts['Score_ID'] = 'Score: ' + citation_df_full_texts['Portfolio_Score'].astype(str) + ', ID:' + citation_df_full_texts['ID'].astype(str)

In [None]:
#Chart number of times parentheticals were used in each essay 
#Create bar graph
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Bar(name='Parenthetical_Tags', x=citation_df_full_texts["Score_ID"], y=citation_df_full_texts["Parenthetical_Counts"])
])
# Change the bar mode
fig.update_layout(title_text='Counts of Parentheticals Used in Each Essay')
fig.update_layout(barmode='stack')
fig.show()

In [None]:
#Regression: Parentheticals vs. Grade

#Check if amount of all term usage is indicative of grade
#Based on results (r = .08, there is little relationship between amount of rhetorical terms used and grade...at least between A and B range essays)
from scipy import stats


#Create arrays of independent (x) and dependent (y) variables
x = np.array(citation_df_full_texts['Portfolio_Score'])
y = np.array(citation_df_full_texts['Parenthetical_Counts'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Parenthetical Counts By Score")
plt.xlabel("Score")
plt.ylabel("Parenthetical Counts")
plt.show()

print("R value for Parentheticals is " + str(r))


In [None]:
#Plot # paragraphs in which terms were used vs. essay grade
##In other words, do more successful writers use terms in multiple paragrpahs (indicating more coherence)?

#Count number of paragraphs where terms used and append to new dataframe
new_Series = citation_df['Score_ID'].value_counts(ascending=True)
df3 = pd.DataFrame(new_Series).reset_index()
df3

df3.rename(columns={"index": "Score_ID", "Score_ID": "Paragraph_Counts"}, errors="raise", inplace=True)
df3[['ID','Score']] = df3.Score_ID.str.split(", ",expand=True)

df3

#Plot paragraph counts per paper
fig = go.Figure(data=[
    go.Bar(name='Paragraph Counts', x=df3["Score_ID"], y=df3["Paragraph_Counts"]),

])
# Change the bar mode
fig.update_layout(title_text='Number of Paragraphs Where Citation Terms Were Used')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
fig.show()

In [None]:
df3[['Score','ID']] = df3.Score_ID.str.split(", ",expand=True)
df3['Score'] = df3['Score'].map(lambda x: x.lstrip('Score: '))
df3 = df3[['Score','Paragraph_Counts']].copy()
df3 = df3.apply(pd.to_numeric)
df3

In [None]:
#Check if amount of paragraph term usage is indicative of grade
#Based on results (r = .08, there is little relationship between amount of rhetorical terms used and grade...at least between A and B range essays)
from scipy import stats

#Check if amount of usages of all terms per paragraph is indicative of grade
#Create arrays of independent (x) and dependent (y) variables


x = np.array(df3['Score'])
y = np.array(df3['Paragraph_Counts'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Paragraph Counts By Score")
plt.xlabel("Score")
plt.ylabel("Paragrah Counts")
plt.show()

print("R value for Parentheticals per Paragraph is " + str(r))

The analysis below uses output from the DocuScope Corpus Analysis platform. This platform is freely available for download from Carnegie Melon University:https://www.cmu.edu/dietrich/english/research-and-publications/docuscope.html
 

DocuScope is a dictionary-based tool that "tags" words and phrases in texts based on its 50+ categories of rhetorical primers. The tool might tag the words “according to,” and “is proposing that” as evidence that the student is engaging in citation, for example. In aggregate, these counts can indicate to what degree each text in a corpus contains language indicating a particular rhetorical effect. Our interest in this case is the language DocuScope has tagged as indicating "Citation" is occuring; this language can be isolated from the CSV generated from the DocuScope tool, and an example can be found on this Github repository. 

In [None]:
##Let's do the same using DocuScope citation data
#Upload csv with LAT data
uploaded_LATS = files.upload()

In [None]:
lats_df = pd.read_csv('DIMENSION_C_deidentified_texts_citation_clusters_dimensions.csv')
lats_df

In [None]:
#Make ID document to merge docuscope lats with
ids = clean_essay_grades_df[['ID', 'Portfolio_Score']].copy()
ids

#Rename filename column to id and merge target and LAT tables based on ID
lats_df.rename(columns={"Filename": "ID"}, inplace=True)
lats_df['ID'] = lats_df['ID'].map(lambda x: x.rstrip('.txt'))
lats_df['ID'] = lats_df['ID'].astype('float')
merged_lat_df = pd.merge(ids, lats_df, on='ID')

#Add ID and score in one column
merged_lat_df['Score_ID'] = 'Score: ' + merged_lat_df['Portfolio_Score'].astype(str) + ', ID:' + merged_lat_df['ID'].astype(str)
merged_lat_df

In [None]:
#Chart number of times all terms were used in each essay 
#Create bar graph
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Bar(name='Citations_Tags', x=merged_lat_df["Score_ID"], y=merged_lat_df["Citation"])
])
# Change the bar mode
fig.update_layout(title_text='Counts of All Citation Cluster Terms Used in Each Essay')
fig.update_layout(barmode='stack')
fig.show()

In [None]:
#Regression: Citation vs. Grade

#Check if amount of all term usage is indicative of grade
#Based on results (r = .08, there is little relationship between amount of rhetorical terms used and grade...at least between A and B range essays)
from scipy import stats


#Create arrays of independent (x) and dependent (y) variables
x = np.array(merged_lat_df['Portfolio_Score'])
y = np.array(merged_lat_df['Citation'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

#Create function to return new equation
def myfunc(x):
  return slope * x + intercept

mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Citation Counts By Score")
plt.xlabel("Score")
plt.ylabel("Citation Counts")
plt.show()

print("R value for Total Citation Terms is " + str(r))


In [None]:
#Chart number of times each citation dimension was used in each essay 
#Create bar graph
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Bar(name='CitationAuthority', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationAuthority"]),
    go.Bar(name='CitationControversy', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationControversy"]),
    go.Bar(name='CitationGeneric', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationGeneric"]),
    go.Bar(name='CitationHedged', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationHedged"]),
    go.Bar(name='CitationNegative', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationNegative"]),
    go.Bar(name='CitationNeutral', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationNeutral"]),
    go.Bar(name='CitationSpeakerLookMood', x=merged_lat_df["Score_ID"], y=merged_lat_df["CitationSpeakerLookMood"]),
    go.Bar(name='UncertainCitation', x=merged_lat_df["Score_ID"], y=merged_lat_df["UncertainCitation"]),

])


# Change the bar mode
fig.update_layout(title_text='Counts of Each Rhetorical Term in Each Essay')
fig.update_layout(barmode='stack')
fig.show()