<a href="https://colab.research.google.com/github/mkane968/Text-Mining-with-Student-Papers/blob/main/notebooks/Associate_Student_Essays_%26_Metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Upload Files and Add to Dataframe

In [None]:
#Mount Google Drive at /content/drive
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
#Select all essay files to upload
from google.colab import files

#uploaded is turned into the essays dataframe (one row per file) by a later cell
uploaded = files.upload()

In [None]:
#Build a dataframe from the uploaded files: one row per dictionary key
#(the key becomes the row index)
import pandas as pd

essays = pd.DataFrame.from_dict(data=uploaded, orient='index')

#Preview the first rows
essays.head()

In [None]:
#Move the index into a regular column and give both columns
#descriptive names to make wrangling easier
essay_column_names = ["ID", "Text"]
essays = essays.reset_index()
essays.columns = essay_column_names
essays

# Clean Texts

In [None]:
#Decode the Text column from bytes to str.
#'utf-8-sig' is used instead of 'utf-8' so that a leading byte order mark
#(b'\xef\xbb\xbf') is removed; plain 'utf-8' would keep it in the text as '\ufeff'.
#Texts without a byte order mark are decoded exactly as with 'utf-8'.
essays['Text'] = essays['Text'].apply(lambda x: x.decode('utf-8-sig'))
essays.head()

In [None]:
#Remove identifying information from ID
#Strip every occurrence of "LATE_" first (otherwise it would shift the underscore-separated fields)
ids_without_late = essays['ID'].str.replace(r'LATE_', '', regex=True)

#Split the ID on underscores and keep only the text between the
#first and second underscore (the ID number), stored as an integer
id_parts = ids_without_late.str.split("_", expand = True)
essays['ID'] = id_parts[1].astype(int)
essays

In [None]:
#Remove newline characters
#Collapse every run of whitespace (including real newlines and carriage returns)
#and/or literal "\r" / "\n" escape sequences into one single space.
#Doing this in one pass avoids the runs of spaces that two separate passes left
#behind when a literal escape sequence sat next to whitespace.
essays['Text'] = essays['Text'].str.replace(r'(?:\s|\\r|\\n)+', ' ', regex=True)
essays

In [None]:
#Remove headers containing student name, instructor name, course name and date
#Split each text once, at the first "22" (expected to be the end of a 2022 date,
#likely the last value in the header), and store everything before it in a new column.
#n is passed by keyword because pandas 2.0+ no longer accepts it positionally.
headers = essays["Text"].str.split("22", n=1, expand = True)
essays['Header'] = headers[0]
print(essays['Header'])

#Add the "22" consumed by the split back to the header column
essays['Header'] = essays['Header'] + '22'
essays['Header'][0]

In [None]:
#Remove every occurrence of a row's header from that row's text
#(the header should be at the top of each essay in the portfolio)
def _text_without_header(row):
    #The header is converted to str before it is removed from the text
    return row['Text'].replace(str(row['Header']), '')

essays['Text_NoHeaders'] = essays.apply(_text_without_header, axis=1)
essays['Text_NoHeaders'] 

In [None]:
#Drop the original text and header columns now that Text_NoHeaders exists
obsolete_columns = ['Text', 'Header']
essays = essays.drop(obsolete_columns, axis=1)
essays

This isn't perfect: headers are not standardized across all papers, students sometimes end their headers with the professor's name or other information, and some student names are still visible when they are referenced in the papers themselves.

# Add Grades and Additional Metadata

In [None]:
#Select csv file to upload
from google.colab import files

#NOTE: this rebinds `uploaded`, replacing the essay upload result from the earlier cell.
#The next cell does not read this variable; it looks for csv files under /content
#(presumably where files.upload() stores them -- confirm).
uploaded = files.upload()

In [None]:
#Create dataframe for all metadata and inspect
import glob 

#Directory that is searched for csv files
local_path = r'/content'

#Collect all csvs in the directory; sorted so the row order of the
#combined dataframe does not depend on the order the file system returns
filenames = sorted(glob.glob(local_path + "/*.csv"))

#Fail with a clear message instead of pandas' generic concat error
if not filenames:
    raise FileNotFoundError("No csv files found in " + local_path)

#Read every csv into its own dataframe
dfs = [pd.read_csv(filename) for filename in filenames]

#Concatenate all data into one DataFrame with a fresh 0..n-1 index
metadata = pd.concat(dfs, ignore_index=True)

#NOTE: an earlier version called metadata.astype(str) here without assigning the
#result, so it never had any effect. The call is left out on purpose: later cells
#add the grade columns together and cast ID to int, which relies on the dtypes
#that read_csv produced.

metadata

In [None]:
#Drop header rows (Points Possible) and test student rows (Student, Test)
#Flag the rows whose Student value matches either pattern ...
is_non_student_row = metadata['Student'].str.contains('Points Possible|Student, Test')

#... and keep only the rows where that flag compares equal to False
metadata = metadata[is_non_student_row == False]
metadata

In [None]:
#Print every column name so the ones to keep can be picked
for column_name in metadata.columns:
    print(column_name)

#Choose which columns to keep (ID, section and the final portfolio columns, identified by the numbers in parentheses)
columns_to_keep = ['ID', 'Section', "Final Portfolio (1689777)", "Final Portfolio (1676963)"]
metadata = metadata[columns_to_keep]
metadata

In [None]:
#Swap every NaN value in the metadata for 0
import numpy as np

metadata = metadata.replace(to_replace=np.nan, value=0)

In [None]:
#Combine the per-class final portfolio columns into one "Portfolio Score" column
#by adding them together
#Not sure how this will work with more than two dataframes
first_portfolio = metadata['Final Portfolio (1689777)']
second_portfolio = metadata['Final Portfolio (1676963)']
metadata['Portfolio Score'] = first_portfolio + second_portfolio
metadata

In [None]:
#Keep only ID, Section and the combined score; this leaves out the
#grade columns for the individual classes
final_columns = ['ID', 'Section', "Portfolio Score"]
clean_metadata = metadata[final_columns]
clean_metadata

In [None]:
#Drop decimal from ID (inconsistent with ID in essay dataframe)
#clean_metadata was created by selecting columns from another dataframe, so assigning
#a column on it in place can raise pandas' SettingWithCopyWarning. DataFrame.astype
#returns a new dataframe instead, with only the ID column converted to int.
clean_metadata = clean_metadata.astype({'ID': int})

#Check cleaned DF one more time
clean_metadata

# Merge Essays and Metadata

In [None]:
#Join the metadata (left) with the cleaned essays (right) on the shared ID column
#Inner join: only IDs that have both metadata and an essay are kept
new_df = clean_metadata.merge(right=essays, on='ID', how='inner')
new_df

In [None]:
#Save new df to tsv and download it
#sep='\t' makes the file actually tab-separated, matching its .tsv extension
#(to_csv writes comma-separated values by default).
#The row index is still written as the first column, as before.
new_df.to_csv('test_submissions.tsv', sep='\t') 
files.download('test_submissions.tsv')