## Packages

In [55]:
# Standard Packages
import numpy as np
import pandas as pd

import string # For punctuation
import re # Regular expressions

from nltk.corpus import stopwords
stop_words = stopwords.words("english")

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from nltk.stem import WordNetLemmatizer # Lemmatization
from nltk.stem import PorterStemmer # Stemming


## Reading File

In [56]:
# Load the prompt 7 essays; index_col=0 uses the first CSV column as the row index.
user_prompt7 = pd.read_csv("prompt7_essays.csv", index_col=0)
# Plain-text preview of the first five rows.
print(user_prompt7.head())


                                           gen_essay
0  \n\n\nI was having a really hard time with mat...
1  \n\nMy mom is always so patient with me. We go...
2  \n\n\nPatience is one of the most important th...
3  \n\nA woman was at the doctor's office for her...
4  \n\nPatience is a virtue that is often times d...


In [91]:
# (rows, columns) of the essay frame.
# NOTE(review): the recorded output reports 9 columns, which only exist after the
# cleaning cells further down have run, so this cell was last executed out of order.
user_prompt7.shape

(170, 9)

Note: We have 1730 prompt 7 essays, with an average length of 250 words.

## Data Cleaning

In [57]:
def remove_punctuation(text):
    """Return ``text`` without ASCII punctuation and without surrounding whitespace.

    Every character listed in ``string.punctuation`` is dropped, then leading
    and trailing whitespace (newlines included) is stripped.  Whitespace in
    the middle of the text is left untouched.
    """
    kept_chars = (char for char in text if char not in string.punctuation)
    return "".join(kept_chars).strip()

# Punctuation-free copy of every raw essay, stored in its own column.
raw_essays = user_prompt7['gen_essay']
user_prompt7['gen_essay_punct'] = raw_essays.apply(remove_punctuation)
user_prompt7.head()

Unnamed: 0,gen_essay,gen_essay_punct
0,\n\n\nI was having a really hard time with mat...,I was having a really hard time with math but ...
1,\n\nMy mom is always so patient with me. We go...,My mom is always so patient with me We go thro...
2,\n\n\nPatience is one of the most important th...,Patience is one of the most important things y...
3,\n\nA woman was at the doctor's office for her...,A woman was at the doctors office for her annu...
4,\n\nPatience is a virtue that is often times d...,Patience is a virtue that is often times diffi...


In [58]:
def tokenize(text):
    """Split a string into word tokens.

    The text is split on runs of one or more non-word characters (``\\W+``).

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The word tokens in order of appearance.  Empty strings, which
        ``re.split`` yields when the text is empty or starts/ends with a
        non-word character, are dropped.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    pieces = re.split(r"\W+", text)
    return [piece for piece in pieces if piece]
# Lowercase each punctuation-free essay, then split it into tokens.
user_prompt7['gen_essay_tokenize'] = user_prompt7['gen_essay_punct'].apply(
    lambda essay: tokenize(essay.lower())
)
user_prompt7.iloc[:, 1:].head()

Unnamed: 0,gen_essay_punct,gen_essay_tokenize
0,I was having a really hard time with math but ...,"[i, was, having, a, really, hard, time, with, ..."
1,My mom is always so patient with me We go thro...,"[my, mom, is, always, so, patient, with, me, w..."
2,Patience is one of the most important things y...,"[patience, is, one, of, the, most, important, ..."
3,A woman was at the doctors office for her annu...,"[a, woman, was, at, the, doctors, office, for,..."
4,Patience is a virtue that is often times diffi...,"[patience, is, a, virtue, that, is, often, tim..."


In [59]:
# Peek at the first 10 entries of the English stop-word list
# (list order, not a frequency ranking).
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [60]:
def remove_stopwords(text):
    """Drop stop words from one essay's tokens.

    Parameters
    ----------
    text : list of str
        Tokens of a single essay.

    Returns
    -------
    list of str
        The tokens that do not appear in the module-level ``stop_words``
        collection, in their original order.
    """
    # Testing membership against a list rescans it for every token; a set
    # built once per essay makes each lookup constant-time.
    stop_word_set = set(stop_words)
    return [word for word in text if word not in stop_word_set]
# Filter the stop words out of every token list.
token_lists = user_prompt7['gen_essay_tokenize']
user_prompt7['gen_essay_wo_stopwords'] = token_lists.apply(remove_stopwords)
user_prompt7.iloc[:, 2:].head()

Unnamed: 0,gen_essay_tokenize,gen_essay_wo_stopwords
0,"[i, was, having, a, really, hard, time, with, ...","[really, hard, time, math, teacher, told, us, ..."
1,"[my, mom, is, always, so, patient, with, me, w...","[mom, always, patient, go, many, trials, tribu..."
2,"[patience, is, one, of, the, most, important, ...","[patience, one, important, things, life, wheth..."
3,"[a, woman, was, at, the, doctors, office, for,...","[woman, doctors, office, annual, checkup, doct..."
4,"[patience, is, a, virtue, that, is, often, tim...","[patience, virtue, often, times, difficult, fi..."


In [61]:
# Single WordNetLemmatizer instance shared by every lemmatization() call.
lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    """Lemmatize every token of one essay.

    Each token is passed on its own to the module-level ``lemmatizer``; only
    the word is supplied, with no part-of-speech tag or sentence context.

    Returns a new list with one lemma per input token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, text))
# Lemmatized version of the stop-word-free tokens.
filtered_tokens = user_prompt7['gen_essay_wo_stopwords']
user_prompt7['gen_essay_lemmatized'] = filtered_tokens.apply(lemmatization)
user_prompt7.iloc[:, 3:].head()

Unnamed: 0,gen_essay_wo_stopwords,gen_essay_lemmatized
0,"[really, hard, time, math, teacher, told, us, ...","[really, hard, time, math, teacher, told, u, w..."
1,"[mom, always, patient, go, many, trials, tribu...","[mom, always, patient, go, many, trial, tribul..."
2,"[patience, one, important, things, life, wheth...","[patience, one, important, thing, life, whethe..."
3,"[woman, doctors, office, annual, checkup, doct...","[woman, doctor, office, annual, checkup, docto..."
4,"[patience, virtue, often, times, difficult, fi...","[patience, virtue, often, time, difficult, fin..."


In [62]:
# Single PorterStemmer instance shared by every stemming() call.
ps = PorterStemmer()
def stemming(text):
    """Stem every token of one essay with the module-level stemmer ``ps``.

    Stemming trims word endings, so the result may not be a dictionary word.
    Returns a new list with one stem per input token, in the same order.
    """
    stems = []
    for token in text:
        stems.append(ps.stem(token))
    return stems
# Stemmed version of the stop-word-free tokens (built from the same input as the lemmas).
filtered_tokens = user_prompt7['gen_essay_wo_stopwords']
user_prompt7['gen_essay_stemmed'] = filtered_tokens.apply(stemming)
user_prompt7.iloc[:, 4:].head()

Unnamed: 0,gen_essay_lemmatized,gen_essay_stemmed
0,"[really, hard, time, math, teacher, told, u, w...","[realli, hard, time, math, teacher, told, us, ..."
1,"[mom, always, patient, go, many, trial, tribul...","[mom, alway, patient, go, mani, trial, tribul,..."
2,"[patience, one, important, thing, life, whethe...","[patienc, one, import, thing, life, whether, p..."
3,"[woman, doctor, office, annual, checkup, docto...","[woman, doctor, offic, annual, checkup, doctor..."
4,"[patience, virtue, often, time, difficult, fin...","[patienc, virtu, often, time, difficult, find,..."


In [63]:
# Preview the raw essay next to every derived column.
user_prompt7.head()

Unnamed: 0,gen_essay,gen_essay_punct,gen_essay_tokenize,gen_essay_wo_stopwords,gen_essay_lemmatized,gen_essay_stemmed
0,\n\n\nI was having a really hard time with mat...,I was having a really hard time with math but ...,"[i, was, having, a, really, hard, time, with, ...","[really, hard, time, math, teacher, told, us, ...","[really, hard, time, math, teacher, told, u, w...","[realli, hard, time, math, teacher, told, us, ..."
1,\n\nMy mom is always so patient with me. We go...,My mom is always so patient with me We go thro...,"[my, mom, is, always, so, patient, with, me, w...","[mom, always, patient, go, many, trials, tribu...","[mom, always, patient, go, many, trial, tribul...","[mom, alway, patient, go, mani, trial, tribul,..."
2,\n\n\nPatience is one of the most important th...,Patience is one of the most important things y...,"[patience, is, one, of, the, most, important, ...","[patience, one, important, things, life, wheth...","[patience, one, important, thing, life, whethe...","[patienc, one, import, thing, life, whether, p..."
3,\n\nA woman was at the doctor's office for her...,A woman was at the doctors office for her annu...,"[a, woman, was, at, the, doctors, office, for,...","[woman, doctors, office, annual, checkup, doct...","[woman, doctor, office, annual, checkup, docto...","[woman, doctor, offic, annual, checkup, doctor..."
4,\n\nPatience is a virtue that is often times d...,Patience is a virtue that is often times diffi...,"[patience, is, a, virtue, that, is, often, tim...","[patience, virtue, often, times, difficult, fi...","[patience, virtue, often, time, difficult, fin...","[patienc, virtu, often, time, difficult, find,..."


In [64]:
# Turn every token list back into a single space-separated string.
# Rows yielded by iterrows() may be copies, so writing to them is not guaranteed
# to reach the DataFrame; assign whole columns instead.  Values that are not
# lists (the raw and punctuation-free essay strings) are left untouched, since
# " ".join on a string would insert a space between every character.  This
# also makes the cell safe to re-run.
for col in list(user_prompt7.columns):
    user_prompt7[col] = user_prompt7[col].map(
        lambda value: " ".join(value) if isinstance(value, list) else value
    )

In [82]:
# Every essay here belongs to essay set 7; a scalar is broadcast to all rows.
user_prompt7["Essay_set_id"] = 7
# Sequential ids 0..n-1 in row order.
user_prompt7["Essay_id"] = range(len(user_prompt7))
# The stemmed text is what gets exported as the essay.
user_prompt7["Essay"] = user_prompt7["gen_essay_stemmed"]

In [96]:
# Load the scores workbook and keep only the rows for essay set 7.
scores = pd.read_excel("../asap_aes_data.xlsx")
scores = scores[scores["essay_set"] == 7]
# Keep only rows in which all three domain-1 score columns are present.
# Indexing a DataFrame with a boolean DataFrame masks individual cells to NaN
# rather than dropping rows, so the filter is expressed with dropna instead.
scores = scores.dropna(subset=["rater1_domain1", "rater2_domain1", "domain1_score"])
scores.shape

(1569, 28)

In [94]:
# Preview the first rows of the essay-set-7 score table.
scores.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
10686,17834,7,Patience is when your waiting .I was patience ...,8.0,7.0,,15.0,,,,...,2.0,2.0,,,,,,,,
10687,17836,7,"I am not a patience person, like I can’t sit i...",6.0,7.0,,13.0,,,,...,2.0,1.0,,,,,,,,
10688,17837,7,One day I was at basketball practice and I was...,7.0,8.0,,15.0,,,,...,2.0,2.0,,,,,,,,
10689,17838,7,I going to write about a time when I went to t...,8.0,9.0,,17.0,,,,...,2.0,3.0,,,,,,,,
10690,17839,7,It can be very hard for somebody to be patient...,7.0,6.0,,13.0,,,,...,1.0,2.0,,,,,,,,


In [83]:
# user_prompt7.to_csv("chatgpt_essay_set_7.csv", columns=["Essay_id", "Essay_set_id", "Essay"])

Unnamed: 0,Essay_id,Essay_set_id,Essay
0,0,7,realli hard time math teacher told us would ta...
1,1,7,mom alway patient go mani trial tribul famili ...
2,2,7,patienc one import thing life whether person l...
3,3,7,woman doctor offic annual checkup doctor ask t...
4,4,7,patienc virtu often time difficult find one st...


## Cleaning Chat-GPT

In [73]:
# Load the ChatGPT-generated prompt 7 essays; index_col=0 uses the first CSV column as the row index.
# NOTE(review): the "../Prompt 7/.." segment steps into a folder and straight back out;
# presumably "../chatgpt_prompt7_essays.csv" is meant — confirm before simplifying.
chatgpt_prompt7 = pd.read_csv("../Prompt 7/../chatgpt_prompt7_essays.csv", index_col=0)
chatgpt_prompt7.head()

Unnamed: 0,gen_essay
0,\n\n\nI was having a really hard time with mat...
1,\n\nMy mom is always so patient with me. We go...
2,\n\n\nPatience is one of the most important th...
3,\n\nA woman was at the doctor's office for her...
4,\n\nPatience is a virtue that is often times d...


In [74]:
# Same cleaning pipeline as above, applied to the ChatGPT essays:
# strip punctuation -> lowercase + tokenize -> drop stop words -> lemmatize / stem.
chatgpt_prompt7['gen_essay_punct'] = chatgpt_prompt7['gen_essay'].apply(remove_punctuation)
chatgpt_prompt7['gen_essay_tokenize'] = chatgpt_prompt7['gen_essay_punct'].apply(
    lambda essay: tokenize(essay.lower())
)
chatgpt_prompt7['gen_essay_wo_stopwords'] = chatgpt_prompt7['gen_essay_tokenize'].apply(remove_stopwords)
# Lemmas and stems are both derived from the stop-word-free tokens.
chatgpt_prompt7['gen_essay_lemmatized'] = chatgpt_prompt7['gen_essay_wo_stopwords'].apply(lemmatization)
chatgpt_prompt7['gen_essay_stemmed'] = chatgpt_prompt7['gen_essay_wo_stopwords'].apply(stemming)

In [76]:
# Turn every token list back into a single space-separated string.
# Rows yielded by iterrows() may be copies, so writing to them is not guaranteed
# to reach the DataFrame; assign whole columns instead.  Values that are not
# lists (the raw and punctuation-free essay strings) are left untouched, since
# " ".join on a string would insert a space between every character.  This
# also makes the cell safe to re-run.
for col in list(chatgpt_prompt7.columns):
    chatgpt_prompt7[col] = chatgpt_prompt7[col].map(
        lambda value: " ".join(value) if isinstance(value, list) else value
    )

In [77]:
# Every essay here belongs to essay set 7; a scalar is broadcast to all rows.
chatgpt_prompt7["Essay_set_id"] = 7
# Sequential ids 0..n-1 in row order.
chatgpt_prompt7["Essay_id"] = range(len(chatgpt_prompt7))
# The stemmed text is what gets exported as the essay.
chatgpt_prompt7["Essay"] = chatgpt_prompt7["gen_essay_stemmed"]

In [81]:
# Export only the id, set-id and essay-text columns. No index argument is given,
# so the row index is written as well (pandas' default) and will come back as an
# unnamed leading column unless the file is read with index_col=0.
chatgpt_prompt7.to_csv("chatgpt_essay_set_7.csv", columns=["Essay_id", "Essay_set_id", "Essay"])