<a href="https://colab.research.google.com/github/mkane968/Text-Mining-with-Student-Papers/blob/main/Text_Mining_Student_Essays_A_Computational_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Mining Student Essays: A Computational Exploration

This pipeline ingests and cleans a corpus of student papers, then analyzes meaningful language patterns in it. The following input is required:

*   Corpus of student papers (.txt files)
*   Grades and other relevant metadata associated with the papers (.csv files)


## 1. Install Packages

In [None]:
#Mount Google Drive
from google.colab import drive
from google.colab import files

#Install glob
import glob 

#Install pandas
import pandas as pd

#Install numpy
import numpy as np

#Imports the Natural Language Toolkit, which is necessary to install NLTK packages and libraries
#!pip install nltk
import nltk

#Installs libraries and packages to tokenize text
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from  nltk.text import ConcordanceIndex

#Installs libraries and packages to clean text
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#Import matplotlib for visualizations
import matplotlib.pyplot as plt


#Imports spaCy itself, necessary to use features 
#!pip install spaCy
import spacy
#Load the natural language processing pipeline
nlp = spacy.load("en_core_web_sm")
#Load spaCy visualizer
from spacy import displacy


import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim

## 2. Import Student Essays and Metadata

### Import Student Essays and Add to DataFrame

In [None]:
#Mount Google Drive
drive.mount('/content/drive')

In [None]:
#Add files to upload from local machine
uploaded = files.upload()

In [None]:
#Put essays into dataframe (one row per entry of the uploaded dict)
essays = pd.DataFrame.from_dict(uploaded, orient='index')

#Reset index and add column names to make wrangling easier
essays = essays.reset_index()
essays.columns = ["ID", "Text"]

#Decode the raw bytes to text
#'utf-8-sig' also strips a leading byte-order mark (b'\xef\xbb\xbf'); plain 'utf-8'
#would keep it in the text as the character '\ufeff'
essays['Text'] = essays['Text'].apply(lambda raw: raw.decode('utf-8-sig'))

#Keep the text with its newlines in a separate column (used later for paragraph segmentation)
essays['Text_Newlines'] = essays['Text']

#Collapse every run of whitespace (and any literal "\r" / "\n" escape sequences) into one space
essays['Text'] = essays['Text'].str.replace(r'(?:\s|\\r|\\n)+', ' ', regex=True)
essays.head()

### Remove identifying information from each paper ID (instructor/student names) 

In [None]:
#Strip identifying information (instructor/student names) from each paper ID
#Drop every "LATE_" marker first so it does not shift the underscore-delimited fields
essays['ID'] = essays['ID'].str.replace('LATE_', '', regex=False)

#Split the ID on underscores and keep only the second field (the ID number), as an integer
start = essays['ID'].str.split("_", expand=True)
essays['ID'] = start[1].astype(int)
essays

In [None]:
len(essays)

### Import grades and additional metadata to second dataframe


In [None]:
#Upload csvs with essay metadata
uploaded_grades = files.upload()

In [None]:
#Folder in which to look for the metadata csv files
local_path = r'/content'

#Paths of all csv files directly inside that folder
#NOTE(review): this matches every csv in the folder, not only the uploaded grade files -- confirm no other csvs are present when this runs
filenames = glob.glob(local_path + "/*.csv")

#One dataframe per csv, in the same order as filenames
dfs = list(map(pd.read_csv, filenames))

len(filenames)

In [None]:
print(filenames)

In [None]:
# Concatenate all data into one DataFrame (rows renumbered from 0)
metadata = pd.concat(dfs, ignore_index=True)

#Columns keep the dtypes pandas parsed from the csvs: the score columns are summed
#further down, so they are deliberately not converted to strings here

metadata

In [None]:
#Drop header rows (Points Possible) and test student rows (Student, Test)
#Rows with a missing Student value count as matches (na=True) and are dropped as well
is_placeholder = metadata['Student'].str.contains('Points Possible|Student, Test', na=True)
metadata = metadata[~is_placeholder]
metadata

In [None]:
#Keep only relevant metadata: ID, Section, and every column whose name starts with "Final Portfolio ("
portfolio_columns = metadata.columns[metadata.columns.str.startswith('Final Portfolio (')]
clean_metadata = metadata[['ID', 'Section', *portfolio_columns]]
clean_metadata
#Want other metadata? Check the columns
#Get all column names 
#for col in metadata.columns:
   # print(col)

In [None]:
#Replace all NaN values with 0 
clean_metadata = clean_metadata.replace(np.nan, 0)
clean_metadata

In [None]:
#Create new final portfolio column with all values
#Add values of each "Final Portfolio (...)" column together; values except correct grade will be zero
#The columns are selected by name rather than by position, so re-running this cell
#does not add an already computed Portfolio_Score into the sum
score_counts = [col for col in clean_metadata.columns if str(col).startswith('Final Portfolio (')]
clean_metadata['Portfolio_Score'] = clean_metadata[score_counts].sum(axis=1)

In [None]:
clean_metadata['Portfolio_Score']

In [None]:
#Drop grade columns for individual classes
clean_metadata = clean_metadata[['ID', 'Section', "Portfolio_Score"]]
clean_metadata

In [None]:
#Drop decimal from ID (inconsistent with ID in essay dataframe)
#assign() builds a new dataframe instead of writing into one that was derived by
#column selection, which avoids pandas' SettingWithCopyWarning
clean_metadata = clean_metadata.assign(ID=clean_metadata['ID'].astype(int))

#Check cleaned DF one more time
clean_metadata

### Merge essays and grade metadata into one dataframe

In [None]:
#Combine the cleaned metadata with the essays, matching rows on ID
#Inner join: a row is kept only when the ID appears in both dataframes
essays_grades_master = pd.merge(clean_metadata, essays, how='inner', on='ID')

#Print dataframe
essays_grades_master

In [None]:
#Sort dataframe by grades
essays_grades_master.sort_values(by=['Portfolio_Score'], inplace = True)
essays_grades_master

In [None]:
#Save new df to csv and download
essays_grades_master.to_csv('essays_grades_master.csv') 
files.download('essays_grades_master.csv')

## 3. Clean Data

### Basic Cleaning with NLTK
#### Lowercasing, Punctuation Removal, and Stopword Removal

In [None]:
#Refer to the merged dataframe under a new name
#NOTE: this is an alias, not a copy -- the columns added below also appear on
#essays_grades_master, which the Word2Vec section reads 'NoStops_Text' from
clean_essay_grades_df = essays_grades_master
#Only has an effect when a 'Text_NoHeaders' column is present
clean_essay_grades_df.rename(columns = {"Text_NoHeaders": "Text"}, inplace = True)

#Lowercase all words
clean_essay_grades_df['Lower_Text'] = clean_essay_grades_df['Text'].str.lower()

#Remove punctuation and replace with no space (word characters, hyphens, periods, apostrophes and whitespace are kept)
clean_essay_grades_df['NoPunct_Text'] = clean_essay_grades_df['Lower_Text'].str.replace(r'[^\w\-\.\'\s]+', '', regex = True)

#Remove periods and replace with space (to prevent incorrect compounds)
clean_essay_grades_df['NoPunct_Text'] = clean_essay_grades_df['NoPunct_Text'].str.replace(r'[^\w\-\'\s]+', ' ', regex = True)

#Remove stopwords from the lowercased, punctuation-free text
#NLTK's English stopword list is lowercase, so filtering the raw 'Text' column would miss
#capitalized stopwords ("The") and stopwords with punctuation attached ("and,")
stop_words = set(stopwords.words("english"))
clean_essay_grades_df['NoStops_Text'] = clean_essay_grades_df['NoPunct_Text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

#Check output
clean_essay_grades_df.head()

## 4. Extract Keywords and Context: Rhetorical Analysis

**Key Terms:** Pathos, Ethos, Logos

**Related Outcome:** *To learn to employ rhetorical terms and strategies and strengthen your ability to analyze rhetorical techniques in published essays and visual texts.*


In [None]:
#Take the metadata plus the text that still has its newlines (needed to segment paragraphs)
keep_columns = ['ID', 'Section', 'Portfolio_Score', 'Text_Newlines']
rhetorical_keywords_df = clean_essay_grades_df.loc[:, keep_columns].copy()

#Combine score and ID into one label of the form "Score: <score>, ID:<id>"
score_part = 'Score: ' + rhetorical_keywords_df['Portfolio_Score'].astype(str)
id_part = ', ID:' + rhetorical_keywords_df['ID'].astype(str)
rhetorical_keywords_df['Score_ID'] = score_part + id_part

#Check new df
rhetorical_keywords_df.head()


#### Paragraph Segmentation

In [None]:
#We only need one newlines version here
rhetorical_keywords_df = rhetorical_keywords_df[['Score_ID', 'Text_Newlines']].copy()

#Check new df
rhetorical_keywords_df.head()


In [None]:
#Count number of paragraphs in each text
paragraph_counts = rhetorical_keywords_df['Text_Newlines'].str.count(r'\n')
paragraph_counts

#Append paragraphs counts to dataframe
rhetorical_keywords_df["Paragraph_Counts"] = paragraph_counts
rhetorical_keywords_df

In [None]:
#Split each essay on newlines: one row per essay (indexed by Score_ID), one column per paragraph position
split_paragraphs = rhetorical_keywords_df["Text_Newlines"].str.split(r'\n', expand = True)
new = split_paragraphs.set_index(rhetorical_keywords_df['Score_ID'])

#Flatten to long format so each paragraph is on its own row, identified by essay (Score_ID) and paragraph position
paragraphs_df = new.stack().reset_index()
paragraphs_df.columns = ["Score_ID", "Paragraph", "Text"]
paragraphs_df

In [None]:
##Clean paragraphs
##Filter out paragraphs with fewer than 10 words (headers and other short fragments)
word_counts = paragraphs_df['Text'].str.split().str.len()
paragraphs_df = paragraphs_df[~word_counts.lt(10)]

## Filter out paragraphs that contain any of these markers (Works Cited citations)
## The markers are matched literally (regex=False): as regular expressions the "." in
## "www.", ".com/" and "Vol." would match any character (e.g. "Vol." would match "Volt")
citation_markers = ["http://", "https://", "://www", "www.", ".com/", "Vol.", "doi:"]
for marker in citation_markers:
    paragraphs_df = paragraphs_df[~paragraphs_df['Text'].str.contains(marker, regex=False)]

In [None]:
paragraphs_df

In [None]:
#Save new df to csv and download to clean further
paragraphs_df.to_csv('paragraphs.csv') 
files.download('paragraphs.csv')

In [None]:
##Start the keyword frequency dataframe from a copy of the cleaned paragraphs
rhetorical_keywords_df = paragraphs_df.copy()

#Count occurrences of each rhetorical term in every paragraph and append one column per term
#NOTE(review): matching is case-sensitive and substring-based -- confirm that is intended
rhetorical_terms = []
for term in ('pathos', 'ethos', 'logos'):
    count_column = term.capitalize() + '_Counts'
    rhetorical_keywords_df[count_column] = rhetorical_keywords_df['Text'].str.count(term)
    rhetorical_terms.append(count_column)

#Get sum of all term usages
rhetorical_keywords_df['Sum_Terms'] = rhetorical_keywords_df[rhetorical_terms].sum(axis=1)

rhetorical_keywords_df

In [None]:
#Remove all rows with no rhetorical terms
rhetorical_keywords_df_no_blanks = rhetorical_keywords_df[rhetorical_keywords_df.Sum_Terms > 0]
rhetorical_keywords_df_no_blanks

In [None]:
#Save new df to csv and download
rhetorical_keywords_df.to_csv('rhetorical_keywords_df.csv') 
files.download('rhetorical_keywords_df.csv')

## 5. Analyzing Term Count Frequencies


In [None]:
#Bar chart of the total number of rhetorical terms, labelled by essay (Score_ID)
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

all_terms_bar = go.Bar(
    name='All Terms',
    x=rhetorical_keywords_df["Score_ID"],
    y=rhetorical_keywords_df["Sum_Terms"],
)
fig = go.Figure(data=[all_terms_bar])

#Set the title and stack bars that share an x value
fig.update_layout(title_text='Counts of All Rhetorical Terms in Each Essay', barmode='stack')
fig.show()

In [None]:
#Get scores on their own to calculate regression
#Score_ID has the form "Score: <score>, ID:<id>", so splitting on ", " separates the two labelled parts
rhetorical_keywords_df[['Score','ID']] = rhetorical_keywords_df['Score_ID'].str.split(", ", expand=True)

#Remove the literal "Score: " prefix
#(str.lstrip would strip any leading characters from the set {S, c, o, r, e, :, space}, not the prefix itself)
rhetorical_keywords_df['Score'] = rhetorical_keywords_df['Score'].str.replace(r'^Score: ', '', regex=True)

#Create new df of numerical values for regression calculations
regression_columns = ['Score', 'Pathos_Counts', 'Ethos_Counts', 'Logos_Counts', 'Sum_Terms']
rhetorical_regression_df = rhetorical_keywords_df[regression_columns].apply(pd.to_numeric)
rhetorical_regression_df

In [None]:
#Check if amount of all term usage is indicative of grade
#Based on results (r = .08, there is little relationship between amount of rhetorical terms used and grade...at least between A and B range essays)
from scipy import stats

#Independent variable (x): score; dependent variable (y): total term count
x = rhetorical_regression_df['Score'].to_numpy()
y = rhetorical_regression_df['Sum_Terms'].to_numpy()

#Fit a linear regression and unpack its key values
slope, intercept, r, p, std_err = stats.linregress(x, y)

def myfunc(x):
  """Return the fitted line's value at x."""
  return slope * x + intercept

#Fitted value for every observed score
mymodel = [myfunc(score) for score in x]

#Scatter of the observations with the fitted line drawn over it
plt.scatter(x, y)
plt.plot(x, mymodel)
plt.xlabel("Score")
plt.ylabel("Sum Counts")
plt.title("Sum Counts By Score")
plt.show()

print("R value for Total Rhetorical Terms is " + str(r))

In [None]:
#Stacked bar chart with one trace per rhetorical term, labelled by essay (Score_ID)
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

#(legend label, count column) for each trace, in drawing order
term_traces = [
    ('Pathos Counts', 'Pathos_Counts'),
    ('Ethos Counts', 'Ethos_Counts'),
    ('Logos Counts', 'Logos_Counts'),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=rhetorical_keywords_df["Score_ID"], y=rhetorical_keywords_df[column])
    for label, column in term_traces
])

#Set the title and stack the three traces
fig.update_layout(title_text='Counts of Each Rhetorical Term in Each Essay', barmode='stack')
fig.show()

In [None]:
#Check if amount of usages of each individual term (pathos, logos, ethos) is indicative of grade
from scipy import stats


def plot_term_regression(term):
    """Plot <term>_Counts against Score with a fitted regression line.

    term: capitalized term name ("Pathos", "Logos" or "Ethos"); the counts are read
    from the matching "<term>_Counts" column of rhetorical_regression_df.
    Returns the correlation coefficient (r) of the linear regression.
    """
    #Independent variable (x): score; dependent variable (y): counts of this term
    x = np.array(rhetorical_regression_df['Score'])
    y = np.array(rhetorical_regression_df[term + '_Counts'])

    fit = stats.linregress(x, y)

    #Scatter of the observations with the fitted line drawn over it
    plt.scatter(x, y)
    plt.plot(x, fit.slope * x + fit.intercept)
    plt.title(term + " Counts By Score")
    plt.xlabel("Score")
    plt.ylabel(term + " Counts")
    plt.show()

    return fit.rvalue


#Same analysis, same labelling, for every term
for term in ("Pathos", "Logos", "Ethos"):
    r = plot_term_regression(term)
    print("R value for " + term + " is " + str(r))

In [None]:
#Plot # paragraphs in which terms were used vs. essay grade?
##In other words, do more successful writers use terms in multiple paragraphs (indicating more coherence)?
import plotly.graph_objects as go

#Each row of rhetorical_keywords_df_no_blanks is one paragraph that uses at least one term,
#so counting rows per Score_ID gives the number of such paragraphs per essay
new_Series = rhetorical_keywords_df_no_blanks['Score_ID'].value_counts(ascending=True)

#Name both columns explicitly instead of renaming whatever reset_index() produces:
#the default column names of value_counts().reset_index() differ between pandas versions
df3 = new_Series.rename_axis('Score_ID').reset_index(name='Paragraph_Counts')

#Score_ID has the form "Score: <score>, ID:<id>": the first part is the score, the second the ID
df3[['Score','ID']] = df3['Score_ID'].str.split(", ", expand=True)

df3

#Plot paragraph counts per paper
fig = go.Figure(data=[
    go.Bar(name='Paragraph Counts', x=df3["Score_ID"], y=df3["Paragraph_Counts"]),
])
# Set the title, stack the bars and order the x axis by category label
fig.update_layout(title_text='Number of Paragraphs Where Rhetorical Terms Were Used')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
fig.show()

In [None]:
#Pull the score out of Score_ID, which has the form "Score: <score>, ID:<id>"
df3[['Score','ID']] = df3['Score_ID'].str.split(", ", expand=True)

#Remove the literal "Score: " prefix
#(str.lstrip would strip a set of characters, not the prefix itself)
df3['Score'] = df3['Score'].str.replace(r'^Score: ', '', regex=True)

#Keep only the two columns needed for the regression, converted to numbers
df3 = df3[['Score','Paragraph_Counts']].apply(pd.to_numeric)
df3

In [None]:
#Check if the number of paragraphs in which terms are used is indicative of grade
#NOTE(review): an r of .08 was recorded for this analysis, the same figure recorded for the
#total-count regression -- re-confirm it against the r value printed below
from scipy import stats

#Independent variable (x): score; dependent variable (y): number of paragraphs using a term
x = np.array(df3['Score'])
y = np.array(df3['Paragraph_Counts'])

#Return key values of linear regression
slope, intercept, r, p, std_err = stats.linregress(x, y)

def myfunc(x):
  """Return the fitted line's value at x."""
  return slope * x + intercept

#Fitted value for every observed score
mymodel = list(map(myfunc, x))

#Scatter of the observations with the fitted line drawn over it
plt.scatter(x, y)
plt.plot(x, mymodel)
plt.title("Paragraph Counts By Score")
plt.xlabel("Score")
plt.ylabel("Paragraph Counts")
plt.show()

print("R value for Terms per Paragraph is " + str(r))

In [None]:
rhetorical_keywords_df_no_blanks

In [None]:
import plotly.express as px

#Keep only rows whose paragraph position is below 50
within_range = rhetorical_keywords_df_no_blanks['Paragraph'] < 50
rhetorical_df_filtered = rhetorical_keywords_df_no_blanks[within_range]

#One line per essay (Score_ID): term count at each paragraph position
fig = px.line(
    rhetorical_df_filtered,
    x='Paragraph',
    y='Sum_Terms',
    color='Score_ID',
    markers=True,
)
fig.update_layout(title_text='Term Usage by Paragraph (B-, C-Range, and D-Range Papers)')
fig.show()

## 6. Get Positions of Term Usages

In [None]:
rhetorical_keywords_df_no_blanks

In [None]:
#https://stackoverflow.com/questions/4664850/how-to-find-all-occurrences-of-a-substring
#Work on an independent copy so that adding columns does not write into a filtered
#selection of rhetorical_keywords_df (avoids pandas' SettingWithCopyWarning)
rhetorical_keywords_df_no_blanks = rhetorical_keywords_df_no_blanks.copy()

#For each term, record the character offset of every occurrence within each paragraph
#(one list of offsets per row; an empty list when the term does not occur)
for term in ('pathos', 'ethos', 'logos'):
    term_pattern = re.compile(term)
    rhetorical_keywords_df_no_blanks[term.capitalize() + '_Positions'] = [
        [match.start() for match in term_pattern.finditer(text)]
        for text in rhetorical_keywords_df_no_blanks['Text']
    ]

rhetorical_keywords_df_no_blanks


## 7. Get Rhetorical Term Synonyms with Word2Vec

In [None]:
#Create new dataframe for word2vec
word2vec_essays = essays_grades_master[['ID', 'Portfolio_Score', 'NoStops_Text']].copy()
word2vec_essays

#Split dataframe into three groups based on grades:
#  c_range: score <= low, b_range: low < score <= high, a_range: score > high
low = 83
high = 93
scores = word2vec_essays['Portfolio_Score']
c_range = word2vec_essays[scores <= low]
#Both bounds are applied in one mask built from the dataframe being filtered, rather than
#indexing the already filtered b_range with a mask computed on word2vec_essays
b_range = word2vec_essays[(scores > low) & (scores <= high)]
a_range = word2vec_essays[scores > high]

### Word2Vec on A Range Essays

In [None]:
#Define spaCy function to lemmatize, remove stopwords and non-alphanumeric characters
nlp = spacy.load('en_core_web_sm')

def cleaning(doc):
    """Return the lemmas of doc's non-stopword tokens joined by spaces.

    doc is iterated token by token; each token must expose `is_stop` and `lemma_`
    (as a spaCy Doc's tokens do). Returns None when fewer than three lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    # Word2Vec learns a word's vector from its context words, so a text of only
    # one or two words adds very little to training -- leave it out
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
#Replace every run of characters other than letters and apostrophes with a space, then lowercase
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letters.sub(' ', str(row)).lower() for row in a_range['NoStops_Text'])

#Run the texts through the spaCy pipeline in batches and clean each resulting doc
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

#Put results in a new dataframe, dropping missing entries and duplicates
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

In [None]:
#Use bigrams to detect common phrases
from gensim.models.phrases import Phrases, Phraser

#Take list of list of words as input
sent = [row.split() for row in df_clean['clean']]

#Creates relevant list of phrases from all sentences
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)

#Transform the corpus based on the bigrams detected:
sentences = bigram[sent]

In [None]:
#Count word frequency
#word_freq maps each token (single word or detected bigram) to its number of occurrences;
#missing keys start at 0 because of defaultdict(int)
word_freq = defaultdict(int)
#NOTE: the loop variable reuses the name `sent`, so after this cell `sent` refers to the
#last sentence rather than to the list built in the bigram cell
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
#Number of distinct tokens
len(word_freq)

In [None]:
#Get most frequent words
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
#Import word2vec
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [None]:
#Build word2vecmodel (check how to set parameters in tutorial)
#NOTE(review): gensim 4 renamed the `size` argument to `vector_size` -- confirm which gensim version this runs on
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     #Leave one core free, but always request at least one worker:
                     #cores-1 would be 0 on a single-core machine
                     workers=max(1, cores-1))

In [None]:
#Build vocab table--digest all words, filter out unique words, do counts on them
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
#Set parameters to train the model
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
#Now that the model has been trained, make it more memory efficient
w2v_model.init_sims(replace=True)

In [None]:
#Find most similar words to pathos in a-range essays
w2v_model.wv.most_similar(positive=["pathos"])

In [None]:
#Find most similar words to key terms in corpus
w2v_model.wv.most_similar(positive=["ethos"])

In [None]:
#Find most similar words to key terms in corpus
#I think I cleaned logos out of the corpus accidentally! Only "logo" shows up
w2v_model.wv.most_similar(positive=["logo"])

In [None]:
#Check similarity between words
w2v_model.wv.similarity("pathos", 'logo')

In [None]:
#Analogy difference
w2v_model.wv.most_similar(positive=["pathos", "logo"], negative=["emotion"], topn=3)

### Word2Vec on B Range Essays

In [None]:
#Define spaCy function to lemmatize, remove stopwords and non-alphanumeric characters
nlp = spacy.load('en_core_web_sm')

def cleaning(doc):
    """Return the lemmas of doc's non-stopword tokens joined by spaces.

    doc is iterated token by token; each token must expose `is_stop` and `lemma_`
    (as a spaCy Doc's tokens do). Returns None when fewer than three lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    # Word2Vec learns a word's vector from its context words, so a text of only
    # one or two words adds very little to training -- leave it out
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
#Replace every run of characters other than letters and apostrophes with a space, then lowercase
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letters.sub(' ', str(row)).lower() for row in b_range['NoStops_Text'])

#Run the texts through the spaCy pipeline in batches and clean each resulting doc
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

#Put results in a new dataframe, dropping missing entries and duplicates
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

In [None]:
#Use bigrams to detect common phrases
from gensim.models.phrases import Phrases, Phraser

#Take list of list of words as input
sent = [row.split() for row in df_clean['clean']]

#Creates relevant list of phrases from all sentences
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)

#Transform the corpus based on the bigrams detected:
sentences = bigram[sent]

In [None]:
#Count word frequency
#word_freq maps each token (single word or detected bigram) to its number of occurrences;
#missing keys start at 0 because of defaultdict(int)
word_freq = defaultdict(int)
#NOTE: the loop variable reuses the name `sent`, so after this cell `sent` refers to the
#last sentence rather than to the list built in the bigram cell
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
#Number of distinct tokens
len(word_freq)

In [None]:
#Get most frequent words
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
#Import word2vec
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [None]:
#Build word2vecmodel (check how to set parameters in tutorial)
#NOTE(review): gensim 4 renamed the `size` argument to `vector_size` -- confirm which gensim version this runs on
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     #Leave one core free, but always request at least one worker:
                     #cores-1 would be 0 on a single-core machine
                     workers=max(1, cores-1))

In [None]:
#Build vocab table--digest all words, filter out unique words, do counts on them
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
#Set parameters to train the model
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
#Now that the model has been trained, make it more memory efficient
w2v_model.init_sims(replace=True)

In [None]:
#Find most similar words to key terms in corpus
w2v_model.wv.most_similar(positive=["pathos"])

In [None]:
#Find most similar words to key terms in corpus
w2v_model.wv.most_similar(positive=["ethos"])

In [None]:
#Find most similar words to key terms in corpus
#I think I cleaned logos out of the corpus accidentally! Only "logo" shows up
w2v_model.wv.most_similar(positive=["logo"])

In [None]:
#Check similarity between words
w2v_model.wv.similarity("pathos", 'ethos')

In [None]:
#Analogy difference
w2v_model.wv.most_similar(positive=["pathos", "ethos"], negative=["emotion"], topn=3)

### Word2Vec on C Range Essays

In [None]:
#Define spaCy function to lemmatize, remove stopwords and non-alphanumeric characters
nlp = spacy.load('en_core_web_sm')

def cleaning(doc):
    """Return the lemmas of doc's non-stopword tokens joined by spaces.

    doc is iterated token by token; each token must expose `is_stop` and `lemma_`
    (as a spaCy Doc's tokens do). Returns None when fewer than three lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    # Word2Vec learns a word's vector from its context words, so a text of only
    # one or two words adds very little to training -- leave it out
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
#Replace every run of characters other than letters and apostrophes with a space, then lowercase
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letters.sub(' ', str(row)).lower() for row in c_range['NoStops_Text'])

#Run the texts through the spaCy pipeline in batches and clean each resulting doc
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

#Put results in a new dataframe, dropping missing entries and duplicates
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

In [None]:
#Use bigrams to detect common phrases
from gensim.models.phrases import Phrases, Phraser

#Take list of list of words as input
sent = [row.split() for row in df_clean['clean']]

#Creates relevant list of phrases from all sentences
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)

#Transform the corpus based on the bigrams detected:
sentences = bigram[sent]

In [None]:
#Count word frequency
#word_freq maps each token (single word or detected bigram) to its number of occurrences;
#missing keys start at 0 because of defaultdict(int)
word_freq = defaultdict(int)
#NOTE: the loop variable reuses the name `sent`, so after this cell `sent` refers to the
#last sentence rather than to the list built in the bigram cell
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
#Number of distinct tokens
len(word_freq)

In [None]:
#Get most frequent words
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
#Import word2vec
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [None]:
#Build word2vecmodel (check how to set parameters in tutorial)
#NOTE(review): gensim 4 renamed the `size` argument to `vector_size` -- confirm which gensim version this runs on
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     #Leave one core free, but always request at least one worker:
                     #cores-1 would be 0 on a single-core machine
                     workers=max(1, cores-1))

In [None]:
#Build vocab table--digest all words, filter out unique words, do counts on them
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
#Set parameters to train the model
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
#Now that the model has been trained, make it more memory efficient
w2v_model.init_sims(replace=True)

In [None]:
#Find most similar words to key terms in corpus of c-range essays
w2v_model.wv.most_similar(positive=["pathos"])

In [None]:
#Find most similar words to key terms in corpus
w2v_model.wv.most_similar(positive=["ethos"])

In [None]:
#Find most similar words to key terms in corpus
#I think I cleaned logos out of the corpus accidentally! Only "logo" shows up
w2v_model.wv.most_similar(positive=["logo"])

In [None]:
#Check similarity between words
w2v_model.wv.similarity("pathos", 'ethos')

In [None]:
#Analogy difference
w2v_model.wv.most_similar(positive=["pathos", "ethos"], negative=["emotion"], topn=3)