<a href="https://colab.research.google.com/github/mkane968/Text-Mining-with-Student-Papers/blob/main/notebooks/Section_Texts_Based_on_Outcomes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Upload Texts for Analysis

In [None]:
# Mount Google Drive at /content/drive inside this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Select the tsv file to upload from a local folder
from google.colab import files

# `uploaded` is indexed by file name in the next cell to get that file's bytes.
uploaded = files.upload()

In [None]:
# Read the uploaded file into a dataframe
import io
import pandas as pd

# NOTE(review): the file is named .tsv but is parsed with a comma separator — confirm the delimiter.
raw_bytes = uploaded['all_output.tsv']
df = pd.read_csv(io.BytesIO(raw_bytes), sep=',', index_col=0)
df.head()

# Get Sections of Each Essay Containing Rhetorical Analysis Terms

Outcome: *To learn to employ rhetorical terms and strategies and strengthen your ability to analyze rhetorical techniques in published essays and visual texts.*

In [None]:
# Only one cleaned version of the text is needed here, so drop the other derived columns
unused_columns = ["lemma_list", "Text_Lowercased", "Text_NoHeaders", 'Text_NoPunct', "pos_list", "ent_list"]
df_rhetorical = df.drop(columns=unused_columns)
df_rhetorical.head()

In [None]:
#Set up column for score plus ID
df_rhetorical['ID + Score'] = df_rhetorical['ID'].astype(str) + '_' + df_rhetorical['Final Portfolio'].astype(str)

#Count number of occurrences of rhetorical terms in each paper
#(str.count counts pattern matches anywhere in the 'no_stops' text, including inside longer words)
pathos_counts = df_rhetorical['no_stops'].str.count('pathos')
ethos_counts = df_rhetorical['no_stops'].str.count('ethos')
logos_counts = df_rhetorical['no_stops'].str.count('logos')

#Store the counts as columns: the plotting cells read
#df_rhetorical["Pathos_Counts"], ["Ethos_Counts"] and ["Logos_Counts"]
df_rhetorical['Pathos_Counts'] = pathos_counts
df_rhetorical['Ethos_Counts'] = ethos_counts
df_rhetorical['Logos_Counts'] = logos_counts

df_rhetorical.head()


In [None]:
#Graph number of pathos, ethos and logos mentions across essays
#https://plotly.com/python/bar-charts/
import plotly.graph_objects as go

# One bar trace per term: x is the portfolio score, y is that term's count column.
bar_specs = [
    ('Pathos Counts', "Pathos_Counts"),
    ('Ethos Counts', "Ethos_Counts"),
    ('Logos Counts', "Logos_Counts"),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=df_rhetorical["Final Portfolio"], y=df_rhetorical[column])
    for label, column in bar_specs
])
# Stack the three traces on top of each other
fig.update_layout(barmode='stack')
fig.show()

In [None]:
#Use concordancing to get context around each rhetorical term

def concordance(ci, word, width=400, lines=25):
    """
    Rewrite of nltk.text.ConcordanceIndex.print_concordance that returns results
    instead of printing them.

    See:
    http://www.nltk.org/api/nltk.html#nltk.text.ConcordanceIndex.print_concordance

    Parameters:
        ci: index object exposing ``offsets(word)`` and a ``_tokens`` sequence
            (e.g. an nltk ConcordanceIndex).
        word: token to look up.
        width: approximate total character width of each returned line.
        lines: maximum number of matches to return.

    Returns:
        A list of strings of the form "<left context> <token> <right context>",
        one per match, in the order given by ``ci.offsets(word)``.
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 2 # approx number of words of context

    tokens = ci._tokens
    results = []
    for i in ci.offsets(word)[:max(lines, 0)]:
        # Clamp the start at 0. A negative start index would be counted from
        # the END of the token list, which yields an empty left context for
        # matches near the beginning of a long text.
        start = max(0, i - context)
        left = ' ' * half_width + ' '.join(tokens[start:i])
        right = ' '.join(tokens[i+1:i+context])
        # Left side is padded and trimmed so every match sits in the same column.
        results.append('%s %s %s' % (left[-half_width:], tokens[i], right[:half_width]))

    return results

In [None]:
#Test out concordancing on one sentence
import nltk
from nltk import Text, word_tokenize
from nltk.text import ConcordanceIndex

nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working on either version.
nltk.download('punkt_tab')

test = "This is a test. These are test sentences. This is another test sentence. There are many test sentences, but this one is the best."

ci = ConcordanceIndex(word_tokenize(test))
results = concordance(ci, 'is')

results

In [None]:
#Get context around each instance of pathos in each essay and append to dataframe
pathos_results = []
for text in df_rhetorical['no_stops']:
  ci = ConcordanceIndex((word_tokenize(text)))
  results = concordance(ci, 'pathos')
  pathos_results.append(results)

# One row per essay (in df_rhetorical row order), one column per match
pathos_df = pd.DataFrame(pathos_results)

# Attach the labels by POSITION. Passing the Series itself would align on index
# labels, and pathos_df has a fresh 0..n-1 index while df_rhetorical keeps the
# index read from the file, so rows could be mislabelled or filled with NaN.
pathos_df.insert(loc = 0,
          column = 'ID_Score',
          value = df_rhetorical['ID + Score'].to_numpy())

pathos_df.head()


In [None]:
#Get similar context around use of pathos in each essay
# NOTE(review): this loop performs the same tokenise + concordance(ci, 'pathos')
# steps as the cell that builds pathos_results, so it produces the same per-essay
# lists — confirm whether a different lookup was intended here.
similar_pathos_results = []
for text in df_rhetorical['no_stops']:
  ci = ConcordanceIndex((word_tokenize(text)))
  results = concordance(ci, 'pathos')
  similar_pathos_results.append(results)

similar_pathos_results

In [None]:
#Associate each instance with score and reset index, tidy column names
# Guarded so the cell can be re-run: after the first run 'ID_Score' is already
# the index and set_index would raise a KeyError.
if 'ID_Score' in pathos_df.columns:
  pathos_df = pathos_df.set_index('ID_Score')
# Reshape from one column per match to one row per (essay, match slot, context)
pathos_clean = pathos_df.stack().reset_index()

# The middle column holds the 0-based slot of the match within its essay's results
pathos_clean.columns = ["ID_Score","Pathos_Count","Pathos_Context"]

pathos_clean.head()

In [None]:
#Repeat the above steps for ethos concordancing

#Get context around each instance of ethos in each essay and append to dataframe
ethos_results = []
for text in df_rhetorical['no_stops']:
  ci = ConcordanceIndex((word_tokenize(text)))
  results = concordance(ci, 'ethos')
  ethos_results.append(results)

# One row per essay (in df_rhetorical row order), one column per match
ethos_df = pd.DataFrame(ethos_results)

# Attach the labels by POSITION. Passing the Series itself would align on index
# labels, and ethos_df has a fresh 0..n-1 index while df_rhetorical keeps the
# index read from the file, so rows could be mislabelled or filled with NaN.
ethos_df.insert(loc = 0,
          column = 'ID_Score',
          value = df_rhetorical['ID + Score'].to_numpy())

#Associate each instance with score and reset index, tidy column names
ethos_df = ethos_df.set_index('ID_Score')
ethos_clean = ethos_df.stack().reset_index()

# The middle column holds the 0-based slot of the match within its essay's results
ethos_clean.columns = ["ID_Score","Ethos_Count","Ethos_Context"]

ethos_clean.head()


In [None]:
#Repeat the above steps for logos concordancing

#Get context around each instance of logos in each essay and append to dataframe
logos_results = []
for text in df_rhetorical['no_stops']:
  ci = ConcordanceIndex((word_tokenize(text)))
  results = concordance(ci, 'logos')
  logos_results.append(results)

# One row per essay (in df_rhetorical row order), one column per match
logos_df = pd.DataFrame(logos_results)

# Attach the labels by POSITION. Passing the Series itself would align on index
# labels, and logos_df has a fresh 0..n-1 index while df_rhetorical keeps the
# index read from the file, so rows could be mislabelled or filled with NaN.
logos_df.insert(loc = 0,
          column = 'ID_Score',
          value = df_rhetorical['ID + Score'].to_numpy())

#Associate each instance with score and reset index, tidy column names
logos_df = logos_df.set_index('ID_Score')
logos_clean = logos_df.stack().reset_index()

# The middle column holds the 0-based slot of the match within its essay's results
logos_clean.columns = ["ID_Score","Logos_Count","Logos_Context"]

logos_clean.head()


In [None]:
#Combine into single dataframe and download
import functools as ft

def _merge_on_id(left, right):
  """Join two context tables on their shared 'ID_Score' column."""
  return pd.merge(left, right, on='ID_Score')

rhetorical_dfs = [logos_clean, pathos_clean, ethos_clean]
rhetorical_df_final = ft.reduce(_merge_on_id, rhetorical_dfs)

#Blank out repeated context values (replicated during merge)
for context_column in ('Pathos_Context', 'Logos_Context', 'Ethos_Context'):
  repeated = rhetorical_df_final[context_column].duplicated()
  rhetorical_df_final.loc[repeated, context_column] = 'None'

rhetorical_df_final

#Write the combined table to a file and download it
# NOTE(review): to_csv is called without `sep`, so the output is presumably
# comma-separated despite the .tsv file name — confirm the intended delimiter.
output_name = 'rhetorical_context_df.tsv'
rhetorical_df_final.to_csv(output_name, encoding = 'utf-8-sig')
files.download(output_name)

# Get Sections of Each Essay Containing Citations

Outcome: To learn to employ academic evidence 

In [None]:
#https://ideone.com/IqZvxm
#https://levelup.gitconnected.com/count-citations-in-a-word-document-with-python-and-regular-expressions-d068218c50b9

import re
# Parenthetical citations: "(" + an unquoted or double-quoted source + ", " +
# a number, "n.d.", or a number followed by one word character + ")"
pattern = r'\(([^"\)]*|\bAnonymous\b|"[^"\)]*")(, )([\d]+|n\.d\.|[\d]+[\w])\)'
# Not used by any visible cell; kept in case other code reads it.
num_replaces = 100000000


# Collect the matched citation text for each essay.
# re.finditer returns a lazy iterator; storing the iterator itself would only
# display iterator objects and could be consumed just once, so the matched
# strings are materialised into a list per essay instead.
citation_results = []
for text in df['Text_NoHeaders']:
  results = [match.group(0) for match in re.finditer(pattern, text)]
  citation_results.append(results)

citation_results

# Get Sections of Each Essay Containing Argumentative Terms

Outcome: To develop competent academic arguments 

In [None]:
#https://www.sciencedirect.com/science/article/abs/pii/S147515851730005X

# Old code

In [None]:
# Left-join the pathos and then the ethos contexts onto the logos contexts by essay ID
rhetorical_context_df = (
    logos_clean
    .merge(pathos_clean, how='left', on='ID_Score')
    .merge(ethos_clean, how='left', on='ID_Score')
)

#Blank out repeated context values (replicated during merge)
for context_column in ('Pathos_Context', 'Logos_Context', 'Ethos_Context'):
  repeated = rhetorical_context_df[context_column].duplicated()
  rhetorical_context_df.loc[repeated, context_column] = 'None'

rhetorical_context_df

#Write the combined table to a file and download it
output_name = 'rhetorical_context_df.tsv'
rhetorical_context_df.to_csv(output_name, encoding = 'utf-8-sig')
files.download(output_name)

In [None]:
#https://avidml.wordpress.com/2017/08/05/natural-language-processing-concordance/
def concordanceBySentence(Ngram, rawText):
    """
    Return the sentences of ``rawText`` that contain ``Ngram``.

    The raw text is split into sentences with nltk.sent_tokenize, and each
    sentence is checked with a case-insensitive substring test. A header line
    is always printed, plus a notice when no sentence matches.

    Input: the N-gram to look for and the raw text to search.
    Output: list of matching sentences (empty when none match).
    """
    print('\nWhole sentence concordance for the N-gram: %s' % (Ngram))

    sentences = nltk.sent_tokenize(rawText)
    needle = Ngram.lower()

    # Keep every sentence whose lower-cased form contains the lower-cased N-gram.
    matchingSentenceList = [s for s in sentences if needle in s.lower()]

    if not matchingSentenceList:
        print('No sentences were found with the N-gram: %s \n' % (needle))

    return matchingSentenceList
 

In [None]:
import nltk
nltk.download('punkt')
from nltk import Text, word_tokenize

# Scratch check of a width-200 concordance lookup on a one-sentence sample.
text = 'This is a sentence'
c_text = Text(word_tokenize(text))
# NOTE(review): new_concordance is not defined in any visible cell of this
# notebook — confirm where it comes from before running this cell.
new_concordance('pathos', c_text, width=200)

In [None]:
#Experiments with concordancing
#https://www.nltk.org/howto/concordance.html
import nltk
from nltk import Text, word_tokenize

# NOTE(review): the visible cell that builds df_rhetorical drops 'Text_NoHeaders',
# and new_concordance is not defined in any visible cell — confirm both before running.
for text in df_rhetorical['Text_NoHeaders']:
  c_text = Text(word_tokenize(text))
  new_concordance('pathos', c_text, width=200)
  



In [None]:
#Experiments with concordancing
#https://www.nltk.org/howto/concordance.html
import nltk
from nltk import Text, word_tokenize

# NOTE(review): the visible cell that builds df_rhetorical drops 'Text_NoHeaders' — confirm the column exists.
for text in df_rhetorical['Text_NoHeaders']:
  c_text = Text(word_tokenize(text))
  # Presumably Text.concordance prints its matches and returns None, in which
  # case con_list holds nothing useful — verify against the NLTK docs.
  con_list = c_text.concordance('pathos')




In [None]:
# Show the current value of `test` (the name is assigned in more than one cell of this notebook).
print(test)

In [None]:
#Experiments with concordancing
#https://www.nltk.org/howto/concordance.html
import nltk
from nltk import Text, word_tokenize
# Holds one entry per essay: whatever concordance_list('pathos') returns for it.
all_con_list = []

# NOTE(review): the visible cell that builds df_rhetorical drops 'Text_NoHeaders' — confirm the column exists.
for text in df_rhetorical['Text_NoHeaders']:
  c_text = Text(word_tokenize(text))
  con_list = c_text.concordance_list('pathos')
  all_con_list.append(con_list)
all_con_list


In [None]:
# First concordance entry collected for the first essay
all_con_list[0][0]

In [None]:
#https://github.com/sgsinclair/alta/blob/a482d343142cba12030fea4be8f96fb77579b3ab/ipynb/utilities/Concordances.ipynb
def makeConc(word2conc,list2FindIn,context2Use,concList):
    """
    Append a "<position>: <context>" line to concList for every item of
    list2FindIn that equals word2conc.

    The context is the matched item plus up to context2Use items on each side,
    joined with single spaces and clipped at the ends of list2FindIn.
    Nothing is returned; concList is modified in place.
    """
    end = len(list2FindIn)
    for location, token in enumerate(list2FindIn):
        if token == word2conc:
            # Clip the window to the bounds of the list
            beginCon = max(0, location - context2Use)
            endCon = min(end, location + context2Use + 1)
            concordanceLine = ' '.join(list2FindIn[beginCon:endCon])
            concList.append(str(location) + ": " + concordanceLine)

theConc = []
# NOTE(review): word2find, listOfTokens and context are not defined in any
# visible cell of this notebook — confirm where they come from.
makeConc(word2find,listOfTokens,int(context),theConc)
# Show the last five concordance lines
theConc[-5:]


In [None]:
#https://simply-python.com/2014/03/14/saving-output-of-nltk-text-concordance/
def get_all_phrases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10):
    """
        Function to get all the phrases that contain the target word in a text/passage tar_passage.
        Workaround to save the output given by nltk Concordance function

        str target_word, str tar_passage int left_margin int right_margin --> list of str
        left_margin and right_margin allocate the number of words/punctuation before and after target word
        Left margin will take note of the beginning of the text

        Each returned phrase is its tokens joined with a trailing space after every token.
        Tokens are indexed in lower case, so matching is case-insensitive.
    """

    ## Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)

    ## Create the text of tokens
    text = nltk.Text(tokens)

    ## Collect all the index or offset position of the target word
    c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())

    ## Collect the window of tokens around each offset.
    ## The start is clamped at 0 so a match near the beginning keeps its left context.
    ## (The earlier form subscripted a map object, which fails on Python 3, and
    ## stepped back a fixed 5 tokens instead of left_margin.)
    concordance_txt = [text.tokens[max(0, offset - left_margin):offset + right_margin]
                       for offset in c.offsets(target_word)]

    ## join the tokens of each target phrase and return them
    return [''.join([x+' ' for x in con_sub]) for con_sub in concordance_txt]



In [None]:
# NOTE(review): text4 is not defined in any visible cell, and dispersion_plot is
# only referenced here, not called — confirm what this cell was meant to do.
text4.dispersion_plot

In [None]:
#Experiments with keyword function

test = []
def process(words, search):
    """
    Store the window around the first occurrence of ``search`` in ``words``.

    When ``search`` is found, the slice from 20 positions before the match to
    20 positions after its start is appended to the module-level ``test`` list
    and None is returned. When it is not found, an empty list is returned and
    ``test`` is left untouched.
    """
    if search in words:
        pos = words.index(search)
        # Clamp the start at 0: a negative start would be counted from the end
        # of the sequence and produce an empty or wrong window for early matches.
        context = words[max(0, pos-20):pos+20]
        test.append(context)
    else:
        return []

# NOTE(review): each value is handed to process() as-is; if these are plain
# strings, the membership test and window work on characters rather than
# word tokens — confirm that is intended.
for text in df_rhetorical['Text_NoHeaders']:
  process(text, "pathos")


In [None]:
# Inspect the second item stored in `test`
test[1]

In [None]:
#Experiments with concordancing
#https://www.nltk.org/howto/concordance.html
# Print every phrase returned for 'pathos', essay by essay.
# NOTE(review): the visible cell that builds df_rhetorical drops 'Text_NoHeaders' — confirm the column exists.
for text in df_rhetorical['Text_NoHeaders']:
  results = get_all_phrases_containing_tar_wrd('pathos', text)
  for result in results:
    print(result)
 

In [None]:
#Experiments with KWIC analysis (ngrams)
#https://programminghistorian.org/en/lessons/keywords-in-context-using-n-grams

# `i` is assigned here but not read anywhere else in this cell.
i = 0

# Print the first four whitespace-separated words of each essay.
for text in df_rhetorical['Text_NoHeaders']:
  wordlist = text.split()
  print(wordlist[0:4])
  

def getNGrams(wordlist, n):
    """
    Return every run of n consecutive items of wordlist, in order.

    Each n-gram is a slice of wordlist; the result is empty when wordlist
    holds fewer than n items.
    """
    window_count = len(wordlist) - (n - 1)
    return [wordlist[start:start + n] for start in range(window_count)]

import obo


In [None]:

# Wrap each essay in an nltk Text object and store it in a 'Text' column.
# (A dangling `(lambda row: ..., axis=1)` fragment that made this cell a
# SyntaxError has been dropped; it duplicated the arguments of the call below.)
# NOTE(review): Text is given the raw column value; if that is an untokenised
# string, confirm whether word_tokenize should be applied first.
df_rhetorical['Text'] = df_rhetorical.apply(lambda row: Text(row['Text_NoHeaders']), axis=1)


In [None]:
import nltk
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working on either version.
nltk.download('punkt_tab')
# Tokenise each essay into a list of word tokens (one list per row)
df_rhetorical['Tokens'] = df_rhetorical['Text_NoHeaders'].apply(nltk.word_tokenize)
df_rhetorical


In [None]:
def process(words, search):
    """
    Return the window of items around the first occurrence of ``search``.

    The window runs from 10 positions before the match to 10 positions after
    its start, clipped to the bounds of ``words``. An empty list is returned
    when ``search`` does not occur (including when ``words`` is empty).
    """
    if search not in words:
        return []
    pos = words.index(search)
    # Clamp the start at 0: a negative start would be counted from the end of
    # the list and produce an empty or wrong window for early matches.
    return words[max(0, pos-10):pos+10]

# Window of tokens around the first 'pathos' in each essay ([] when absent).
# (A dangling `for` statement with no body, which made this cell a SyntaxError,
# has been dropped.)
df_rhetorical["Pathos"] = df_rhetorical["Tokens"].apply(lambda words:process(words, 'pathos'))

df_rhetorical

In [None]:
# Apply process(words, 'pathos') to every token list and keep the result in a 'Pathos' column
df_rhetorical["Pathos"] = df_rhetorical["Tokens"].apply(lambda words:process(words, 'pathos'))

# Show just the new column
df_rhetorical["Pathos"]

In [None]:
# Split each essay on 'pathos', keep the first word of every resulting segment,
# and join those words back together with 'pathos ' between them.
# The pattern is a raw string so that \w is not treated as a string escape, and
# missing segments (rows with fewer splits than the widest row) are skipped so
# join never receives a non-string value.
df_rhetorical.Text_NoHeaders.str.split('pathos', expand=True)\
  .apply(lambda x: x.str.extract(r'(\w+)', expand=False))\
  .apply(lambda x: 'pathos '.join(x.dropna()), axis=1)

In [None]:
# Build the 5-grams of each essay's whitespace-separated words.
# getNGrams works on one word list, so it is applied per essay; calling it on
# the whole column and passing its result to DataFrame.apply cannot work
# because apply expects a function.
df_rhetorical['Text_NoHeaders'].apply(lambda text: getNGrams(text.split(), 5))

In [None]:
def search(text, n, keyword='place'):
    '''
    Find ``keyword`` in ``text`` and retrieve the n words on either side of it.

    text: string to search.
    n: number of words to capture before and after the keyword.
    keyword: literal text to look for; defaults to 'place', the value that was
        previously hard-coded, so existing calls behave as before.

    Returns a pair (words_before, words_after), each a tuple of n strings, for
    the first position where the keyword has n words on both sides. Returns a
    pair of empty tuples when there is no such position.
    '''
    word = r"\W*([\w]+)"
    # re.escape keeps regex metacharacters in the keyword from being interpreted.
    match = re.search(r'{}\W*{}{}'.format(word*n, re.escape(keyword), word*n), text)
    if match is None:
        return (), ()
    groups = match.groups()
    return groups[:n],groups[n:]

In [None]:
#Some basic analyses

#How many instances of each term appear in papers scoring in different grade ranges?
import matplotlib.pyplot as plt

# Order the essays by portfolio score, lowest first, then chart the
# 'Pathos_Counts' column as bars against the score.
df_rhetorical = df_rhetorical.sort_values('Final Portfolio')

df_rhetorical.plot.bar(x='Final Portfolio', y='Pathos_Counts')