# Reading at Scale

This workshop is focused on turning annotations into usable data for Natural Language Processing applications. It focuses primarily on defining and evaluating a formal feature across texts. We will be producing what is referred to as "Gold Standard" Data and then (briefly) seeing what we can do with this data.

### Installations

In [None]:
!pip3 install spacy 

In [None]:
!pip install attachment-downloader

In [None]:
!python -m spacy download en

In [None]:
import numpy as np
from IPython.display import clear_output
from termcolor import colored, cprint
import imaplib
import requests
import spacy
import attachment_downloader
from email import encoders
from email.mime.base import MIMEBase
import os
import smtplib
import csv
from email.mime.multipart import MIMEMultipart
from email.message import Message
from email.mime.text import MIMEText
import matplotlib.pyplot as plt
import re

## Logistics and Scraping Text

Finding a source of clean, easily parsable text is a cornerstone of most DH work. Clean text is available as a string (one of four data types) rather than a pdf or image. We'll be scraping a website for clean text and applying a few processes to make it usable.

In [None]:
def email_results_to_malcolm(annotation_results, filename):
    """Email the file at ``filename`` as an attachment to the workshop addresses.

    Builds a multipart message with the subject 'Annotation Results',
    attaches the file base64-encoded, and sends it through Gmail's SMTP
    server after upgrading the connection with STARTTLS.

    Args:
        annotation_results: Not used by this function; kept so that existing
            calls keep working.
        filename: Path of the CSV file to attach.

    Errors from opening the file or from the SMTP conversation are not caught
    here and propagate to the caller.
    """
    COMMASPACE = ', '

    msg = MIMEMultipart()
    msg['Subject'] = 'Annotation Results'
    emailfrom = "lubinworkshop@gmail.com"
    emailto = ['mtb236@cornell.edu', "lubinworkshop@gmail.com"]

    msg['From'] = emailfrom
    msg['To'] = COMMASPACE.join(emailto)
    msg.preamble = 'List of  audit records '
    csvfiles = [filename]

    # The loop variable is deliberately not called ``csv`` so the csv module
    # is not shadowed inside this function.
    for csv_path in csvfiles:
        print(csv_path)
        # Read raw bytes: a text-mode payload with non-ASCII characters would
        # be escaped rather than attached verbatim.
        with open(csv_path, "rb") as fp:
            record = MIMEBase('application', 'octet-stream')
            record.set_payload(fp.read())
        encoders.encode_base64(record)
        record.add_header('Content-Disposition', 'attachment',
                          filename=os.path.basename(csv_path))
        msg.attach(record)

    print ("INFO: ")
    # NOTE(review): the account password is hard-coded in the notebook; it
    # should be moved to an environment variable or prompt and rotated.
    # The context manager closes the connection even if a step below raises.
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.ehlo()
        server.starttls()
        server.login("lubinworkshop@gmail.com", 'raven1119')
        server.sendmail(emailfrom, emailto, msg.as_string())

In [None]:
# Load the small English spaCy pipeline once; it is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

# Fetch the story page and keep it as a list of raw HTML lines.
result = requests.get("http://xroads.virginia.edu/~hyper/POE/masque.html")
poe_masque = result.text.splitlines()

In [None]:
def para_helper(data):
    """Return the indices of the lines in ``data`` that contain ``<p>``."""
    return [index for index, text in enumerate(data) if "<p>" in text]

In [None]:
def para_extractor(data):
    """Split ``data`` into runs of lines, one run per ``<p>`` marker.

    Each run starts at a line containing ``<p>`` and stops just before the
    next such line. Lines before the first marker, and the lines from the
    final marker onward, are not returned.

    Returns a list of slices of ``data`` (each a list of lines).
    """
    starts = para_helper(data)
    return [data[begin:end] for begin, end in zip(starts, starts[1:])]

In [None]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span (shortest match) removed."""
    return re.sub('<.*?>', '', text)

In [None]:
def para_clean(data):
    """Turn raw HTML lines into a list of plain-text paragraphs.

    For every paragraph returned by ``para_extractor``, tags are stripped from
    each line, lines of three characters or fewer are dropped, and the
    remaining lines are joined with single spaces. Paragraphs left with fewer
    than two lines are discarded.
    """
    cleaned_story = []
    for paragraph in para_extractor(data):
        kept_lines = [
            stripped
            for stripped in map(remove_html_tags, paragraph)
            if len(stripped) > 3
        ]
        if len(kept_lines) > 1:
            cleaned_story.append(" ".join(kept_lines))
    return cleaned_story

### Tidying Text for Annotation

While text in just a string can be more than enough for most applications, adding formatting information will allow us to localize the effects we're noticing; knowing the chapter, sentence, and word position can be invaluable to analysis.

In [None]:
def sentence_tokenize_and_tag(data):
    """Split the story in ``data`` into sentences tagged with their position.

    Args:
        data: Lines of HTML, as accepted by ``para_clean``.

    Returns:
        A list of ``(sentence_text, paragraph_number, sentence_number)``
        tuples. Both numbers are zero-based; ``sentence_number`` restarts at
        0 in every paragraph.
    """
    tagged = []
    for paragraph_number, paragraph_text in enumerate(para_clean(data)):
        # Span.text is available in both spaCy 2 and 3, whereas Span.string
        # no longer exists in spaCy 3.
        sentences = (sent.text.strip() for sent in nlp(paragraph_text).sents)
        for sentence_number, sentence in enumerate(sentences):
            tagged.append((sentence, paragraph_number, sentence_number))
    return tagged
    

In [None]:
st = sentence_tokenize_and_tag(poe_masque)

In [None]:
st

## Annotation and Annotation Guidelines

[Crowdsource Annotation Rules Here]

In [None]:
def basic_annotator(sentence_tokenized_data):
    """Interactively label each sentence as having the feature or not.

    Shows every sentence in bold blue and asks for a key: 'f' records label 1,
    'j' records label 0. Any other answer prints a message and the same
    sentence is asked again. The notebook output is cleared after each
    accepted answer.

    Returns a list of ``[sentence, paragraph_number, sentence_number, label]``
    rows, one per input item, in input order.
    """
    key_to_label = {'f': 1, 'j': 0}
    annotation_results = []
    position = 0
    while position != len(sentence_tokenized_data):
        entry = sentence_tokenized_data[position]
        print(colored(entry[0], 'blue', attrs=['bold']))
        answer = input("Feature Present? yes = f or no = j")
        if answer not in key_to_label:
            print("Not a valid input. Please stop trying to sabotage my workshop.")
            continue
        annotation_results.append([entry[0], entry[1], entry[2], key_to_label[answer]])
        position += 1
        clear_output()
    return annotation_results

In [None]:
annotation_results = basic_annotator(st)

In [None]:
#add last name in both the first and last line#
# Write one CSV row per annotated sentence, then email that file as an
# attachment. The file name must be identical in the open() call and in the
# email call.
with open("LASTNAME_masque_annotation_results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(annotation_results)
email_results_to_malcolm(annotation_results, "LASTNAME_masque_annotation_results.csv")

In [None]:
def print_annotations(annotation_results):
    """Print the annotated sentences paragraph by paragraph, colour-coded.

    Consecutive rows that share a paragraph number (index 1) are printed on
    one output line, joined by spaces; the first sentence of each paragraph is
    prefixed with five spaces. Sentences whose label (index 3) equals 1 are
    coloured yellow, all others grey.

    Args:
        annotation_results: Rows shaped like
            ``[sentence, paragraph_number, sentence_number, label]``.
    """
    element_colors = {'G': 'grey', 'Y': 'yellow'}
    paragraphs = []
    for position, row in enumerate(annotation_results):
        # Only look at the previous row when there is one; index -1 would
        # silently compare the first row with the last row.
        starts_paragraph = (
            position == 0 or annotation_results[position - 1][1] != row[1]
        )
        line_text = row[0]
        if starts_paragraph:
            paragraphs.append([])
            line_text = "     " + line_text
        paragraphs[-1].append(["Y" if row[3] == 1 else "G", line_text])
    for paragraph in paragraphs:
        print(" ".join(colored(text, element_colors[code]) for code, text in paragraph))

In [None]:
print_annotations(annotation_results)

In [None]:
# Word pattern: a run that starts and ends with a word character and may
# contain hyphens or apostrophes in between, or a single word character.
# Raw string so the backslashes reach the regex engine without Python
# treating them as (invalid) string escapes.
wp_Mimno = re.compile(r"\w[\w\-']*\w|\w")

In [None]:
def basic_line_graph(annotation_results):
    """Plot one point per sentence: its token count if labelled 1, else 0.

    Tokens are counted with the module-level ``wp_Mimno`` pattern; the x axis
    is the sentence's position in ``annotation_results``.
    """
    token_counts = [
        len(wp_Mimno.findall(row[0])) if row[3] == 1 else 0
        for row in annotation_results
    ]
    positions = range(len(annotation_results))
    plt.plot(positions, token_counts)
    plt.xlabel('Narrative')
    plt.ylabel('# of Tokens')
    plt.title('Feature Progression')
    plt.show()

In [None]:
basic_line_graph(annotation_results)

In [None]:
def basic_bar_graph(annotation_results):
    """Draw a bar chart of how many sentences per paragraph carry the feature.

    Consecutive rows with the same paragraph number (index 1) form one
    paragraph; a sentence counts when its label (index 3) equals 1. Every
    sentence is considered, including the first sentence of each paragraph,
    and there is exactly one bar per paragraph.

    Args:
        annotation_results: Rows shaped like
            ``[sentence, paragraph_number, sentence_number, label]``.
    """
    flagged_per_paragraph = []
    previous_paragraph = None
    for position, row in enumerate(annotation_results):
        # Open a new bar on the first row and whenever the paragraph changes.
        if position == 0 or row[1] != previous_paragraph:
            flagged_per_paragraph.append(0)
        previous_paragraph = row[1]
        if row[3] == 1:
            flagged_per_paragraph[-1] += 1
    plt.bar(range(len(flagged_per_paragraph)), flagged_per_paragraph)
    plt.xlabel('Paragraph Number')
    plt.ylabel('# of Sentences')
    plt.title('Feature Progression')
    plt.show()

In [None]:
basic_bar_graph(annotation_results)

In [None]:
def weighted_bar_graph(annotation_results):
    """Draw, per paragraph, the mean token count of its flagged sentences.

    Consecutive rows with the same paragraph number (index 1) form one
    paragraph. For each paragraph the bar height is the number of tokens in
    its sentences labelled 1 (index 3) divided by the number of such
    sentences, or 0 when the paragraph has none. Tokens are counted with the
    module-level ``wp_Mimno`` pattern.

    Args:
        annotation_results: Rows shaped like
            ``[sentence, paragraph_number, sentence_number, label]``.
    """
    # One [flagged_sentence_count, flagged_token_count] pair per paragraph.
    totals = []
    previous_paragraph = None
    for position, row in enumerate(annotation_results):
        # Open a new paragraph on the first row and whenever the number changes.
        if position == 0 or row[1] != previous_paragraph:
            totals.append([0, 0])
        previous_paragraph = row[1]
        if row[3] == 1:
            totals[-1][0] += 1
            totals[-1][1] += len(wp_Mimno.findall(row[0]))
    # Same ratio (tokens / sentences) for every paragraph, the last included.
    y = [tokens / sentences if sentences else 0 for sentences, tokens in totals]
    plt.bar(range(len(y)), y)
    plt.xlabel('Paragraph Number')
    plt.ylabel('# of tokens/# of sentences')
    plt.title('Feature Progression')
    plt.show()

In [None]:
weighted_bar_graph(annotation_results)

## Group Annotation Results

Interannotator agreement is important for producing gold standard data; if annotation guidelines allow for too much flexibility or are not clear enough, it is difficult to say with confidence that the feature you think you are observing is being observed.

In [None]:
#Get class CSV files#
!attachment-downloader --host imap.gmail.com --username lubinworkshop@gmail.com --password raven1119 \\
    --imap-folder Inbox --output ~/Downloads

In [None]:
import glob

group_masque_results_csvs = glob.glob("*_masque_annotation_results.csv")

In [None]:
group_masque_results_csvs

In [None]:
def calculate_group_results(results):
    """Sum the label column (index 3) row by row across several CSV files.

    The first file in ``results`` sets the initial per-row totals; each later
    file adds its labels to the totals at the same row positions.

    Returns the list of per-row totals (empty when ``results`` is empty).
    """
    totals = []
    for file_index, filename in enumerate(results):
        with open(filename) as handle:
            labels = (int(row[3]) for row in csv.reader(handle, delimiter=','))
            if file_index == 0:
                totals.extend(labels)
            else:
                for row_index, label in enumerate(labels):
                    totals[row_index] = totals[row_index] + label
    return totals

In [None]:
group_masque_results = calculate_group_results(group_masque_results_csvs)

In [None]:
print(group_masque_results)

In [None]:
def calculate_disagreement(group_results):
    """Sort line indices by how strongly the annotators agreed.

    ``group_results`` holds, per line, how many annotators marked the feature
    as present. The largest value in the list is taken as the number of
    annotators.

    A line is "complete" when its count equals that maximum or is 0. Of the
    remaining lines, a line is "majority" when its count is at least half of
    the maximum. Lines below half are in neither list.
    NOTE(review): the majority test compares the vote count with the maximum;
    confirm this matches the intended definition.

    Prints the number of complete lines, then the number of majority lines.

    Returns:
        ``[complete_line_indices, majority_line_indices]``; both lists are
        empty for empty input.
    """
    # default=0 keeps empty input from raising inside max().
    full = max(group_results, default=0)
    comp_line = []
    maj_line = []
    for line, votes in enumerate(group_results):
        if votes == full or votes == 0:
            comp_line.append(line)
        elif votes >= full / 2:
            maj_line.append(line)
    print(len(comp_line))
    print(len(maj_line))
    return [comp_line, maj_line]
                
    

In [None]:
def show_majority_lines(annotation_results, group_results):
    """Print the sentence text of each line in the second list returned by
    ``calculate_disagreement(group_results)``."""
    _, majority_lines = calculate_disagreement(group_results)
    for index in majority_lines:
        print(annotation_results[index][0])

In [None]:
show_majority_lines(annotation_results, group_masque_results)

Decide on disagreed lines, update accordingly

## Revise Guidelines?

## Expanding Annotation

In [None]:
#Group_One
#shorter#
# Fetch the story, split it into tagged sentences and annotate them.
telltale = requests.get("http://xroads.virginia.edu/~hyper/POE/telltale.html")
poe_telltale = telltale.text.splitlines()
telltale_sent = sentence_tokenize_and_tag(poe_telltale)
telltale_results = basic_annotator(telltale_sent)

#Add last name before running cell#
with open("LASTNAME_telltale_annotation_results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(telltale_results)
# Pass this story's own results, so the cell does not depend on the masque
# annotation cell having been run.
email_results_to_malcolm(telltale_results, "LASTNAME_telltale_annotation_results.csv")

In [None]:
#Group_Two
#medium#
# Fetch the story, split it into tagged sentences and annotate them.
result = requests.get("http://xroads.virginia.edu/~hyper/POE/fact.html")
poe_valdemar = result.text.splitlines()
valdemar_sent = sentence_tokenize_and_tag(poe_valdemar)
valdemar_results = basic_annotator(valdemar_sent)

#Add last name before running cell#
with open("LASTNAME_valdemar_annotation_results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(valdemar_results)
# Pass this story's own results, so the cell does not depend on the masque
# annotation cell having been run.
email_results_to_malcolm(valdemar_results, "LASTNAME_valdemar_annotation_results.csv")

In [None]:
#Group_Three
#longer#
# Fetch the story, split it into tagged sentences and annotate them.
result = requests.get("http://xroads.virginia.edu/~hyper/POE/fall.html")
poe_usher = result.text.splitlines()
usher_sent = sentence_tokenize_and_tag(poe_usher)
usher_results = basic_annotator(usher_sent)

#Add last name before running cell#
with open("LASTNAME_usher_annotation_results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(usher_results)
# Pass this story's own results, so the cell does not depend on the masque
# annotation cell having been run.
email_results_to_malcolm(usher_results, "LASTNAME_usher_annotation_results.csv")

### Comparing Annotations

#Get CSV files#
!attachment-downloader --host imap.gmail.com --username lubinworkshop@gmail.com --password raven1119 \\
    --imap-folder Inbox --output ~/Downloads

In [None]:
group_telltale_results_csvs = glob.glob("*_telltale_annotation_results.csv")
# show_majority_lines needs per-sentence vote totals, not file names, so the
# CSV files are aggregated first.
group_telltale_results = calculate_group_results(group_telltale_results_csvs)
show_majority_lines(telltale_results, group_telltale_results)

In [None]:
group_valdemar_results_csvs = glob.glob("*_valdemar_annotation_results.csv")
# show_majority_lines needs per-sentence vote totals, not file names, so the
# CSV files are aggregated first.
group_valdemar_results = calculate_group_results(group_valdemar_results_csvs)
show_majority_lines(valdemar_results, group_valdemar_results)

In [None]:
group_usher_results_csvs = glob.glob("*_usher_annotation_results.csv")
# show_majority_lines needs per-sentence vote totals, not file names, so the
# CSV files are aggregated first.
group_usher_results = calculate_group_results(group_usher_results_csvs)
show_majority_lines(usher_results, group_usher_results)

## Using Annotations

Group Dataset

In [None]:
#Get CSV files#
!attachment-downloader --host imap.gmail.com --username lubinworkshop@gmail.com --password raven1119 \\
    --imap-folder Inbox --output ~/Downloads

In [None]:
#make this file by hand#
group_annotation_results = []

In [None]:
# Read the hand-compiled CSV and append every row (a list of strings) to
# group_annotation_results. newline="" lets the csv module handle line
# endings itself.
with open("compiled_annotation_results.csv", newline="") as f:
    file = csv.reader(f, delimiter=',')
    for row in file:
        group_annotation_results.append(row)

### SVM Classifier 

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [None]:
# Build the training frame from the compiled rows: column 0 becomes 'text' and
# column 1 becomes 'label'.
# NOTE(review): the per-annotator CSVs written earlier keep the label in
# column 3 and the paragraph number in column 1 — confirm the hand-made
# compiled file really has the label in column 1.
trainDF = pandas.DataFrame()
trainDF['text'] = [i[0] for i in group_annotation_results]
trainDF['label'] = [i[1] for i in group_annotation_results]

In [None]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

In [None]:
# Fit the label-to-integer mapping once, on every label in the dataset, and
# reuse it for both splits. Fitting a second time on the validation labels
# could assign different integers if that split lacks one of the classes.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

In [None]:
# Word-level TF-IDF features, capped at 5000 terms; every run of one or more
# word characters counts as a token.
# NOTE(review): the vectorizer is fitted on all rows of trainDF['text'], i.e.
# including the rows held out for validation — confirm that is intended.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Args:
        classifier: Object with ``fit(X, y)`` and ``predict(X)`` methods.
        feature_vector_train: Training features passed to ``fit``.
        label: Training labels passed to ``fit``.
        feature_vector_valid: Validation features passed to ``predict``.
        is_neural_net: When true, predictions are reduced with
            ``argmax(axis=-1)`` before scoring.
        valid_label: True labels for the validation features. When omitted,
            the module-level ``valid_y`` is used, as before.

    Returns:
        The value of ``metrics.accuracy_score`` for the validation set.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    if valid_label is None:
        # Backward-compatible fallback to the notebook's global.
        valid_label = valid_y

    # accuracy_score takes the true labels first, then the predictions.
    return metrics.accuracy_score(valid_label, predictions)

In [None]:
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM: ", accuracy)