In [1]:
import spacy
import json
import pandas as pd
import requests 
import enchant
from application import *
from application.models import Metadata, Work, Author
from gensim.models import Word2Vec

  from flask.ext.sqlalchemy import SQLAlchemy


In [2]:
# Load the metadata table. Later cells filter it on its 'assumed_gender'
# column and use the first column of each row to look up Metadata records.
import pandas as pd
all_rows = pd.read_csv("metadata.csv")

In [3]:
# Split the metadata by the value of 'assumed_gender' ('f' / 'm'); rows with
# any other value end up in neither frame.
female_rows = all_rows[all_rows['assumed_gender'] == 'f']
male_rows = all_rows[all_rows['assumed_gender'] == 'm']

# Report the size of each group (male first, then female).
print(len(male_rows), len(female_rows))

(703, 159)


In [4]:
# spaCy pipeline loaded through the 'en' shortcut. The text-cleaning functions
# defined below call this module-level `nlp` object directly.
nlp = spacy.load('en')

In [45]:
# Download the Glasgow IR stop-word list. Fail loudly on a hung connection or
# an HTTP error instead of silently turning an error page into "stop words".
words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
                     timeout=30)
words.raise_for_status()
# The response is split on CRLF; a trailing line break therefore leaves an
# empty string in the list.
stoplist1 = words.text.split("\r\n")

from nltk.corpus import stopwords
stoplist2 = set(stopwords.words('english'))

# Merge the NLTK English stop words into the downloaded list (in place).
stoplist1.extend(stoplist2)

# De-duplicate. The resulting list order is arbitrary because it comes from a set.
fullstops = list(set(stoplist1))

def remove_stops(stoplist, wordlist):
    """Return the words of ``wordlist`` that do not occur in ``stoplist``.

    Args:
        stoplist: Collection of words to drop.
        wordlist: Iterable of words to filter.

    Returns:
        A new list holding the surviving words in their original order;
        repeated words are kept.
    """
    blocked = stoplist
    # Membership tests against a list/tuple are linear, and the stop lists
    # built in this notebook hold thousands of entries, so look words up in a
    # set instead. Other container types are used exactly as given.
    if isinstance(stoplist, (list, tuple)):
        try:
            blocked = frozenset(stoplist)
        except TypeError:
            # Unhashable entries: keep scanning the original sequence.
            blocked = stoplist
    return [word for word in wordlist if word not in blocked]

# enchant dictionaries keyed by language tag. spellcheck() is called once per
# sentence by clean_text_sentences, so the dictionary is built once and reused.
_SPELLCHECK_DICTS = {}


def spellcheck(wordlist, lang="en_US"):
    """Keep only the words that the enchant dictionary recognises.

    A word is kept when either the word as given or its capitalised form
    passes the dictionary check.

    Args:
        wordlist: Iterable of word strings.
        lang: enchant language tag of the dictionary to check against.

    Returns:
        A new list with the recognised words, in their original order.
    """
    checker = _SPELLCHECK_DICTS.get(lang)
    if checker is None:
        checker = enchant.Dict(lang)
        _SPELLCHECK_DICTS[lang] = checker

    result = []
    for word in wordlist:
        # enchant raises ValueError when asked to check an empty string, so
        # empty entries are skipped rather than aborting the whole list.
        if not word:
            continue
        if checker.check(word) or checker.check(word.capitalize()):
            result.append(word)
    return result

def get_term_tree(list_of_texts, term, nyt_ids_list, include_context=False, window=6):
    """Find every occurrence of ``term`` in a collection of texts.

    Each text is lowercased, has newlines and tabs replaced by spaces and is
    parsed with the module-level ``nlp`` pipeline. A token counts as an
    occurrence when its lemma or its surface text equals ``term``; a token
    that matches on both is still recorded only once.

    Args:
        list_of_texts: Texts to search.
        term: Word to look for. Surface forms are compared after lowercasing.
        nyt_ids_list: Identifiers parallel to ``list_of_texts``; entry ``h``
            is attached to every match found in text ``h``.
        include_context: When true, each result also carries a ``"context"``
            key with the surrounding tokens. Off by default so the result
            dicts keep their original four keys.
        window: Number of tokens taken on each side of the match for the
            context (clipped at the start and end of the text).

    Returns:
        A list of dicts with the keys ``"lemma"``, ``"token"``, ``"pos"`` and
        ``"nyt_id"`` (plus ``"context"`` when requested), in order of
        appearance.
    """
    term_list = []
    for text_index, text in enumerate(list_of_texts):
        normalized = text.lower().replace("\n", " ").replace("\t", " ")
        doc = nlp(normalized)
        ocr_tokens = [unicode(token) for token in doc]

        for position, token in enumerate(doc):
            if token.lemma_ != term and ocr_tokens[position] != term:
                continue

            result = {
                "lemma": token.lemma_,
                "token": ocr_tokens[position],
                "pos": token.pos_,
                "nyt_id": nyt_ids_list[text_index],
            }
            if include_context:
                # Clamp the left edge explicitly: a negative slice start would
                # count from the end of the list. Slices past the right edge
                # are clipped by Python itself.
                start = max(0, position - window)
                result["context"] = ocr_tokens[start:position + window + 1]
            term_list.append(result)
    return term_list
            
def clean_text(list_of_texts):
    """Turn raw texts into lists of spell-checked, letters-only lemmas.

    Every text is lowercased, has newlines and tabs replaced by spaces and is
    parsed with the module-level ``nlp`` pipeline. Each token is replaced by
    its lemma, except that tokens whose lemma is ``-PRON-`` or all uppercase
    keep their surface text. Non-alphabetic characters are then stripped from
    every token, tokens left empty are dropped, and the remainder is filtered
    through ``spellcheck``.

    Args:
        list_of_texts: Iterable of text strings.

    Returns:
        A list with one list of cleaned tokens per input text.
    """
    cleaned_docs = []
    for raw_text in list_of_texts:
        flattened = raw_text.lower().replace("\n", " ").replace("\t", " ")

        # Lemmatise, falling back to the surface form for placeholder lemmas.
        lemmas = []
        for tok in nlp(flattened):
            lemma = tok.lemma_
            keep_surface = lemma == u'-PRON-' or lemma.isupper()
            lemmas.append(unicode(tok) if keep_surface else lemma)

        # Strip digits/punctuation from inside tokens; drop what becomes empty.
        letters_only = []
        for item in lemmas:
            if item.isalpha():
                kept = item
            else:
                kept = "".join(ch for ch in item if ch.isalpha())
            if kept != "":
                letters_only.append(kept)

        cleaned_docs.append(spellcheck(letters_only))
    return cleaned_docs

def clean_text_sentences(list_of_texts):
    """Turn raw texts into one flat list of cleaned, tokenised sentences.

    Every text is lowercased, has newlines and tabs replaced by spaces and is
    parsed with the module-level ``nlp`` pipeline, whose sentence boundaries
    (``doc.sents``) are used to split it. Within a sentence each token is
    replaced by its lemma, except that tokens whose lemma is ``-PRON-`` or all
    uppercase keep their surface text. Non-alphabetic characters are stripped
    from every token, tokens left empty are dropped, and each sentence is
    filtered through ``spellcheck``.

    Args:
        list_of_texts: Iterable of text strings.

    Returns:
        A flat list of token lists, one per sentence, across all texts in
        order. Sentences that end up with no tokens are kept as empty lists.
    """
    all_sentences = []
    for raw_text in list_of_texts:
        flattened = raw_text.lower().replace("\n", " ").replace("\t", " ")

        for sent in nlp(flattened).sents:
            words = []
            for tok in sent:
                lemma = tok.lemma_
                if lemma == u'-PRON-' or lemma.isupper():
                    text = unicode(tok)
                else:
                    text = lemma
                # Remove digits/punctuation from inside the token.
                if not text.isalpha():
                    text = "".join(ch for ch in text if ch.isalpha())
                if text != "":
                    words.append(text)
            all_sentences.append(spellcheck(words))
    return all_sentences

In [6]:
# `fullstops` was built from a set, so index 0 is an arbitrary entry, and
# deleting by position removes one more word every time this cell is re-run.
# Remove the empty string instead - presumably the entry this line was meant
# to drop (splitting the downloaded list on "\r\n" leaves one behind after a
# trailing line break). NOTE(review): confirm that was the intent.
while "" in fullstops:
    fullstops.remove("")

# This list of gender terms was generated iteratively: run the logistic
# regression with all terms, see which ones correlate most with gender, and
# add the words that seem to carry direct gender information so they can be
# removed from the features.
# Repeated entries ("men", "niece") are harmless: the list is de-duplicated
# when it is merged into the stop list.
gender_terms = [
    "mr", "he", "his", "him", "himself", "man", "men", "boy", "boys", "manly",
    "masculine", "boyish", "father", "brother", "girls", "men", "women",
    "sisters", "daughters", "brothers", "sons", "wife", "husband", "niece",
    "uncle", "nephew", "dad", "grandfather", "son", "mrs", "miss", "her",
    "hers", "she", "herself", "woman", "girl", "nieces", "nephews", "fer",
    "mme", "mlle", "lady", "womanly", "girlish", "girly", "mother", "daughter",
    "aunt", "niece", "grandmother", "mom", "sister",
]

# Lowercased first names from the NLTK `names` corpus, one list per file
# ('male.txt' / 'female.txt').
from nltk.corpus import names 
male = [o.lower() for o in names.words('male.txt')]
female = [o.lower() for o in names.words('female.txt')]

# Stop words plus gender terms, every entry converted to unicode.
fullstops_and_pronouns = []

for u in (fullstops, gender_terms):
    fullstops_and_pronouns.extend([unicode(i) for i in u])

# De-duplicate; the resulting list order is arbitrary.
fullstops_and_pronouns = list(set(fullstops_and_pronouns))

In [7]:
from string import ascii_lowercase

# Widest exclusion list: stop words and gender terms, the male and female
# first names, the token "thoma" (presumably a lemmatised "thomas" - confirm),
# and every single lowercase ASCII letter.
fullstops_pronouns_and_names = []

for u in (fullstops_and_pronouns, male, female):
    fullstops_pronouns_and_names.extend([unicode(i) for i in u])

fullstops_pronouns_and_names.append(unicode("thoma"))
fullstops_pronouns_and_names.extend([unicode(ltr) for ltr in ascii_lowercase])

# De-duplicate; the resulting list order is arbitrary.
fullstops_pronouns_and_names = list(set(fullstops_pronouns_and_names))

# Sizes of the three progressively larger exclusion lists.
print(len(fullstops), len(fullstops_and_pronouns), len(fullstops_pronouns_and_names))

(353, 396, 7982)


In [8]:
# Fetch the OCR transcription and NYT id of every metadata row from the
# database. Both lists are filled in the row order of `all_rows`.
ocr_list_all = []
nyt_ids = []
for i in all_rows.iterrows():
    # iterrows() yields (index, row). The first column of the row is used as
    # the Metadata primary key - assumes that column holds the database id.
    # `.iloc[0]` makes the positional access explicit.
    metadata_id = int(i[1].iloc[0])
    # NOTE(review): `Metadata().query` builds a throwaway instance just to
    # reach `.query`; `Metadata.query` is presumably equivalent - confirm.
    row = Metadata().query.filter(Metadata.id == metadata_id).one_or_none()
    if row is None:
        # one_or_none() returns None when no record matches. Stop with a clear
        # message instead of an AttributeError on the next line; skipping the
        # row would leave the lists out of step with `all_rows`.
        raise LookupError("no Metadata record with id %d" % metadata_id)
    ocr_list_all.append(row.ocr_transcription)
    nyt_ids.append(row.nyt_id)

In [None]:
# Clean every transcription and flatten the result into one list of tokenised
# sentences. This runs the spaCy pipeline and the spell checker over every
# text in `ocr_list_all`.
ocr_cleaned_sentences = clean_text_sentences(ocr_list_all)

In [None]:
# Display the first cleaned sentence (a list of tokens) as a sanity check.
ocr_cleaned_sentences[0]