In [14]:
import spacy
import json
import pandas as pd
import requests 
import enchant
from application import *
from application.models import Metadata, Work, Author

In [15]:
import pandas as pd
# Read the metadata table; the next cell splits it on the 'assumed_gender' column.
all_rows = pd.read_csv("metadata.csv")

In [3]:
# Partition the metadata rows by their 'assumed_gender' label ('f' / 'm').
is_female = all_rows['assumed_gender'] == 'f'
is_male = all_rows['assumed_gender'] == 'm'
female_rows = all_rows.loc[is_female]
male_rows = all_rows.loc[is_male]

print(len(male_rows), len(female_rows))

(703, 159)


In [4]:
# Load spaCy's English pipeline once; get_term_tree and clean_text call this module-level `nlp`.
nlp = spacy.load('en')

In [37]:
# Download the stop-word list. Bound the wait and fail loudly on a bad HTTP
# status so that an error page is never silently parsed into "stop words".
words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words', timeout=30)
words.raise_for_status()
# One entry per CRLF-separated line (a trailing line break leaves an empty-string entry).
stoplist1 = words.text.split("\r\n")

from nltk.corpus import stopwords
stoplist2 = set(stopwords.words('english'))

# Merge the NLTK English stop words into the downloaded list.
stoplist1.extend(stoplist2)

# De-duplicate; note that the resulting list order is arbitrary (set iteration order).
fullstops = list(set(stoplist1))

def remove_stops(stoplist, wordlist):
    """Return the words of `wordlist` that do not occur in `stoplist`.

    Order and duplicates of `wordlist` are preserved.

    Args:
        stoplist: container of words to drop. Lists and tuples are converted
            to a frozenset once so each lookup is O(1) instead of a linear
            scan; any other container is used for membership tests as-is.
        wordlist: iterable of words to filter.

    Returns:
        A new list with the surviving words.
    """
    lookup = stoplist
    if isinstance(stoplist, (list, tuple)):
        try:
            lookup = frozenset(stoplist)
        except TypeError:
            # Unhashable entries: fall back to scanning the original sequence.
            lookup = stoplist
    return [word for word in wordlist if word not in lookup]

def spellcheck(wordlist):
    """Keep only the words accepted by the "en_US" enchant dictionary.

    A word is kept when the dictionary accepts it either as given or in its
    capitalized form. Order and duplicates of `wordlist` are preserved.
    """
    dictionary = enchant.Dict("en_US")
    return [
        word
        for word in wordlist
        if dictionary.check(word) or dictionary.check(word.capitalize())
    ]

def get_term_tree(list_of_texts, term, nyt_ids_list, context_window=0):
    """Collect every occurrence of `term` across a list of texts.

    Each text is lowercased, has newlines and tabs replaced by spaces, and is
    parsed with the module-level spaCy pipeline `nlp`. A token is a hit when
    either its lemma or its surface form equals `term`; a token matching on
    both still yields a single hit.

    Args:
        list_of_texts: iterable of text strings.
        term: lemma or surface form to look for. Texts are lowercased before
            parsing, so surface forms can only match a lowercase term.
        nyt_ids_list: sequence aligned with `list_of_texts`; element h is
            recorded as the "nyt_id" of every hit found in text h.
        context_window: when greater than 0, each hit also gets a "context"
            key holding the hit token plus up to `context_window` tokens on
            either side of it. The default (0) returns only the four
            standard keys.

    Returns:
        A list of dicts with keys "lemma", "token", "pos" and "nyt_id" (plus
        "context" when requested), in order of appearance.
    """
    term_list = []
    for h, text in enumerate(list_of_texts):
        ocr_cleaner = text.lower().replace("\n", " ").replace("\t", " ")
        doc = nlp(ocr_cleaner)
        # Surface forms of all tokens, index-aligned with `doc`.
        ocr_tokens = [unicode(token) for token in doc]
        for z, token in enumerate(doc):
            if token.lemma_ == term or ocr_tokens[z] == term:
                result = {
                    "lemma": token.lemma_,
                    "token": ocr_tokens[z],
                    "pos": token.pos_,
                    "nyt_id": nyt_ids_list[h],
                }
                if context_window > 0:
                    # Clamp the left edge: a negative slice start would wrap
                    # around to the end of the token list.
                    start = max(0, z - context_window)
                    result["context"] = ocr_tokens[start:z + context_window + 1]
                term_list.append(result)
    return term_list
            
def clean_text(list_of_texts):
    """Normalize texts into lists of letters-only, spell-checked words.

    For each text: lowercase it, replace newlines and tabs with spaces, parse
    it with the module-level spaCy pipeline `nlp`, and take each token's
    lemma (or its surface form when the lemma is u'-PRON-' or all upper
    case). Every non-letter character is then stripped from each word, words
    left empty are dropped, and the rest is filtered through `spellcheck`.

    Returns:
        One word list per input text, in input order.
    """
    cleaned_texts = []
    for raw_text in list_of_texts:
        flattened = raw_text.lower().replace("\n", " ").replace("\t", " ")

        words = []
        for token in nlp(flattened):
            lemma = token.lemma_
            keep_surface_form = lemma == u'-PRON-' or lemma.isupper()
            words.append(unicode(token) if keep_surface_form else lemma)

        # Strip digits/punctuation character by character; a purely
        # alphabetic word passes through unchanged.
        letters_only = []
        for word in words:
            stripped = "".join(ch for ch in word if ch.isalpha())
            if stripped != "":
                letters_only.append(stripped)

        cleaned_texts.append(spellcheck(letters_only))
    return cleaned_texts

# Drop the empty-string entry (presumably left over from splitting the
# downloaded stop-word file on line breaks). Filtering by value instead of
# deleting index 0 does not depend on set iteration order and is safe to re-run.
fullstops = [stop for stop in fullstops if stop != ""]

# Terms that carry direct gender information and are therefore removed from the texts.
# The list was generated iteratively: run the logistic regression with all terms, see what
# correlated most with gender, and add the words that seemed to have direct gender info in them.
gender_terms = ["mr", "he", "his", "him", "himself", "man", "men", "boy", "boys", "manly", "masculine", "boyish", "father",
                "brother", "girls", "men", "women", "sisters", "daughters", "brothers", "sons", "wife", "husband", "niece",
                "uncle", "nephew", "dad", "grandfather", "son", "mrs", "miss", "her", "hers", "she", "herself", "woman",
                "girl", "nieces", "nephews", "fer", "mme", "mlle",
                # A comma is required between "niece" and "grandmother": adjacent string
                # literals are concatenated into the single entry "niecegrandmother".
                "lady", "womanly", "girlish", "girly", "mother", "daughter", "aunt", "niece", "grandmother", "mom", "sister"]

from nltk.corpus import names 
# Lowercased entries of the NLTK names corpus, one list per file.
male = [name.lower() for name in names.words('male.txt')]
female = [name.lower() for name in names.words('female.txt')]

# Stop words plus gendered terms, coerced to unicode and de-duplicated.
fullstops_and_pronouns = list(set(
    [unicode(word) for source_list in (fullstops, gender_terms) for word in source_list]
))

In [6]:
from string import ascii_lowercase
# Stop words + gendered terms + first names, all coerced to unicode.
fullstops_pronouns_and_names = [
    unicode(word)
    for source_list in (fullstops_and_pronouns, male, female)
    for word in source_list
]

# NOTE(review): "thoma" looks like a lemmatized form of "thomas" - confirm.
fullstops_pronouns_and_names.append(unicode("thoma"))

# Every single lowercase letter is treated as a stop word as well.
fullstops_pronouns_and_names.extend(unicode(ltr) for ltr in ascii_lowercase)

fullstops_pronouns_and_names = list(set(fullstops_pronouns_and_names))

print(len(fullstops), len(fullstops_and_pronouns), len(fullstops_pronouns_and_names))

(353, 396, 7982)


In [7]:
# OCR transcriptions and NYT ids per group; each pair of lists is filled in
# lockstep so it stays index-aligned.
ocr_list_male, male_nyt_ids = [], []
ocr_list_female, female_nyt_ids = [], []
for _, csv_row in male_rows.iterrows():
    # Element 0 of the CSV row is matched against Metadata.id.
    # NOTE(review): one_or_none() returns None when no record matches, which
    # would fail on the attribute access below - confirm every id exists.
    record = Metadata().query.filter(Metadata.id == int(csv_row[0])).one_or_none()
    ocr_list_male.append(record.ocr_transcription)
    male_nyt_ids.append(record.nyt_id)

In [8]:
# Same lookup as for the male rows: fetch each record by id and collect its
# transcription and NYT id in lockstep.
for _, csv_row in female_rows.iterrows():
    # NOTE(review): one_or_none() returns None when no record matches, which
    # would fail on the attribute access below - confirm every id exists.
    record = Metadata().query.filter(Metadata.id == int(csv_row[0])).one_or_none()
    ocr_list_female.append(record.ocr_transcription)
    female_nyt_ids.append(record.nyt_id)

In [38]:
# Every occurrence of "volume" (by lemma or surface form) in the male-authored texts.
tree_for_volume = get_term_tree(ocr_list_male, "volume", male_nyt_ids)

In [39]:
# nyt_id of every hit: one entry per occurrence, so an id repeats when a text has several hits.
[i["nyt_id"] for i in tree_for_volume]

[u'4fc0532945c1498b0d250e4c',
 u'4fc0532945c1498b0d250e4c',
 u'4fc0532945c1498b0d250e4c',
 u'4fc0532945c1498b0d250e4c',
 u'4fc03b9245c1498b0d1e86a8',
 u'4fc045fc45c1498b0d2162b0',
 u'4fc03b9245c1498b0d1e86b8',
 u'4fc03b9245c1498b0d1e86b8',
 u'4fc03b9245c1498b0d1e86b8',
 u'4fc03b9245c1498b0d1e86b8',
 u'4fc03b9245c1498b0d1e86b8',
 u'4fc045fc45c1498b0d216298',
 u'4fc045fc45c1498b0d216298',
 u'4fc0499e45c1498b0d226c41',
 u'4fc0499e45c1498b0d226c41',
 u'4fc0532945c1498b0d250e59',
 u'4fc0532945c1498b0d250e48',
 u'4fc0528045c1498b0d24e521',
 u'4fc0528045c1498b0d24e521',
 u'4fc0471045c1498b0d21b1cd',
 u'4fc0471045c1498b0d21b1cd',
 u'4fc0471045c1498b0d21b1cd',
 u'4fc0471045c1498b0d21b1cd',
 u'4fc045fc45c1498b0d2162b9',
 u'4fc03b9245c1498b0d1e8723',
 u'4fc03b9245c1498b0d1e8735',
 u'4fc0532945c1498b0d250e96',
 u'4fc0532945c1498b0d250e96',
 u'4fc0532945c1498b0d250e96',
 u'4fc054cf45c1498b0d258bdf',
 u'4fc03b9245c1498b0d1e8728',
 u'4fc03b9245c1498b0d1e8738',
 u'4fc03b9245c1498b0d1e8738',
 u'4fc03b9

In [40]:
# Every occurrence of "garden" (by lemma or surface form) in the female-authored texts.
tree_for_garden = get_term_tree(ocr_list_female, "garden", female_nyt_ids)

In [63]:
# Every occurrence of "gardener" (by lemma or surface form) in the female-authored texts.
tree_for_gardener = get_term_tree(ocr_list_female, "gardener", female_nyt_ids)

In [64]:
# Number of distinct nyt_ids among the "gardener" hits.
len({hit['nyt_id'] for hit in tree_for_gardener})

2

In [48]:
# Tabulate the "garden" hits. This cell previously referenced
# `tree_for_gardening`, a name no visible cell defines, so it failed on a
# fresh kernel; `tree_for_garden` is the variable computed above.
pd.DataFrame(tree_for_garden)

Unnamed: 0,lemma,nyt_id,pos,token
0,garden,4fc045fe45c1498b0d216556,NOUN,gardens
1,garden,4fc045fe45c1498b0d2165e8,NOUN,garden
2,garden,4fc0466945c1498b0d218080,NOUN,garden
3,garden,4fc03a7d45c1498b0d1e2e2c,NOUN,garden
4,garden,4fc0478645c1498b0d21d8dd,NOUN,garden
5,garden,4fc0478645c1498b0d21d8dd,NOUN,garden
6,garden,4fc043f145c1498b0d20d151,NOUN,garden
7,garden,4fc043f145c1498b0d20d151,NOUN,garden
8,garden,4fc045fe45c1498b0d216692,NOUN,garden
9,garden,4fc045fe45c1498b0d216692,NOUN,garden


In [27]:
from nltk.corpus import wordnet as wn
# Print the definition of every WordNet synset for "garden".
for synset in wn.synsets('garden'):
    print(synset.definition())


a plot of ground where plants are cultivated
the flowers or vegetables or fruits or herbs that are cultivated in a garden
a yard or lawn adjoining a house
work in the garden


In [9]:
# Clean every transcription with clean_text; the result is one word list per text, in input order.
ocr_cleaned_male = clean_text(ocr_list_male)
ocr_cleaned_female = clean_text(ocr_list_female)

In [10]:
from collections import Counter 
# One word-frequency Counter per cleaned text.
ocr_counters_male = [Counter(words) for words in ocr_cleaned_male]
ocr_counters_female = [Counter(words) for words in ocr_cleaned_female]

# Combined views: male entries first, then female ones, in the same order for
# both lists so that ocr_counters_all and nyt_ids_all stay index-aligned.
ocr_counters_all = list(ocr_counters_male) + list(ocr_counters_female)
nyt_ids_all = list(male_nyt_ids) + list(female_nyt_ids)

In [11]:
import pickle
# Persist the per-text counters and their index-aligned NYT ids.
for pickle_path, payload in (
    ('pickled_data/ocr_counters_all.pickle', ocr_counters_all),
    ('pickled_data/nyt_ids_all.pickle', nyt_ids_all),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Persist the three stop-word lists, from the smallest to the most inclusive.
for pickle_path, payload in (
    ('pickled_data/fullstops.pickle', fullstops),
    ('pickled_data/fullstops_and_pronouns.pickle', fullstops_and_pronouns),
    ('pickled_data/fullstops_pronouns_and_names.pickle', fullstops_pronouns_and_names),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Load the NRC emotion lexicon as headerless, tab-separated (term, valence, score) rows.
nrc_rows = pd.read_csv(
    "lexicons/NRC-emotion-lexicon.txt",
    sep="\t",
    header=None,
    names=["term", "valence", "score"],
)

# Keep the rows whose score is 1, separately for each polarity.
has_score_one = nrc_rows['score'] == 1
nrc_rows_positive = nrc_rows.loc[has_score_one & (nrc_rows['valence'] == 'positive')]
nrc_rows_negative = nrc_rows.loc[has_score_one & (nrc_rows['valence'] == 'negative')]

pos_terms = list(nrc_rows_positive['term'])
neg_terms = list(nrc_rows_negative['term'])

def remove_dupes(source, target):
    """Return the items of `source` that do not occur in `target`.

    Order is preserved, and repeated items inside `source` itself are kept:
    only overlap with `target` is removed.

    Args:
        source: iterable of items to filter.
        target: container of items to exclude. Lists and tuples are converted
            to a frozenset once so each lookup is O(1) instead of a linear
            scan; any other container is used for membership tests as-is.

    Returns:
        A new list with the items of `source` absent from `target`.
    """
    lookup = target
    if isinstance(target, (list, tuple)):
        try:
            lookup = frozenset(target)
        except TypeError:
            # Unhashable entries: fall back to scanning the original sequence.
            lookup = target
    return [item for item in source if item not in lookup]

# Keep only the terms that appear in exactly one polarity list.
pos_terms_unique = remove_dupes(pos_terms, neg_terms)
neg_terms_unique = remove_dupes(neg_terms, pos_terms)

# Persist both polarity-exclusive term lists.
for pickle_path, terms in (
    ('pickled_data/nrc_positive.pickle', pos_terms_unique),
    ('pickled_data/nrc_negative.pickle', neg_terms_unique),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(terms, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Total number of score-1 negative and positive lexicon rows (overlapping terms counted twice).
len(nrc_rows_negative['term']) + len(nrc_rows_positive['term'])

5636

In [15]:
"he" == unicode("he")

True