# Initial Setup

- import various helpers, load data, select reviews by status and category

In [1]:
import sys
sys.path.append('../')

In [2]:
from application.name_obj_classes import PubName, PersonName, remove_punct

In [3]:
from application.review_obj_class import ReviewObj

In [4]:
from application.text_preprocessing import preprocess_text

In [5]:
import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [7]:
from database import *
import database.models as models

In [10]:
# load full text from db
aps_details_single = models.Review().query.filter(models.Review.status.in_(('needs_crosscheck', 'done'))).filter(models.Review.review_type == 'single_focus').all()

In [11]:
len(aps_details_single)

561

In [12]:
one_review = ReviewObj(aps_details_single[1].record_id, aps_details_single[1].full_text)

# Begin Demonstrations and Instantiations

- Classify review/not review
- Classify single_focus, multi_focus
- With only single_focus, extract information such as the title of the book being reviewed, the assumed gender of the author, the assumed genre and/or subgenre of the book, the reported publisher, and the price
- Do this as a closed set problem, using fuzzy matching, clustering, and maybe a reinforcement learning or deep learning
- Do this as an open set problem, or as a problem where "not in the set" is possible 

## In NYTBR section, Book Review or Not Book Review

In [39]:
nyt_rows[0][3]

'A NEW ESSAYIST.; C.F.G. Masterman, M.P., Criticises Kipling and Other British Institutions.'

In [35]:
# need more non-review content
nyt_not_review = [i for i in nyt_rows if i[12] == 'not_review']
nyt_review = [i for i in nyt_rows if i[12] in ('multi', 'cluster', 'really_multi', 'single_focus')]
len(nyt_review), len(nyt_not_review)

(4242, 4327)

In [66]:
I2mos = [i for i in nyt_not_review if 'I2mo' in i[4]]
I2mos[1][4][:100]

'LATEST PUBLICATIONS  Books Received During the Week Ended July 25 Classified and Annotated According'

In [41]:
list_of_full_txt = [i[4] for i in nyt_review] + [i[4] for i in nyt_not_review]
# make "true labels" (0s and 1s so scikit learn can score them)
labels = [0 for i in range(len(nyt_review))] + [1 for i in range(len(nyt_not_review))]
len(list_of_full_txt) == len(labels)

True

In [44]:
# import various from scikit learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# set up logistic regression with gender labels
v = CountVectorizer()
X = v.fit_transform(list_of_full_txt)
tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

In [58]:
# split the rows into training data, training labels, test data, and test labels
# test on 33% of the data
X_train, X_test, y_train, y_test = train_test_split(Z, labels, test_size=0.33, random_state=12)

# instantiate the model and fit to the training data
lr = LogisticRegression()
lr.fit(X_train, y_train)

# make label predictions
results = lr.predict(X_test)

# generate probabilities for each label
probs = lr.predict_proba(X_test)

In [59]:
scores = {}
# generate f1, precision, recall, and accuracy scores
# I will discuss each of these in the lesson
for y,z in [("review",0),("not_review",1)]:
    scores[y] = {}
    scores[y]["f1"] = f1_score(y_test, results, pos_label=z, average='binary')  
    scores[y]["precision"] = precision_score(y_test, results, pos_label=z, average='binary')
    scores[y]["recall"] = recall_score(y_test, results, pos_label=z, average='binary')

scores["accuracy"] = accuracy_score(y_test, results)
scores

{'review': {'f1': 0.884814942926323, 'precision': 0.8498338870431894, 'recall': 0.9227994227994228}, 'not_review': {'f1': 0.8795660036166365, 'precision': 0.9191232048374905, 'recall': 0.8432732316227461}, 'accuracy': 0.8822489391796322}

In [60]:
terms = []
coefs = []
for key,val in v.vocabulary_.items():
    terms.append(key)
    coefs.append(lr.coef_[0][val])

In [61]:
# this block produces a dataframe with the top 30 terms associated with label 0
df_coef = pd.DataFrame()
df_coef['term'] = terms
df_coef['coef'] = coefs
df_coef = df_coef.sort_values(by='coef').reset_index(drop=True)
df_coef.head(30)

Unnamed: 0,term,coef
0,pp,-5.146061
1,he,-3.361216
2,and,-2.945701
3,her,-2.848645
4,volume,-2.781168
5,she,-2.397521
6,with,-2.388235
7,tile,-2.328123
8,reader,-2.254851
9,12mo,-2.213153


In [62]:
# to view the top 30 terms associated with label 1, we look at the bottom rows of the same dataframe
#I2mo ... pub announcements have both, but OCR may be of a lower quality with long lists and blurb announcements
#or just more hances to get it wrong
df_coef.tail(30)

Unnamed: 0,term,coef
408132,literary,1.966858
408133,number,2.005087
408134,has,2.0067
408135,publishers,2.025496
408136,magazine,2.029833
408137,yesterday,2.068803
408138,performance,2.113786
408139,l2mo,2.126622
408140,london,2.154999
408141,times,2.157221


In [None]:
# run on aps_reviews
# make predictions using nonbinary data 
aps_reviews = [i.full_text for i in aps_rows if i.review_type in ('single_focus', 'multi', 'cluster')]
aps_ids = [i.record_id for i in aps_rows if i.review_type in ('single_focus', 'multi', 'cluster')]
aps_urls = ["https://aps-web-app.matthew-lavin.com/static/pdf/%s.pdf"%i for i in aps_ids]

aps_vectors = v.transform(aps_reviews)
aps_tfidf = tfidf.fit_transform(aps_vectors)

# generate probabilities for each label
aps_probs = lr.predict_proba(aps_tfidf)

#display the results as a pandas dataframe
aps_results = pd.DataFrame()

# make columns for the original label, the nyt_id, the cluster_id, the pdf url, and the predicted probabilities

aps_results['aps_id'] = aps_ids
aps_results['url'] = aps_urls
aps_results['prob_review'] = [i[0] for i in aps_probs]
aps_results['prob_not_review'] = [i[1] for i in aps_probs]
len(aps_results.loc[aps_results['prob_review'] > 0.5].reset_index())/len(aps_results)
#79.39% of aps reviews have a naive probability score over .5

In [86]:
#this doesn't tell us how many false positives we might get, just that a model trained on NYT reviews usually recognizes APS reviews as reviews
aps_not_reviews = [i.full_text for i in aps_rows if i.review_type == 'not_review']
aps_non_review_ids = [i.record_id for i in aps_rows if i.review_type == 'not_review']
aps_non_review_urls = ["https://aps-web-app.matthew-lavin.com/static/pdf/%s.pdf"%i for i in aps_non_review_ids]

aps_non_review_vectors = v.transform(aps_not_reviews)
aps_non_review_tfidf = tfidf.fit_transform(aps_non_review_vectors)

# generate probabilities for each label
aps_non_review_probs = lr.predict_proba(aps_non_review_tfidf)

#display the results as a pandas dataframe
aps_non_review_results = pd.DataFrame()

# make columns for the original label, the nyt_id, the cluster_id, the pdf url, and the predicted probabilities

aps_non_review_results['aps_id'] = aps_non_review_ids
aps_non_review_results['url'] = aps_non_review_urls
aps_non_review_results['prob_review'] = [i[0] for i in aps_non_review_probs]
aps_non_review_results['prob_not_review'] = [i[1] for i in aps_non_review_probs]
len(aps_non_review_results.loc[aps_non_review_results['prob_not_review'] > 0.5].reset_index())/len(aps_non_review_results)
# 58.93% of non-reviews would have a non-review probability over 50%, so we might want to adjust to reduce false positives 
# However, say we started with a mix of 80/20 reviews and not reviews
# If we got these results with 1000 objects, we would have 635 true postives, 165 false negatives, 118 true negatives and 82 false positives
# If this were all true, we'd be running calculations on a sample that's 88.5% book reviews and 11.5% not
# Pretty good, but we want better, especially the false positive
# Option 1: improve the model with data, setup, or learning method (labor)
# Option 2: raise the probability threshold to be considered a review (also creates more false negatives)

0.8856345885634589

## Single-work vs. Multi-work Reviews

- The exemplar of a single-work review is very clear, as is the exemplar of review that covers more than one 
- Complications and edge cases arise when it's predominantly a review of one book, with a section that compares it to another book, or in that there is a great variety to multi-work reviews. Some columns like "Latest Fiction" are scanned as separate single work reviews, some as one object. In general, I have found it desirable to isolate clear single-work reviews from others for information extraction or review classification tasks, but other methods wouldn't require this.
- It may be desirable to target multi-work reviews if, for example, you want "in the same review" to be edge weights in a network

In [121]:
aps_single = [i for i in aps_rows if i.review_type == 'single_focus']
aps_not_single = [i for i in aps_rows if i.review_type in ('multi', 'cluster')]
#len(aps_single), len(aps_not_single) >>> (1003, 550)
aps_list_of_full_txt = [i.full_text for i in aps_single] + [i.full_text for i in aps_not_single]
# make "true labels" (0s and 1s so scikit learn can score them)
aps_labels = [0 for i in range(len(aps_single))] + [1 for i in range(len(aps_not_single))]
#len(aps_list_of_full_txt) == len(aps_labels) >>> True
# set up logistic regression with gender labels
v = CountVectorizer()
X = v.fit_transform(aps_list_of_full_txt)
tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

# split the rows into training data, training labels, test data, and test labels
# test on 33% of the data
X_train, X_test, y_train, y_test = train_test_split(Z, aps_labels, test_size=0.33, random_state=81)

# instantiate the model and fit to the training data
lr = LogisticRegression(class_weight={0:0.35, 1:0.65})
lr.fit(X_train, y_train)

# make label predictions
results = lr.predict(X_test)

# generate probabilities for each label
probs = lr.predict_proba(X_test)

In [125]:
scores = {}
# generate f1, precision, recall, and accuracy scores
# I will discuss each of these in the lesson
for y,z in [("single-work review",0),("multi-work review",1)]:
    scores[y] = {}
    scores[y]["f1"] = f1_score(y_test, results, pos_label=z, average='binary')  
    scores[y]["precision"] = precision_score(y_test, results, pos_label=z, average='binary')
    scores[y]["recall"] = recall_score(y_test, results, pos_label=z, average='binary')

scores["accuracy"] = accuracy_score(y_test, results)
scores


{'single-work review': {'f1': 0.8204334365325077, 'precision': 0.8412698412698413, 'recall': 0.8006042296072508}, 'multi-work review': {'f1': 0.6947368421052631, 'precision': 0.6666666666666666, 'recall': 0.7252747252747253}, 'accuracy': 0.7738791423001949}

In [126]:
terms = []
coefs = []
for key,val in v.vocabulary_.items():
    terms.append(key)
    coefs.append(lr.coef_[0][val])

# this block produces a dataframe with the top 30 terms associated with label 0
df_coef = pd.DataFrame()
df_coef['term'] = terms
df_coef['coef'] = coefs
df_coef = df_coef.sort_values(by='coef').reset_index(drop=True)
df_coef.head(30)

Unnamed: 0,term,coef
0,he,-0.721452
1,his,-0.668302
2,that,-0.667718
3,was,-0.639716
4,him,-0.435286
5,not,-0.409498
6,had,-0.405744
7,were,-0.373912
8,to,-0.30882
9,as,-0.307325


In [127]:
df_coef.tail(30)

Unnamed: 0,term,coef
69454,illustrated,0.44149
69455,edited,0.443003
69456,contains,0.451154
69457,edition,0.4686
69458,stories,0.482132
69459,boston,0.485738
69460,series,0.494175
69461,00,0.498977
69462,books,0.550457
69463,mr,0.553731


In [120]:
#one_review.cleaned_toks
#one_review.cleaned_text

import string
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords


title_candidates = [list(),]
for token in one_review.cleaned_toks:
    if token.istitle() or token in stopwords.words('english') or token in string.punctuation:
        if len(title_candidates[-1]) > 0:
            if token not in string.punctuation:
                title_candidates[-1].append(token)
        else:
            if token.istitle():
                title_candidates[-1].append(token)
    else:
        if len(title_candidates[-1]) > 0:
            title_candidates.append(list())

def remove_function_tail(sequence):
    if sequence[-1].lower() in stopwords.words('english'):
        sequence.pop()
        return remove_function_tail(sequence)
    else:
        return sequence
    
candidates_tidy = []
for sequence in title_candidates:
    # rule out if all function words
    all_function = True
    for word in sequence:
        if word.lower() not in stopwords.words('english'):
            all_function = False
            break
    if all_function == False:
        #remove function word tails recursively
        sequence = remove_function_tail(sequence)
        candidates_tidy.append(sequence)
    

NameError: name 'one_review' is not defined

{'review': {'f1': 0.7794994040524433, 'precision': 0.638671875, 'recall': 1.0}, 'not_review': {'f1': 0.010695187165775402, 'precision': 1.0, 'recall': 0.005376344086021506}, 'accuracy': 0.6393762183235867}

In [82]:
single_focus[1].reviewed_book_title, candidates_tidy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
texts = []
for i in candidates_tidy:
    texts.extend(i)
texts = [" ".join(texts),]    
texts.append(single_focus[1].reviewed_book_title)
texts.append(single_focus[0].reviewed_book_title)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
vectors = X.toarray()

cosine_similarity([vectors[0]], [vectors[1]]), cosine_similarity([vectors[0]], [vectors[2]])

(array([[0.65731842]]), array([[0.]]))

## Publisher Names

In [None]:
def clean_pub_matches(match_list):
    cleaned_matches = []
    for match in match_list:
        index_to_start = 0
        for i, x in enumerate(match[1].split()):
            if x[0].islower() and x[0]!='&':
                index_to_start = i+1
        cleaned_matches.append(' '.join(match[1].split()[index_to_start:]))
    return cleaned_matches

In [None]:
def remove_dash_for_pub(pub_match):
    return re.sub(r'(?<!/w)-(?!/w)', '', pub_match)

In [None]:
pub_ends = ['company','co','incorporated','inc','firm','press','group','publishers','publishing',
                    'publications','pub','books','ltd','limited','society','house','associates']

In [None]:
pub_ends_list = '|'.join([x.capitalize()+'\.?(?!\w)' for x in pub_ends])

In [None]:
pub_ends = [x.capitalize() for x in pub_ends]

In [None]:
pub_ends

``` 
    for e, index in enumerate(indices):
        
        if (e==len(indices)-1):
            end_index = -1
        else:
            end_index = indices[e+1][0]
        
        end_span = len(txt[indices[e][0]:end_index])   
        get_match = re.finditer('[A-Z]\w+[^A-Z]|[A-Z].[^A-Z]', txt[indices[e][0]:end_index])
        matches = [(m.span(), m.group()) for m in get_match]
        matches.reverse()
        
        for n, m in enumerate(matches):
            if n<len(matches)-1:
                if (m[0][1] != matches[n-1][0][0]):
                    end_span = m[0][1]
        
        result = txt[indices[e][0]:(indices[e][0] + end_span - 1)]
        
        if len(result) > len(indices[e][1]):
            names.append(txt[indices[e][0]:(indices[e][0] + end_span - 1)])
            spans.append(indices[e][0])
```

+ split into tokens
+ stop as soon 
+ stop as city names
+ put word tokenizer in review obj

In [None]:
known_publishers = ["Charles Scribner's Sons","Scribner","Macmillan","Funk & Wagnalls","McClure, Philips","Houghton Mifflin","G.P. Putnam's Sons", "G.P. Putnam",
 "Harper & Brothers","Harper","J.B. Lippincott","J. B. Lippincott", "Doubleday, Page","Doubleday","D. Appleton","Longmans, Green",
 "Longman","Henry Holt","Holt","Adam & Charles Black"]

In [None]:
review_list = []
for filename, txt in zip([x.split('.')[0] for x in filenames], txts):
    review_list.append(ReviewObj(filename, txt))

In [None]:
toks = review_list[24].cleaned_toks

In [None]:
temp_text = review_list[24].cleaned_text

In [None]:
def obscure_single_match(text, x, y):
    text_list = list(text)
    text_list[x:y] = list(len(text[x:y]) * '@')
    return ''.join(text_list)

In [None]:
obscure_single_match(temp_text, *(70,82))

In [None]:
for x in review_list[24].pub_names:
    print(x.review_loc)
    tbr = temp_text[x.review_loc[0]:x.review_loc[1]]
    tr = len(tbr) * '@'
    print(tbr)
    print(tr)
    temp_text.replace(tbr, tr, 1)

In [None]:
temp_text

```
def __obscure_matches(self, name = 'ex'):
    text_list = list(self.cleaned_text)
    if name == 'pub':
        for (x, y) in [pub.review_loc for pub in self.pub_names]:
                text_list[x:y] = list(len(self.cleaned_text[x:y]) * '@')
        if name == 'person':
            for (x, y) in [pers.review_loc for pers in self.person_names]:
                text_list[x:y] = list(len(self.cleaned_text[x:y]) * '@')
        return ''.join(text_list)
```

In [None]:
toks

In [None]:
city_dict = pickle.load(open('../data/city_dict.pkl', 'rb'))

In [None]:
from symspellpy.symspellpy import SymSpell, Verbosity

In [None]:
def is_part_of_pub(pub_part):
    if (pub_part == 'and') or (pub_part =='&'):
        return True
    elif city_dict.lookup(pub_part.lower(), Verbosity.CLOSEST, max_edit_distance=1):
        return False
    else:
        return pub_part.istitle()

In [None]:
is_part_of_pub('Egg')

In [None]:
pubnames = []
for e, tok in enumerate(toks):
    if tok.replace(",","").replace(".","").replace('-',"") in pub_ends:
        if is_part_of_pub(toks[e-1]):
            pub_name = [tok]
            pub_span = []
            for pos in range(e-1, e-6, -1):
                if toks[pos] == '.':
                    break
                elif not is_part_of_pub(toks[pos]):
                    break
                pub_name.append(toks[pos])
                pub_span.append(pos)   
            pubnames.append((pub_span[-1], e+1))

In [None]:
pubnames

In [None]:
def remove_duplicate_pubnames(pnlist):
    cleaned = []
    for e, (x, y) in enumerate(pnlist):
        starts = [a for (a,b) in pnlist[e+1:]]
        if x in starts:
            pass
        else:
            cleaned.append(pnlist[e])
    return cleaned

In [None]:
cleaned = []
for e, (x, y) in enumerate(pubnames):
    starts = [a for (a,b) in pubnames[e+1:]]
    if x in starts:
        pass
    else:
        cleaned.append(pubnames[e])

In [None]:
remove_duplicate_pubnames(pubnames)

In [None]:
for (x, y) in pubnames:
    newname = ' '.join(toks[x:y])
    print(' '.join(toks[x:y]))
    match = re.search(newname, review_list[0].cleaned_text)
    print(match.span())

In [None]:
any([x.isalpha() for x in ['Cassel','&','Co.']])

In [None]:
def get_publishers(review):
    """
    Takes a ReviewObj. 
    Returns a list of potential publishers. Searches using pub_ends, capitalization, and associates.
    
    For reference:
    -------------
    pub_ends = ['co','company','inc','incorporated','firm','press','group', 'pub','publishers','publishing',
                    'publications','books','ltd','limited','society','house','associates']
                    
    pub_associates = ['sons','son','brother','brothers']
    
    """
    
    pubs = []
    spans = []
    
    toks = review.cleaned_toks
    txt = review.cleaned_text
    
    pubnames = []
    
    for e, tok in enumerate(toks):
        if tok.replace(",","").replace(".","") in pub_ends:
            if is_part_of_pub(toks[e-1]):
                pub_name = []
                pub_span = []
                for pos in range(e-1, e-6, -1):
                    if toks[pos] == '.':
                        break
                    if not is_part_of_pub(toks[pos]):
                        break
                    pub_name.append(toks[pos])
                    pub_span.append(pos)
                if any([x.isalpha() for x in [word for word in pub_name if word !='and']]) and any([len(x)>2 for x in [word for word in pub_name if word !='and']]):
                    pubnames.append((pub_span[-1], e+1))
    
    if len(pubnames) > 0:
        for (x, y) in pubnames:
            newname = ' '.join(toks[x:y])
            pubs.append(newname)
            match = re.search(newname, txt)
            spans.append(match.span())

    pubs = [PubName(word) for word in pubs]
    
    for pub in pubs:
        pub.review_id = review.review_id
    
    for e, pub in enumerate(pubs):
        pub.review_id = review.review_id
        pub.review_loc = spans[e]
    
    return pubs

In [None]:
for filename, txt in zip([x.split('.')[0] for x in filenames],txts):
    rev = ReviewObj(filename, txt)
    print(filename)
    print(get_publishers(rev))
    print()

In [None]:
for filename, txt in zip([x.split('.')[0] for x in filenames],txts):
    rev = ReviewObj(filename, txt)
    print(filename)
    if rev.person_names:
        for x in rev.pub_names:
            print(x)
        print()

## Part 1: Person Names

In [None]:
titles = """Doctor,Dr,Mr,Mrs,Miss,Msgr,Monsignor,Rev,Reverend,Hon,Honorable,Honourable,Prof,Professor,Madame,Madam,Lady,Lord,Sir,Dame,Master,Mistress,Princess,Prince,Duke,Duchess,Baron,Father,Chancellor,Principal,President,Pres,Warden,Dean,Regent,Rector,Provost,Director
"""

In [None]:
titles = titles.rstrip().split(',')

In [None]:
title_list = '\.?\s(?=[A-Z])|'.join(titles)

In [None]:
def remove_punct_not_following_title_or_initial(name):
    name_parts = name.split()
    cleaned_name = []
    for part in name_parts:
        if part[-1] in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
            if (len(part)>2) and (part[:-1] not in titles):
                cleaned_name.append(part[:-1])
            else:
                cleaned_name.append(part)
        else:
            cleaned_name.append(part)
    return ' '.join(cleaned_name)

In [None]:
def clean_name(name):
    name = remove_punct_not_following_title_or_initial(name)
    cleaned_name = []
    return ' '.join([word for word in name.split() if (word[0].isalpha())])

In [None]:
def getCapitalizedWords(txt):
    """
    Returns strings of capitalized words up 3 words long. 
    Removes words/phrases containing stopwords and words found later in the text lowercased.
    """
    #listen idk why it won't just let me put in an optional repeat either
    all_words = []
    
    #three words
    all_words.extend([match for match in re.findall('[A-Z]\S* [A-Z]\S* [A-Z]\S+', txt) if 
                      all([(remove_punct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(remove_punct(word).lower() not in txt) for word in match.split()])])
    
    #two words
    two_words = [match for match in re.findall('[A-Z]\S* [A-Z]\S+', txt) if 
                      all([(remove_punct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(remove_punct(word).lower() not in txt) for word in match.split()])
                      and all([match not in x for x in all_words])]
    
    all_words.extend(two_words)
    
    #one word
    one_words = [match for match in re.findall('[A-Z]\S+', txt) if 
                      (remove_punct(match).lower() not in stopword_list) 
                      and (remove_punct(match).lower() not in txt)
                      and all([match not in x for x in all_words])]
    
    all_words.extend(one_words)
    
    return [word for word in [' '.join([removePunct(y) for y in x.split() if remove_punct(y) not in titles]) 
                      for x in all_words] if (len(word)>1)]

In [None]:
def consolidateNames(name_list):
    """
    Takes list of AuthNames and returns list of lists, consolidating by likely identical authors.
    """
    name_set = []
    used_indices = []
    last_names = sorted([name.last_name for name in name_list], key=len)
    
    for i, name in enumerate(last_names):
        if i not in used_indices:
            full_name = [x for x in name_list if x.last_name == name][0]
            name_holder = [full_name]
            for j, name2 in enumerate(last_names):
                full_name2 = [x for x in name_list if x.last_name == name2][0]
                if (i < (len(last_names) - 1)) and (i!=j):
                    if (edit_distance(name, name2[:len(name)+1]) < 2) and (j not in used_indices):
                        if (full_name.first_initial==full_name2.first_initial and full_name.middle_initial==full_name2.middle_initial or full_name.title==full_name2.title) or (full_name.first_initial==full_name2.first_initial or full_name.middle_initial==full_name2.middle_initial and full_name.title==full_name2.title):
                            name_holder.append(full_name2)
                            used_indices.append(j)
            used_indices.append(i)
            name_set.append(name_holder)
    
    return name_set                  

In [None]:
def obscure_matches(self, name = 'ex'):
    text_list = list(self.cleaned_text)
    if name == 'pub':
        for (x, y) in [pub.review_loc for pub in self.pub_names]:
            text_list[x:y] = list(len(self.cleaned_text[x:y]) * '@')
    if name == 'person':
        for (x, y) in [pers.review_loc for pers in self.person_names]:
            text_list[x:y] = list(len(self.cleaned_text[x:y]) * '@')
    return ''.join(text_list)

In [None]:
obscure_matches(review_list[35], name = 'pub')

In [None]:
review_list[0].no_pubs_text

In [None]:
cleaned_text = review_list[0].cleaned_text

In [None]:
text_list = list(cleaned_text)

In [None]:
for (x, y) in [x.review_loc for x in review_list[0].pub_names]:
    print(text_list[x:y])
    print(len(review_list[0].cleaned_text[x:y]) * '@')
    text_list[x:y] = list(len(review_list[0].cleaned_text[x:y]) * '@')

In [None]:
''.join(text_list)

In [None]:
def get_names_following_titles(review):
    """
    Returns names following titles - specifically capitalized titles followed by capitalized names.
    Names can be any number of words in length, and can include punctuation.
    
    """
    names = []
    spans = []
    
    txt = review.cleaned_text

    iterx = re.finditer(title_list, txt)
    indices = [(m.start(), m.group()) for m in iterx]
    
    for e, index in enumerate(indices):
        
        if (e==len(indices)-1):
            end_index = -1
        else:
            end_index = indices[e+1][0]
        
        end_span = len(txt[indices[e][0]:end_index])   
        get_match = re.finditer('[A-Z]\w+[^A-Z]|[A-Z].[^A-Z]', txt[indices[e][0]:end_index])
        matches = [(m.span(), m.group()) for m in get_match]
        matches.reverse()
        
        for n, m in enumerate(matches):
            if n<len(matches)-1:
                if (m[0][1] != matches[n-1][0][0]):
                    end_span = m[0][1]
        
        result = txt[indices[e][0]:(indices[e][0] + end_span - 1)]
        
        if len(result) > len(indices[e][1]):
            names.append(txt[indices[e][0]:(indices[e][0] + end_span - 1)])
            spans.append(indices[e][0])
        
    names = [word.replace("'s", "") for word in names]
    names = [PersonName(clean_name(word)) for word in names]
    
    for e, name in enumerate(names):
        name.review_id = review.review_id
        name.review_loc = (spans[e], spans[e]+len(name))
    
    return names

In [None]:
get_names_following_titles(review_list[20])

## ReviewObject

In [None]:
# class ReviewObj():
    
#     def __findnames(self):
#         self.pub_names = get_publishers(self)
#         self.person_names = get_names_following_titles(self)
        
#     def __init__(self, aps_id, txt):
#         self.review_id = aps_id
#         self.original_text = txt
#         self.cleaned_text = preprocess_text(txt)
        
#         self.__findnames()

In [None]:
# review_ex = ReviewObj(136726613, txts[0])

In [None]:
# for x in review_ex.person_names:
#     print(x)

In [None]:
# for filename, txt in zip([x.split('.')[0] for x in filenames],txts):
#     rev = ReviewObj(filename, txt)
#     print(filename)
#     if rev.person_names:
#         for x in rev.person_names:
#             print(x)
#         print()

In [None]:
# for x in review_ex.person_names:
#     print(x)

spacy NLP object steal their code

In [None]:
# for txt, filename in zip(txts, [x.split('.')[0] for x in filenames]):
#     print(filename)
#     docnames = get_names_following_titles(txt, filename)
#     for name in docnames:
#         print(name, name.review_loc)
#     print()

## Goals
Make each Person name aware of others and able to check for potential matches?

Unnamed: 0,date,C,55 day high,20 day low
0,2020-01-01 01:01:00,7147.69,7163.32,7147.69
1,2020-01-01 01:02:00,7158.31,7163.32,7147.69
2,2020-01-01 01:03:00,7164.08,7163.32,7147.69
3,2020-01-01 01:04:00,7157.01,7163.32,7147.69
4,2020-01-01 01:05:00,7159.85,7163.32,7147.69
5,2020-01-01 01:06:00,7161.29,7163.32,7147.69
6,2020-01-01 01:07:00,7161.29,7163.32,7147.69
7,2020-01-01 01:08:00,7145.28,7162.03,7147.69
8,2020-01-01 01:09:00,7161.29,7162.03,7147.69


In [9]:
import pandas as pd 
df = pd.DataFrame([
    ['2020-01-01 01:01:00', 7147.69, 7163.32, 7147.69],
    ['2020-01-01 01:02:00', 7158.31, 7163.32, 7147.69],
    ['2020-01-01 01:03:00', 7164.08, 7163.32, 7147.69],
    ['2020-01-01 01:04:00', 7157.01, 7163.32, 7147.69],
    ['2020-01-01 01:05:00', 7159.85, 7163.32, 7147.69],
    ['2020-01-01 01:06:00', 7161.29, 7163.32, 7147.69],
    ['2020-01-01 01:07:00', 7161.29, 7163.32, 7147.69],
    ['2020-01-01 01:08:00', 7145.28, 7162.03, 7147.69],
    ['2020-01-01 01:09:00', 7161.29, 7162.03, 7147.69],
], columns=['date', 'C', '55 day high', '20 day low'])

new_col = []
state = 0
for row in df.iterrows():
    if row[1]['C'] > row[1]['55 day high']:
        state = 1
    if row[1]['C'] < row[1]['20 day low']:
        state = 0
    new_col.append(state)

df['result'] = new_col
df

Unnamed: 0,date,C,55 day high,20 day low,result
0,2020-01-01 01:01:00,7147.69,7163.32,7147.69,0
1,2020-01-01 01:02:00,7158.31,7163.32,7147.69,0
2,2020-01-01 01:03:00,7164.08,7163.32,7147.69,1
3,2020-01-01 01:04:00,7157.01,7163.32,7147.69,1
4,2020-01-01 01:05:00,7159.85,7163.32,7147.69,1
5,2020-01-01 01:06:00,7161.29,7163.32,7147.69,1
6,2020-01-01 01:07:00,7161.29,7163.32,7147.69,1
7,2020-01-01 01:08:00,7145.28,7162.03,7147.69,0
8,2020-01-01 01:09:00,7161.29,7162.03,7147.69,0
