In [9]:
import math, ast
import pandas as pd 
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from utilities import text_functions as tf
from utilities import scikit_functions as sf

In [10]:
# Tally how many records fall under each review_type, largest group first.
df_meta = pd.read_csv('meta.csv', index_col=0)
review_type_counts = df_meta.groupby('review_type').agg({'review_type': ['count']})
type_summary = (
    review_type_counts
    .reset_index(drop=False)
    .sort_values(by=('review_type', 'count'), ascending=False)
)
type_summary

Unnamed: 0_level_0,review_type,review_type
Unnamed: 0_level_1,Unnamed: 1_level_1,count
5,single_focus,1093
2,multi,487
4,not_review,247
1,cluster,244
0,brief,82
3,no_pdf,2


In [11]:
# Tally how many single-author records fall under each parsed genre, largest group first.
df_single = pd.read_csv('single_author_meta.csv', index_col=0)
genre_counts = df_single.groupby('genre_parsed').agg({'genre_parsed': ['count']})
df_single_genre_summary = (
    genre_counts
    .reset_index(drop=False)
    .sort_values(by=('genre_parsed', 'count'), ascending=False)
)
df_single_genre_summary

Unnamed: 0_level_0,genre_parsed,genre_parsed
Unnamed: 0_level_1,Unnamed: 1_level_1,count
2,nonfiction,801
1,fiction,226
3,poetry,34
0,drama,18


In [13]:
# load ngrams data ... this will probably take about 30-60 seconds wall time
#
# Builds ngram_stores_lower[e][record_id] -> Counter of ngram counts, reading one
# CSV per (e, record_id) pair from extracted_features/ngrams/<e>/<record_id>.csv.

ngram_stores_lower = {}

base = 'extracted_features/ngrams'

for e in range(0, 5):
    ngram_store = {}
    for i in df_single['record_id']:
        this_csv = f'{base}/{e}/{i}.csv'
        df = pd.read_csv(this_csv, index_col=0).dropna().reset_index(drop=True).set_index('ngram')
        mydict = df['count'].to_dict()
        try:
            # Keys that are Python literals (e.g. stringified tuples) are parsed back
            # into real objects so they can be used as Counter keys.
            mycounter = Counter({ast.literal_eval(k): v for k, v in mydict.items()})
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Keys are not parseable literals (or parse to something unhashable):
            # keep the raw keys for this whole file. Only the failure modes of
            # literal_eval / dict construction are caught, so interrupts and
            # unrelated errors still surface instead of being silently swallowed.
            mycounter = Counter(mydict)
        ngram_store[i] = mycounter
    ngram_stores_lower[e] = ngram_store

In [14]:
# review text list of counters: one Counter per row of df_single, taken from store 0
first_store = ngram_stores_lower[0]
review_counters_all = [first_store[record_id] for record_id in df_single['record_id']]

# tokens handed to the removal helper: NLTK English stopwords plus the literal string 'nan'
tokens_to_remove = stopwords.words('english') + ['nan']
review_counters_all_no_stops = tf.remove_from_list_of_dicts(tokens_to_remove, review_counters_all)

In [15]:
# metadata text list of counters
# Each metadata table gets its text column renamed to the shared name
# 'match_string_raw' so the three frames can be processed uniformly.
df_authors = pd.read_csv('authors_meta.csv', index_col=0)
df_titles = pd.read_csv('titles_meta.csv', index_col=0)
df_publishers = pd.read_csv('publishers_meta.csv', index_col=0)

df_authors = df_authors.rename(columns={'reviewed_author_name': 'match_string_raw'})
df_titles = df_titles.rename(columns={'reviewed_book_title': 'match_string_raw'})
df_publishers = df_publishers.rename(columns={'reviewed_book_publisher': 'match_string_raw'})

def make_columns(df):
    """Add tokenised matching columns to a metadata frame.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'match_string_raw' column.

    Returns
    -------
    pandas.DataFrame
        The same frame with missing 'match_string_raw' values replaced by '',
        plus two new columns: 'match_column' (the result of
        tf.preprocess_text on the lowercased string) and 'no_stops'
        ('match_column' with NLTK English stopwords removed).
    """
    # Build the stopword set once: calling stopwords.words() for every token
    # is needlessly slow, and set membership is O(1).
    english_stops = set(stopwords.words('english'))
    df['match_string_raw'] = df['match_string_raw'].fillna('')
    df['match_column'] = [tf.preprocess_text(str(i).lower()) for i in df['match_string_raw']]
    df['no_stops'] = [[d for d in i if d not in english_stops] for i in df['match_column']]
    return df

df_authors = make_columns(df_authors)
df_titles = make_columns(df_titles)
df_publishers = make_columns(df_publishers)
df_authors.head(5)

Unnamed: 0,record_id,match_string_raw,match_column,no_stops
0,124900101,Lucien Carr,"[lucien, carr]","[lucien, carr]"
1,89760874,W. S. Jeans,"[w, s, jeans]","[w, jeans]"
2,89659668,Lord Byron,"[lord, byron]","[lord, byron]"
3,89659668,W. A. Lewis Bettany,"[w, a, lewis, bettany]","[w, lewis, bettany]"
4,124736362,Compton Mackenzie,"[compton, mackenzie]","[compton, mackenzie]"


## Naive BOW Matching

In [16]:
# Pool author, title and publisher tokens per record_id, then turn each
# record's pooled token list into a Counter (in df_single row order).
meta_counters_store = {}

for meta_df in [df_authors, df_titles, df_publishers]:
    for _, meta_row in meta_df.iterrows():
        kept_tokens = [tok for tok in meta_row['no_stops'].copy() if tok != 'nan']
        meta_counters_store.setdefault(meta_row['record_id'], []).extend(kept_tokens)

meta_counters_all = [Counter(meta_counters_store[rid]) for rid in df_single['record_id']]

In [17]:
review_meta_combined = review_counters_all_no_stops + meta_counters_all

In [18]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(review_meta_combined)
X_reviews_only = v.transform(review_counters_all_no_stops)
X_meta_only = v.transform(meta_counters_all)

# to here

In [19]:
%%time
reviews_meta_records = sf.pairwise_cosine(X_reviews_only, X_meta_only)

CPU times: user 3min 52s, sys: 14.4 s, total: 4min 7s
Wall time: 4min 8s


In [20]:
%%time
reviews_meta_scores = sf.pairwise_performance(reviews_meta_records)

CPU times: user 17.9 s, sys: 4.89 s, total: 22.8 s
Wall time: 22.9 s


In [21]:
reviews_meta_scores[0], len(reviews_meta_scores) 

([1, 0.8619091751621872], 791)

## Pretrained NER

In [None]:
df_ents_joined = pd.read_csv('extracted_features/spacy_entities_all.csv', index_col=0)
df_ents_joined.head(5)

In [None]:
spacy_ner_counters_store = {}
# Entity labels kept for matching; rows with any other label are dropped.
entities = ['PERSON', 'GPE', 'NORP', 'ORG', 'FAC', 'EVENT', 'LOC', 'PRODUCT', 'WORK_OF_ART', 'LAW']
# .copy() makes the selection an independent frame, so the column assignments
# below do not write into a slice of df_ents_joined (SettingWithCopyWarning).
df_ents_selected = df_ents_joined.loc[df_ents_joined['label'].isin(entities)].copy()
# Build the stopword set once instead of once per token.
ner_stops = set(stopwords.words('english'))
df_ents_selected['match_column'] = [tf.preprocess_text(str(i).lower()) for i in df_ents_selected['text']]
df_ents_selected['no_stops'] = [[d for d in i if d not in ner_stops] for i in df_ents_selected['match_column']]
df_ents_selected.head(5)


In [None]:
# Pool the stopword-free entity tokens of every selected entity per record_id.
for _, ent_row in df_ents_selected.iterrows():
    ent_tokens = ent_row['no_stops'].copy()
    spacy_ner_counters_store.setdefault(ent_row['record_id'], []).extend(ent_tokens)

# NOTE(review): indexing the store raises KeyError for any record_id in df_single
# that has no selected entities — confirm every review has at least one.
spacy_ner_counters_all = [Counter(spacy_ner_counters_store[rid]) for rid in df_single['record_id']]

In [None]:
spacy_meta_combined = spacy_ner_counters_all + meta_counters_all

v = DictVectorizer(sparse=False)
X = v.fit_transform(spacy_meta_combined)
X_spacy_ner_only = v.transform(spacy_ner_counters_all)
X_meta_only = v.transform(meta_counters_all)

In [None]:
spacy_meta_records = sf.pairwise_cosine(X_spacy_ner_only, X_meta_only)

In [None]:
spacy_meta_scores = sf.pairwise_performance(spacy_meta_records)

In [None]:
spacy_meta_scores[0], len(spacy_meta_scores)

## Rule-Based Matching

In [None]:
# select features based on all matching title ngrams, publisher ngrams, author surnames and associated names
# Flatten each frame's per-row token lists into a list of unique tokens.
x1 = df_publishers['no_stops'].tolist()
pub_tokens = list({token for row_tokens in x1 for token in row_tokens})

x2 = df_titles['no_stops'].tolist()
title_tokens = list({token for row_tokens in x2 for token in row_tokens})

In [None]:
# load extracted author data
df_ac = pd.read_csv('extracted_features/author_candidates.csv', index_col=0)
df_an = pd.read_csv('extracted_features/associated_names.csv', index_col=0)


def _parse_entity_lists(entity_column):
    """Parse a column of stringified Python lists into real lists.

    Missing cells are skipped: both real NaN values (pandas reads the text
    'nan' as missing by default) and the literal string 'nan'. Parsing uses
    ast.literal_eval rather than eval, so file content is never executed.
    """
    present = entity_column.dropna()
    present = present[present != 'nan']
    return [ast.literal_eval(raw) for raw in present]


# remove nan, lowercase all, reduce to unique
x3 = _parse_entity_lists(df_ac['entity'])
x4 = _parse_entity_lists(df_an['entity'])

candidate_tokens = list(set([j.lower() for i in x3 for j in i]))
associated_tokens = list(set([j.lower() for i in x4 for j in i]))

In [None]:
extracted_feature_tokens = [i for i in list(set(candidate_tokens + associated_tokens + pub_tokens + title_tokens)) if i != 'nan']
len(pub_tokens), len(title_tokens), len(candidate_tokens), len(associated_tokens), len(extracted_feature_tokens)

In [None]:
review_counters_extracted_features = tf.cull_list_of_dicts(extracted_feature_tokens, review_counters_all)

In [None]:
extracted_meta_combined = review_counters_extracted_features + meta_counters_all

v = DictVectorizer(sparse=False)
X = v.fit_transform(extracted_meta_combined)
X_extracted_only = v.transform(review_counters_extracted_features)
X_meta_only = v.transform(meta_counters_all)

In [None]:
extracted_meta_records = sf.pairwise_cosine(X_extracted_only, X_meta_only)
extracted_meta_scores = sf.pairwise_performance(extracted_meta_records)

In [None]:
# One (window -> recall) table per matching strategy, indexed by window.
plain_columns = ['window', 'recall']
df_scores_bow = pd.DataFrame.from_records(reviews_meta_scores, columns=plain_columns).set_index('window')
df_scores_rules = pd.DataFrame.from_records(extracted_meta_scores, columns=plain_columns).set_index('window')
df_scores_ner = pd.DataFrame.from_records(spacy_meta_scores, columns=['window', 'recall_ner']).set_index('window')

# Side-by-side table; keep windows that are multiples of 5, then the first ten of those.
df_scores_joined = (
    df_scores_bow
    .join(df_scores_rules, lsuffix='_bow', rsuffix='_rules')
    .join(df_scores_ner)
    .reset_index()
)
is_multiple_of_five = df_scores_joined['window'] % 5 == 0
df_scores_all = df_scores_joined.loc[is_multiple_of_five].head(10)

# Long format for plotting: one row per (window, strategy) pair.
display_names = {'recall_bow': 'Naive BOW', 'recall_ner': 'Pretrained NER', 'recall_rules': 'Rule-based'}
dfm = df_scores_all.rename(columns=display_names).melt('window', var_name='column', value_name='score')
dfm

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it and draw on the axes
# explicitly rather than relying on pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(9, 7))
sns.lineplot(x='window', y='score', hue='column', data=dfm, marker='o', markerfacecolor='black', ax=ax)
# Trailing semicolon suppresses the list-of-Text repr that .set() returns.
ax.set(title='Predicted Label is Correct or Close', ylabel='Percent of Reviews Correct or Close', xlabel='Proximity Threshold');


In [None]:
extracted_meta_scores[789]

In [None]:
spacy_meta_scores[-1]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def prf(records):
    """Return weighted (precision, recall, f1) for a set of pairwise records.

    'source' is used as the true label and 'target' as the predicted label of
    the frame built from `records`. All three metrics use weighted averaging
    and report 0.0, without a warning, for labels where the metric is
    undefined.
    """
    # NOTE(review): pairwise_df is not defined or imported anywhere in the
    # visible notebook — it presumably lives in utilities.scikit_functions
    # (sf); confirm and import it explicitly so this runs on a fresh kernel.
    df = pairwise_df(records)
    y_true, y_pred = df['source'], df['target']
    # Same settings for every metric; previously only precision set zero_division.
    metric_options = {'average': 'weighted', 'zero_division': 0.0}
    return (
        precision_score(y_true, y_pred, **metric_options),
        recall_score(y_true, y_pred, **metric_options),
        f1_score(y_true, y_pred, **metric_options),
    )

prf(reviews_meta_records), prf(extracted_meta_records), prf(spacy_meta_records)

## False Positives Analysis

In [None]:
def analyze_fp(df):
    """Count mismatched rows per target.

    Keeps the rows of `df` whose 'source' differs from 'target', counts the
    non-null 'score' values for each 'target', and returns a one-column
    ('score') frame indexed by target, sorted by that count in descending order.
    """
    mismatched = df[df['source'] != df['target']]
    per_target = mismatched.groupby('target')[['score']].count()
    return per_target.sort_values(by='score', ascending=False)

def tp_vs_fp_scores(df, index):
    """Return every row of `df` whose 'target' equals `index`, highest 'score' first."""
    hits_target = df['target'] == index
    selected = df[hits_target]
    return selected.sort_values(by='score', ascending=False)

In [None]:
naive_df = pairwise_df(reviews_meta_records)
naive_fp = analyze_fp(naive_df)

spacy_df = pairwise_df(spacy_meta_records)
spacy_fp = analyze_fp(spacy_df)

extracted_df = pairwise_df(extracted_meta_records)
extracted_fp = analyze_fp(extracted_df)

In [None]:
# Line up the per-target false-positive counts of the three strategies;
# targets missing from a strategy get a count of 0.
fps_naive_spacy = naive_fp.join(spacy_fp, how='outer', lsuffix='_naive', rsuffix='_spacy')
fps_joined = (
    fps_naive_spacy
    .join(extracted_fp, how='outer')
    .fillna(0)
    .rename(columns={'score': 'extracted_score'})
)
fps_joined['sum'] = fps_joined.sum(axis=1)
naive_plus_extracted = fps_joined['score_naive'] + fps_joined['extracted_score']
fps_joined['mean_naive_extracted'] = naive_plus_extracted / 2

In [None]:
fps_joined.sort_values(by='score_spacy', ascending=False).head(8)

In [None]:
fps_joined.sort_values(by='mean_naive_extracted', ascending=False).head(15).reset_index()

In [None]:
worst_spacy = fps_joined.sort_values(by='score_spacy', ascending=False).head(3).index.to_list()
worst_overall = fps_joined.sort_values(by='sum', ascending=False).head(15).index.to_list()
worst_naive_extracted = fps_joined.sort_values(by='mean_naive_extracted', ascending=False).head(15).index.to_list()

In [None]:
print(fps_joined[['score_naive','score_spacy','extracted_score']].corr().to_markdown())

In [None]:
def index_to_meta(fp_index):
    """Look up the raw metadata strings for the review at a given position.

    `fp_index` is a positional row number into df_single. Returns a list
    [authors, titles, publishers], where each element is the space-joined
    'match_string_raw' values of the rows sharing that review's record_id.
    """
    record_id = df_single.reset_index().iloc[fp_index]['record_id']
    joined_fields = []
    for meta_df in (df_authors, df_titles, df_publishers):
        raw_strings = meta_df.loc[meta_df['record_id'] == record_id, 'match_string_raw']
        joined_fields.append(' '.join(raw_strings.to_list()))
    return joined_fields


In [None]:
for i in worst_spacy:
    print(index_to_meta(i))

In [None]:
for i in worst_spacy:
    print(meta_counters_all[i].keys())

In [None]:
for i in worst_naive_extracted :
    print(index_to_meta(i))

In [None]:
for i in worst_naive_extracted:
    print(meta_counters_all[i].keys())

In [None]:
for i in worst_spacy:
    print(list(spacy_ner_counters_all[i].keys()))

In [None]:
tp_vs_fp_scores(naive_df, 63)

In [None]:
tp_vs_fp_scores(naive_df, 707)

In [None]:
def describe_tp_fp(df, strategy):
    """Summarise 'score' separately for matching and non-matching rows.

    Rows where 'source' equals 'target' feed the '<strategy>_tp' column and the
    remaining rows feed '<strategy>_fp'. Each column holds the output of
    Series.describe() for that subset.
    """
    labels_agree = df['source'] == df['target']
    tp_summary = df.loc[labels_agree, 'score'].describe().to_frame()
    fp_summary = df.loc[~labels_agree, 'score'].describe().to_frame()
    side_by_side = tp_summary.join(fp_summary, lsuffix='_tp', rsuffix='_fp')
    new_names = {'score_tp': f'{strategy}_tp', 'score_fp': f'{strategy}_fp'}
    return side_by_side.rename(columns=new_names)


In [None]:
naive_d = describe_tp_fp(naive_df, 'naive')
spacy_d = describe_tp_fp(spacy_df, 'ner')
extracted_d = describe_tp_fp(extracted_df, 'rule_based')

joined_d = naive_d.join(spacy_d).join(extracted_d)
print(joined_d[joined_d.index.isin(['mean', '25%','50%','75%','min', 'max'])].to_markdown())

In [None]:

def logit_on_cos_sim(df, class_weight=None):
    """Fit a one-feature logistic regression predicting match errors from 'score'.

    Label convention: 1 = incorrect match ('source' != 'target'),
    0 = correct match ('source' == 'target'). The model is fitted and then
    evaluated on the same rows (there is no held-out split).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'source', 'target' and 'score' columns.
    class_weight : dict, optional
        Passed to LogisticRegression. Defaults to {0: 0.2, 1: 0.8}, the
        weighting this function has always used.

    Returns
    -------
    pandas.DataFrame
        One row per input row, sorted by ascending 'score', with columns
        'score', 'predicted', 'actual', 'prob_correct' (probability of label 0),
        'prob_incorrect' (probability of label 1) and 'correctly_classified'.
    """
    if class_weight is None:
        class_weight = {0: 0.2, 1: 0.8}

    # Compute the correct/incorrect split once; incorrect rows come first,
    # then correct rows, so X and y stay aligned.
    is_correct = df['source'] == df['target']
    incorrect_scores = df.loc[~is_correct, 'score'].to_list()
    correct_scores = df.loc[is_correct, 'score'].to_list()

    X = incorrect_scores + correct_scores
    y = [1] * len(incorrect_scores) + [0] * len(correct_scores)
    X_array = np.array(X).reshape(-1, 1)

    clf = LogisticRegression(class_weight=class_weight).fit(X_array, y)
    predicted = clf.predict(X_array)
    # predict_proba columns follow clf.classes_, which scikit-learn keeps sorted:
    # column 0 is label 0 (correct), column 1 is label 1 (incorrect).
    probs = clf.predict_proba(X_array)

    df_logit = pd.DataFrame({
        'score': X,
        'predicted': predicted,
        'actual': y,
        'prob_correct': probs[:, 0],
        'prob_incorrect': probs[:, 1],
    })
    df_logit['correctly_classified'] = df_logit['predicted'] == df_logit['actual']
    return df_logit.sort_values(by='score')

def model_acc(df):
    """Return the fraction of rows whose 'predicted' value equals 'actual'."""
    n_matching = int((df['predicted'] == df['actual']).sum())
    n_rows = len(df)
    return n_matching / n_rows

In [None]:
df_logit_naive = logit_on_cos_sim(naive_df)
model_acc(df_logit_naive)

In [None]:
df_logit_spacy = logit_on_cos_sim(spacy_df)
model_acc(df_logit_spacy)

In [None]:
df_logit_extracted = logit_on_cos_sim(extracted_df)
model_acc(df_logit_extracted)

In [None]:
def get_quantile_accuracy(df, q=5):
    """Compare prediction outcomes in the lowest 1/q of rows against the rest.

    `df` is taken in its existing row order. The first ceil(n / q) rows form
    the "low" segment and all remaining rows form the "high" segment, so the
    two segments never share a row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'predicted', 'actual' and 'score' columns.
    q : int, default 5
        Number of equal parts; the low segment is one part.

    Returns
    -------
    pandas.DataFrame
        Indexed by (predicted, actual). For each segment it holds the row
        count plus the segment-wide minimum and maximum 'score', in columns
        suffixed '_low' and '_high'. Combinations absent from a segment are NaN.
    """
    r = math.ceil(df.shape[0] / q)
    # Split at position r. The previous head(r) / tail(r * (q - 1)) pair
    # overlapped whenever the row count was not a multiple of q, which counted
    # the rows around the boundary in both segments.
    df_low = df.iloc[:r]
    df_high = df.iloc[r:]
    output = []
    for segment in (df_low, df_high):
        counts = (
            segment.groupby(['predicted', 'actual'])
            .count()[['score']]
            .rename(columns={'score': 'count'})
        )
        counts['min_cos_sim'] = segment['score'].min()
        counts['max_cos_sim'] = segment['score'].max()
        output.append(counts)
    return output[0].join(output[1], how='outer', lsuffix='_low', rsuffix='_high')

get_quantile_accuracy(df_logit_naive)

In [None]:
# Low/high segment outcome tables for each strategy.
quantiles_naive = get_quantile_accuracy(df_logit_naive)
quantiles_extracted = get_quantile_accuracy(df_logit_extracted)
quantiles_spacy = get_quantile_accuracy(df_logit_spacy)

# Tag every spaCy column with '_spacy' so it stays distinguishable after joining.
m = {column: column + '_spacy' for column in quantiles_spacy.columns}
quantiles_spacy = quantiles_spacy.rename(columns=m)

In [None]:
# Combine the three strategies' tables; combinations missing from a table become 0.
quantiles_joined = (
    quantiles_naive
    .join(quantiles_extracted, lsuffix='_naive', rsuffix='_extracted')
    .join(quantiles_spacy)
    .fillna(0)
    .reset_index()
)

# Total rows per (predicted, actual) combination across both segments.
for strategy in ['naive', 'extracted', 'spacy']:
    low_counts = quantiles_joined[f'count_low_{strategy}']
    high_counts = quantiles_joined[f'count_high_{strategy}']
    quantiles_joined['sum_' + strategy] = low_counts + high_counts

# One frame per strategy: the key columns plus every column whose name contains the strategy.
key_columns = ['predicted', 'actual']
quantiles_separated = [
    quantiles_joined[key_columns + [col for col in quantiles_joined.columns if strategy in col]]
    for strategy in ['naive', 'extracted', 'spacy']
]

# Display the cosine-similarity bounds of the third (spaCy) frame.
spacy_quantiles = quantiles_separated[2]
spacy_quantiles[[col for col in spacy_quantiles.columns if 'cos_sim' in col]]


In [None]:
# For each strategy, print the share of rows in each segment whose predicted label equals the actual label.
for strategy_frame in quantiles_separated:
    agrees = strategy_frame['predicted'] == strategy_frame['actual']
    correct = strategy_frame.loc[agrees].set_index('actual')
    correct_totals = correct.sum()
    overall_totals = strategy_frame.sum()
    accuracies = []
    for y in ['count_low', 'count_high']:
        matching_columns = [name for name in strategy_frame.columns if y in name]
        count_column = matching_columns[0]
        acc = correct_totals[count_column] / overall_totals[count_column]
        print(y, acc)
        accuracies.append(acc)

In [None]:
math.ceil(1079/4)

In [None]:
df_logit_naive.tail(500).groupby(['actual']).count()

In [None]:
def p_r_plot(df, strategy):
    """Plot a precision-recall curve for the "correct match" class and return its points.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'actual' (0 = correct, 1 = incorrect) and 'prob_correct'
        (the score for label 0) columns.
    strategy : str
        Name shown in the plot title.

    Returns
    -------
    pandas.DataFrame
        Columns 'precision', 'recall' and 'thresholds', one row per curve
        point. The final row (precision 1, recall 0) has no threshold and
        holds NaN there.

    Side effect: sets the global matplotlib default marker size to 4.
    """
    mpl.rcParams['lines.markersize'] = 4
    fig, ax = plt.subplots(figsize=(9, 7))
    precision, recall, thresholds = precision_recall_curve(df['actual'], df['prob_correct'], pos_label=0)
    # precision and recall hold one more element than thresholds: element i
    # belongs to thresholds[i], and the extra LAST element has no threshold.
    # Pad at the end; padding at the front shifts every threshold by one row.
    df_pr_curve = pd.DataFrame({
        'precision': precision,
        'recall': recall,
        'thresholds': np.append(thresholds, np.nan),
    })
    sns.lineplot(x='recall', y='precision', data=df_pr_curve, marker='o', markerfacecolor='black', ax=ax).set(title=f'Precision-Recall Curve ({strategy} Model)', ylabel='Precision', xlabel='Recall')
    return df_pr_curve


In [None]:
naive_curve = p_r_plot(df_logit_naive, "Naive BOW")

In [None]:
spacy_curve = p_r_plot(df_logit_spacy, "Pre-trained NER")

In [None]:
extracted_curve = p_r_plot(df_logit_extracted, "Rule-Based")