# Initial Setup

- import various helpers, load data, select reviews by status and category

In [1]:
import sys
sys.path.append('../')

In [2]:
import os
import pandas as pd
import sqlite3

In [3]:
from database import *
import database.models as models

In [4]:
# load full text from db
aps_rows = models.Review().query.filter(models.Review.status.in_(('needs_crosscheck', 'needs_details', 'done'))).all()

In [5]:
conn = sqlite3.connect('/Volumes/TOSHIBA EXT/datasets/nyt_reviews_datastore.db')
c = conn.cursor()
# query nyt reviews and not
nyt_rows = c.execute("SELECT * FROM metadata WHERE review_type IN ('not_review', 'multi', 'cluster', 'really_multi', 'single_focus')").fetchall()
len(nyt_rows)

8569

In [6]:
aps_single_focus = [i for i in aps_rows if i.review_type == 'single_focus']
len(aps_single_focus)

1003

## In NYTBR section, Book Review or Not Book Review

In [7]:
nyt_rows[0][3]

'A NEW ESSAYIST.; C.F.G. Masterman, M.P., Criticises Kipling and Other British Institutions.'

In [8]:
nyt_not_review = [i for i in nyt_rows if i[12] == 'not_review']
nyt_review = [i for i in nyt_rows if i[12] in ('multi', 'cluster', 'really_multi', 'single_focus')]
len(nyt_review), len(nyt_not_review)

(4242, 4327)

In [9]:
list_of_full_txt = [i[4] for i in nyt_review] + [i[4] for i in nyt_not_review]
# make "true labels" (0s and 1s so scikit learn can score them)
nyt_labels = [0 for i in range(len(nyt_review))] + [1 for i in range(len(nyt_not_review))]
len(list_of_full_txt) == len(nyt_labels)

True

In [10]:
# import various from scikit learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score

In [11]:
# set up logistic regression
v = CountVectorizer(max_features)
X = v.fit_transform(list_of_full_txt)
tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)
# instantiate the model
lr = LogisticRegression()

In [12]:
scores = cross_val_score(lr, Z, nyt_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.84 (+/- 0.09)


In [13]:
# split the rows into training data, training labels, test data, and test labels
# test on 33% of the data
X_train, X_test, y_train, y_test = train_test_split(Z, nyt_labels, test_size=0.33, random_state=12)

# fit to the training data
lr.fit(X_train, y_train)

# make label predictions
results = lr.predict(X_test)

# generate probabilities for each label
probs = lr.predict_proba(X_test)

In [22]:
scores = {}
# generate f1, precision, recall, and accuracy scores
# I will discuss each of these in the lesson
for y,z in [("review",0),("not_review",1)]:
    scores[y] = {}
    scores[y]["f1"] = f1_score(y_test, results, pos_label=z, average='binary')  
    scores[y]["precision"] = precision_score(y_test, results, pos_label=z, average='binary')
    scores[y]["recall"] = recall_score(y_test, results, pos_label=z, average='binary')

scores["accuracy"] = accuracy_score(y_test, results)
scores

{'review': {'f1': 0.884814942926323,
  'precision': 0.8498338870431894,
  'recall': 0.9227994227994228},
 'not_review': {'f1': 0.8795660036166365,
  'precision': 0.9191232048374905,
  'recall': 0.8432732316227461},
 'accuracy': 0.8822489391796322}

In [23]:
terms = []
coefs = []
for key,val in v.vocabulary_.items():
    terms.append(key)
    coefs.append(lr.coef_[0][val])

In [24]:
# this block produces a dataframe with the top 30 terms associated with label 0
df_coef = pd.DataFrame()
df_coef['term'] = terms
df_coef['coef'] = coefs
df_coef = df_coef.sort_values(by='coef').reset_index(drop=True)
df_coef.head(30)

Unnamed: 0,term,coef
0,pp,-5.146061
1,he,-3.361216
2,and,-2.945701
3,her,-2.848645
4,volume,-2.781168
5,she,-2.397521
6,with,-2.388235
7,tile,-2.328123
8,reader,-2.254851
9,12mo,-2.213153


In [25]:
# to view the top 30 terms associated with label 1, we look at the bottom rows of the same dataframe
#I2mo ... pub announcements have both, but OCR may be of a lower quality with long lists and blurb announcements
#or just more hances to get it wrong
df_coef.tail(30)

Unnamed: 0,term,coef
408132,literary,1.966858
408133,number,2.005087
408134,has,2.0067
408135,publishers,2.025496
408136,magazine,2.029833
408137,yesterday,2.068803
408138,performance,2.113786
408139,l2mo,2.126622
408140,london,2.154999
408141,times,2.157221


In [26]:
I2mos = [i for i in nyt_not_review if 'I2mo' in i[4]]
I2mos[1][4][:100]

'LATEST PUBLICATIONS  Books Received During the Week Ended July 25 Classified and Annotated According'

In [27]:
# run on aps_reviews
# make predictions using nonbinary data 
aps_reviews = [i.full_text for i in aps_rows if i.review_type in ('single_focus', 'multi', 'cluster')]
aps_ids = [i.record_id for i in aps_rows if i.review_type in ('single_focus', 'multi', 'cluster')]
aps_urls = ["https://aps-web-app.matthew-lavin.com/static/pdf/%s.pdf"%i for i in aps_ids]

aps_vectors = v.transform(aps_reviews)
aps_tfidf = tfidf.fit_transform(aps_vectors)

# generate probabilities for each label
aps_probs = lr.predict_proba(aps_tfidf)

#display the results as a pandas dataframe
aps_results = pd.DataFrame()

# make columns for the original label, the nyt_id, the cluster_id, the pdf url, and the predicted probabilities

aps_results['aps_id'] = aps_ids
aps_results['url'] = aps_urls
aps_results['prob_review'] = [i[0] for i in aps_probs]
aps_results['prob_not_review'] = [i[1] for i in aps_probs]
len(aps_results.loc[aps_results['prob_review'] > 0.5].reset_index())/len(aps_results)
#79.39% of aps reviews have a naive probability score over .5

0.793947198969736

In [28]:
#this doesn't tell us how many false positives we might get, just that a model trained on NYT reviews usually recognizes APS reviews as reviews
aps_not_reviews = [i.full_text for i in aps_rows if i.review_type == 'not_review']
aps_non_review_ids = [i.record_id for i in aps_rows if i.review_type == 'not_review']
aps_non_review_urls = ["https://aps-web-app.matthew-lavin.com/static/pdf/%s.pdf"%i for i in aps_non_review_ids]

aps_non_review_vectors = v.transform(aps_not_reviews)
aps_non_review_tfidf = tfidf.fit_transform(aps_non_review_vectors)

# generate probabilities for each label
aps_non_review_probs = lr.predict_proba(aps_non_review_tfidf)

#display the results as a pandas dataframe
aps_non_review_results = pd.DataFrame()

# make columns for the original label, the nyt_id, the cluster_id, the pdf url, and the predicted probabilities

aps_non_review_results['aps_id'] = aps_non_review_ids
aps_non_review_results['url'] = aps_non_review_urls
aps_non_review_results['prob_review'] = [i[0] for i in aps_non_review_probs]
aps_non_review_results['prob_not_review'] = [i[1] for i in aps_non_review_probs]
len(aps_non_review_results.loc[aps_non_review_results['prob_not_review'] > 0.5].reset_index())/len(aps_non_review_results)
# 58.93% of non-reviews would have a non-review probability over 50%, so we might want to adjust to reduce false positives 
# However, say we started with a mix of 80/20 reviews and not reviews
# If we got these results with 1000 objects, we would have 635 true postives, 165 false negatives, 118 true negatives and 82 false positives
# If this were all true, we'd be running calculations on a sample that's 88.5% book reviews and 11.5% not
# Pretty good, but we want better, especially the false positive
# Option 1: improve the model with data, setup, or learning method (labor)
# Option 2: raise the probability threshold to be considered a review (also creates more false negatives)

0.5893719806763285

## Other Models

- As the number of models evaluated on the same data increases, the odds of model performance be good just by chance goes up. This is not as straightforward as a p-value with multiple hypotheses, but it needs to be considered.
- As a result, this is exploratory, and models should be validated against separate datasets in the future 

In [14]:
#svm
from sklearn.svm import SVC
svm = SVC()
scores = cross_val_score(svm, Z, nyt_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.50 (+/- 0.00)


In [15]:
from sklearn import linear_model
lasso = linear_model.Lasso()
scores = cross_val_score(lasso, Z, nyt_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: -0.00 (+/- 0.00)


In [16]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(neigh, Z, nyt_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.61 (+/- 0.13)


In [18]:
from sklearn.svm import SVC
svm = SVC(kernel='sigmoid')
scores = cross_val_score(svm, Z, nyt_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.50 (+/- 0.00)


In [77]:
from tensorflow import keras 

v = CountVectorizer(max_features=10000)
X = v.fit_transform(list_of_full_txt)
v.vocabulary_

tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(10000,)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
X_train, X_test, y_train, y_test = train_test_split(Z, nyt_labels, test_size=0.33, random_state=42)

In [78]:
tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(10000,)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
X_train, X_test, y_train, y_test = train_test_split(Z, nyt_labels, test_size=0.33, random_state=42)

In [79]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [80]:
model.fit(X_train, y_train, epochs=20, batch_size=500)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x12d863748>

In [81]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.8988684583587053


In [82]:
from tensorflow import keras 

v = CountVectorizer(max_features=30)
X = v.fit_transform(list_of_full_txt)

tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(30,)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
X_train, X_test, y_train, y_test = train_test_split(Z, nyt_labels, test_size=0.33, random_state=14)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=5)
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test accuracy: 0.7142857142014079
