# Initial Setup

- import various helpers, load data, select reviews by status and category

In [4]:
import sys

# Add the parent directory to the import search path
# (presumably so the project-local `database` package imported below resolves).
sys.path.append("../")

In [5]:
import sqlite3
import pandas as pd
import re
from collections import Counter

In [6]:
from database import *
import database.models as models

In [7]:
# Load every Review row whose status is one of the three values listed below.
# Each row carries `review_type` and `full_text`, which later cells read.
aps_rows = (
    models.Review()
    .query
    .filter(models.Review.status.in_(('needs_crosscheck', 'needs_details', 'done')))
    .all()
)

In [8]:
# NOTE(review): absolute path to an external volume -- not portable; consider a
# configurable data directory. The connection is left open in this cell.
conn = sqlite3.connect('/Volumes/TOSHIBA EXT/datasets/nyt_reviews_datastore.db')
c = conn.cursor()

# Pull all metadata rows whose review_type is one of the five listed categories
# (the review kinds plus 'not_review').
nyt_rows = c.execute(
    "SELECT * FROM metadata "
    "WHERE review_type IN ('not_review', 'multi', 'cluster', 'really_multi', 'single_focus')"
).fetchall()
len(nyt_rows)

8569

In [9]:
# import various from scikit learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

## Single-work vs. Multi-work Reviews

- The exemplar of a single-work review is very clear, as is the exemplar of a review that covers more than one work.
- Complications and edge cases arise when a review is predominantly about one book but includes a section comparing it to another book, and because multi-work reviews vary greatly. Some columns like "Latest Fiction" are scanned as separate single-work reviews, some as one object. In general, I have found it desirable to isolate clear single-work reviews from others for information extraction or review classification tasks, but other methods wouldn't require this.
- It may be desirable to target multi-work reviews if, for example, you want "in the same review" to be edge weights in a network

In [10]:
# Partition the loaded reviews by their review_type tag.
aps_single = [row for row in aps_rows if row.review_type == 'single_focus']
aps_not_single = [row for row in aps_rows if row.review_type in ('multi', 'cluster')]
# len(aps_single), len(aps_not_single) >>> (1003, 550)

# Texts are ordered single-work first, then multi-work, so the labels below
# can be built positionally.
aps_list_of_full_txt = [row.full_text for row in aps_single + aps_not_single]
# "true labels" for scikit-learn scoring: 0 = single-work, 1 = multi-work
aps_labels = [0] * len(aps_single) + [1] * len(aps_not_single)
# len(aps_list_of_full_txt) == len(aps_labels) >>> True

# Raw term counts -> TF-IDF weights.
# NOTE(review): both transformers are fitted on the full corpus before the
# train/test split, so held-out documents contribute to the vocabulary and IDF weights.
v = CountVectorizer()
X = v.fit_transform(aps_list_of_full_txt)
tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

# Hold out 33% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    Z, aps_labels, test_size=0.33, random_state=81
)

# Logistic regression that weights class 1 (multi-work) more heavily than
# class 0 (presumably to offset the class imbalance); fitted in a later cell.
lr = LogisticRegression(class_weight={0: 0.35, 1: 0.65})

In [21]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic regression over the full TF-IDF matrix;
# report the mean fold score and twice the standard deviation across folds.
scores = cross_val_score(lr, Z, aps_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.79 (+/- 0.06)


In [17]:
# Fit on the training split, then predict hard labels and per-class
# probabilities for the held-out split.
lr.fit(X_train, y_train)
results = lr.predict(X_test)
probs = lr.predict_proba(X_test)

# Per-class f1 / precision / recall, treating each label in turn as the
# "positive" class, plus overall accuracy.
# I will discuss each of these in the lesson
scores = {}
for class_name, positive in (("single-work review", 0), ("multi-work review", 1)):
    scores[class_name] = {
        "f1": f1_score(y_test, results, pos_label=positive, average='binary'),
        "precision": precision_score(y_test, results, pos_label=positive, average='binary'),
        "recall": recall_score(y_test, results, pos_label=positive, average='binary'),
    }

scores["accuracy"] = accuracy_score(y_test, results)
scores

{'single-work review': {'f1': 0.8204334365325077,
  'precision': 0.8412698412698413,
  'recall': 0.8006042296072508},
 'multi-work review': {'f1': 0.6947368421052631,
  'precision': 0.6666666666666666,
  'recall': 0.7252747252747253},
 'accuracy': 0.7738791423001949}

In [18]:
# Pair every vocabulary term with the logistic-regression coefficient stored
# at that term's column index.
terms = list(v.vocabulary_.keys())
coefs = [lr.coef_[0][col] for col in v.vocabulary_.values()]

# Sort ascending by coefficient, so the head of the frame holds the 30 terms
# most strongly associated with label 0.
df_coef = pd.DataFrame({'term': terms, 'coef': coefs})
df_coef = df_coef.sort_values(by='coef').reset_index(drop=True)
df_coef.head(30)

Unnamed: 0,term,coef
0,he,-0.721452
1,his,-0.668302
2,that,-0.667718
3,was,-0.639716
4,him,-0.435286
5,not,-0.409498
6,had,-0.405744
7,were,-0.373912
8,to,-0.30882
9,as,-0.307325


In [22]:
# df_coef is sorted ascending by coefficient, so the last 30 rows are the
# terms with the largest positive coefficients (pulling toward label 1).
df_coef.tail(n=30)

Unnamed: 0,term,coef
69454,illustrated,0.44149
69455,edited,0.443003
69456,contains,0.451154
69457,edition,0.4686
69458,stories,0.482132
69459,boston,0.485738
69460,series,0.494175
69461,00,0.498977
69462,books,0.550457
69463,mr,0.553731


In [24]:
# Support vector classifier with scikit-learn's default settings,
# scored with 5-fold cross-validation on the TF-IDF matrix.
# NOTE(review): the recorded 0.65 (+/- 0.00) matches the single-work share of
# the data (1003 of 1553), which suggests the model may be predicting a single
# class -- confirm before drawing conclusions.
from sklearn.svm import SVC

svm = SVC()
scores = cross_val_score(svm, Z, aps_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.65 (+/- 0.00)


In [25]:
from sklearn import linear_model

# Lasso is a linear *regressor*, not a classifier. cross_val_score falls back
# to the estimator's own `score` method when no `scoring` is given, and for a
# regressor that is the coefficient of determination (R^2) -- which is why the
# value can be negative. The metric is therefore requested explicitly and
# labelled as R^2 rather than "Accuracy"; it is not comparable to the
# accuracy figures reported for the classifiers in the other cells.
lasso = linear_model.Lasso()
scores = cross_val_score(lasso, Z, aps_labels, cv=5, scoring='r2')
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: -0.32 (+/- 1.26)


In [32]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k = 5), scored with 5-fold cross-validation
# on the TF-IDF matrix.
neigh = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(neigh, Z, aps_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.37 (+/- 0.01)


In [33]:
from sklearn.svm import SVC

# Same 5-fold evaluation as the earlier SVC cell, but with a sigmoid kernel.
svm = SVC(kernel='sigmoid')
scores = cross_val_score(svm, Z, aps_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.65 (+/- 0.00)


In [62]:
from tensorflow import keras

# Re-vectorise the corpus with the vocabulary capped at 10,000 terms so the
# feature width matches the network's input shape below.
# NOTE(review): this rebinds v, X, Z, tfidf and the train/test split used by
# the earlier logistic-regression cells; re-running the coefficient cell after
# this one would pair lr.coef_ with a different vocabulary.
v = CountVectorizer(max_features=10000)
X = v.fit_transform(aps_list_of_full_txt)
tfidf = TfidfTransformer()
Z = tfidf.fit_transform(X)

# Small feed-forward network: two 16-unit ReLU layers followed by a single
# sigmoid output unit for the binary single-work / multi-work label.
model = keras.Sequential()
model.add(keras.layers.Flatten(input_shape=(10000,)))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Fresh 67/33 split of the capped TF-IDF matrix (different seed from the earlier split).
X_train, X_test, y_train, y_test = train_test_split(
    Z, aps_labels, test_size=0.33, random_state=17
)


In [63]:
# Binary cross-entropy loss (pairs with the single sigmoid output unit),
# Adam optimiser, and accuracy tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [66]:
# Train for 30 passes over the training split in mini-batches of 300 rows.
model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=300,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1262734e0>

In [67]:
# Score the trained network on the held-out split; evaluate() returns the loss
# followed by the metric requested at compile time (accuracy).
test_loss, test_acc = model.evaluate(
    X_test,
    y_test,
)
print('Test accuracy:', test_acc)

Test accuracy: 0.783625730994152


In [57]:
#