In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('books_big.csv')

In [3]:
# Drop missing reviews.
df = df.dropna(subset=['reviewText'])

In [4]:
df.shape

(8897450, 10)

In [5]:
feature = 'reviewText'
target = 'overall'

df = df[['reviewText', 'overall']]

In [7]:
df['overall'] = df['overall'].replace({5.0:1.0, 4.0:1.0, 3.0:0.0, 2.0:0.0, 1.0:0.0})
df.overall.value_counts(normalize=True)

1.0    0.809599
0.0    0.190401
Name: overall, dtype: float64

In [8]:
df.overall.value_counts()

1.0    7203363
0.0    1694087
Name: overall, dtype: int64

In [15]:
sample = df.sample(10000, random_state=42)

In [16]:
sample.overall.value_counts(normalize=True)

1.0    0.8078
0.0    0.1922
Name: overall, dtype: float64

In [17]:
from sklearn.model_selection import train_test_split

X = sample[feature]
y = sample[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify=y, random_state=1)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(tokenizer = None,
                            ngram_range = (1,2),
                            max_df = 0.5,
                            min_df = 2)

X_train_vecs = tfidf_vec.fit_transform(X_train)
X_test_vecs = tfidf_vec.transform(X_test)

In [19]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(objective='binary', is_unbalance=True, n_iterations=1571, 
                             max_bin=696, num_leaves=993, num_jobs=-1, seed=0)

model.fit(X_train_vecs, y_train);

In [20]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test_vecs)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.862


In [21]:
%%time
X = df['reviewText']
y = df['overall']
X_vecs = tfidf_vec.transform(X)

Wall time: 1h 2min 33s


In [22]:
df_pred = model.predict(X_vecs)
print("Test Accuracy:", accuracy_score(y, df_pred))

Test Accuracy: 0.8577329459564257


In [28]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y, df_pred)

array([[ 761486,  932601],
       [ 333213, 6870150]], dtype=int64)

In [31]:
cr = classification_report(y, df_pred)

In [32]:
print(cr)

              precision    recall  f1-score   support

         0.0       0.70      0.45      0.55   1694087
         1.0       0.88      0.95      0.92   7203363

    accuracy                           0.86   8897450
   macro avg       0.79      0.70      0.73   8897450
weighted avg       0.85      0.86      0.85   8897450



In [37]:
index = ['Negative', 'Positive', 'macro avg', 'weighted avg']
columns = ['Precision', "Recall", "f1-Score"]
observations = [[0.70, 0.45, 0.55],[0.88, 0.95, 0.92],[0.79, 0.70, 0.73],[0.85, 0.86, 0.85]]

In [38]:
binary_report = pd.DataFrame(index=index, data=observations, columns=columns)

In [39]:
binary_report.to_csv('c:/Users/Owen/unit2_build1/notebooks/cr.csv')