In [1]:
# Load one million book reviews.
import pandas as pd
df = pd.read_csv('c:/Users/Owen/books_small.csv')

# Drop missing reviews.
df = df.dropna(subset=['reviewText'])

In [2]:
import string
import spacy
from spacy.lang.en import English
# Create punctuation list.
punctuations = string.punctuation

# Create stopwords list.
# nlp = spacy.load('en_core_web_lg')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load tokenizer, tagger, parser, NER, and word vectors.
parser = English()

# Create tokenizer function.
def spacy_tokenizer(sentence):
    # Create token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)
    
    # For each token, lemmatize and change to lowercase.
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-'
                else word.lower_ for word in mytokens]
    # Remove stop words.
    mytokens = [word for word in mytokens if word not in stop_words
                and word not in punctuations]
    # Return preprocessed list of tokens.
    return mytokens

In [2]:
# Make a copy of original df so you don't have to reload the original every new experiment.
copy = df.copy()

feature = 'reviewText'
target = 'overall'

copy = copy[['reviewText', 'overall']]

In [4]:
small_copy = copy.sample(100000, random_state=1)
small_copy = small_copy.replace({5.0:1.0, 4.0:1.0, 3.0:0.0, 2.0:0.0, 1.0:0.0})
small_copy.overall.value_counts(normalize=True)

1.0    0.80892
0.0    0.19108
Name: overall, dtype: float64

In [5]:
from sklearn.model_selection import train_test_split
X = small_copy[feature]
y = small_copy[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify=y, random_state=1)

In [6]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(tokenizer = spacy_tokenizer,
                            ngram_range = (1,1),
                            max_df = 0.5,
                            min_df = 2)

X_train_vecs = tfidf_vec.fit_transform(X_train)
X_test_vecs = tfidf_vec.transform(X_test)

Wall time: 2min 30s


In [7]:
X_train_vecs.shape, X_test_vecs.shape

((80000, 69425), (20000, 69425))

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from scipy.stats import randint

# n_estimators for final model: 1571
model = LGBMClassifier(objective='binary', is_unbalance=True, num_jobs=-1, seed=0)

params = { 
    'model__num_leaves': randint(1, 1000), 
    'model__max_bin': randint(254, 1000),  
}

search = RandomizedSearchCV(
    model, 
    param_distributions=params, 
    n_iter=5, 
    cv=3, 
    scoring='f1', 
    verbose=30, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train_vecs, y_train);

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  2.8min remaining: 18.0min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:  2.8min remaining: 11.1min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  2.8min remaining:  7.7min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:  2.8min remaining:  5.6min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  2.8min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done   7 out of  15 | elapsed:  2.8min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  2.8min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed:  5.4min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  5.4min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed:  5.4min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed

In [9]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation Accuracy', -search.best_score_)

Best hyperparameters {'model__max_bin': 696, 'model__num_leaves': 993}
Cross-validation Accuracy -0.8685522721018395


In [12]:
from sklearn.metrics import accuracy_score
y_pred = search.predict(X_test_vecs)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.80555


In [13]:
%%time
model_final = LGBMClassifier(objective='binary', is_unbalance=True, n_iterations=1571, 
                             max_bin=696, num_leaves=993, num_jobs=-1, seed=0)

model_final.fit(X_train_vecs, y_train);

Test Accuracy: 0.80555


In [16]:
y_preds = model_final.predict(X_test_vecs)
print("Test Accuracy:", accuracy_score(y_test, y_preds))

Test Accuracy: 0.8686


In [21]:
%%time
final = copy.sample(10000, random_state=42)
final = final.replace({5.0:1.0, 4.0:1.0, 3.0:0.0, 2.0:0.0, 1.0:0.0})
X = final[feature]
y = final[target]
X_final = tfidf_vec.transform(X)
y_final = model_final.predict(X_final)
print("Test Accuracy:", accuracy_score(y, y_final))

Test Accuracy: 0.8713
Wall time: 9.18 s


In [23]:
# Pickle stuff.
from joblib import dump
dump(model_final, 'model.joblib', compress=True)
dump(tfidf_vec, 'vectorizer.joblib', compress=True)

['vectorizer.joblib']

In [31]:
!pip install dill

Collecting dill
  Downloading https://files.pythonhosted.org/packages/c7/11/345f3173809cea7f1a193bfbf02403fff250a3360e0e118a1630985e547d/dill-0.3.1.1.tar.gz (151kB)
Building wheels for collected packages: dill
  Building wheel for dill (setup.py): started
  Building wheel for dill (setup.py): finished with status 'done'
  Created wheel for dill: filename=dill-0.3.1.1-cp37-none-any.whl size=78598 sha256=9fcd2d6f1d6d8e6bce40d9b401338cd465b381a09082eed5c4bc02cef54c9964
  Stored in directory: C:\Users\Owen\AppData\Local\pip\Cache\wheels\59\b1\91\f02e76c732915c4015ab4010f3015469866c1eb9b14058d8e7
Successfully built dill
Installing collected packages: dill
Successfully installed dill-0.3.1.1


In [4]:
copy = df.sample(10000, random_state=42)
copy = copy.replace({5.0:1.0, 4.0:1.0, 3.0:0.0, 2.0:0.0, 1.0:0.0})
copy.to_csv('10k_books.csv')

In [None]:
import dill

In [None]:
dill.dump_session('foobar.pkl')

In [29]:
# Get versions of packages used.
import joblib
import sklearn
import lightgbm
import spacy
print(f'joblib=={joblib.__version__}')
print(f'scikit-learn=={sklearn.__version__}')
print(f'lightgbm=={lightgbm.__version__}')
print(f'spacy=={spacy.__version__}')

joblib==0.14.0
scikit-learn==0.21.3
lightgbm==2.3.0
spacy==2.2.1


In [28]:
import string
print(f'string=={string.__version__}')

AttributeError: module 'string' has no attribute '__version__'