In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score
from gensim.parsing.preprocessing import strip_punctuation, remove_stopwords, strip_non_alphanum

In [2]:
# Load data from csv
df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')
test_df = pd.read_csv('test.csv', index_col=False)

# Split rating count
rating_count_split = 1000
df = df[df.rating_count >= rating_count_split]

# Combine title and description into one
df['title_description'] = df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)
test_df['title_description'] = test_df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)

# Remove non-ascii rows
df = df[df.title_description.map(lambda x: x.isascii())]
test_df = test_df[test_df.title_description.map(lambda x: x.isascii())]

# Convert to lowercase
df.title_description = df.title_description.str.lower()
test_df.title_description = test_df.title_description.str.lower()

# Remove punctuation
df.title_description = df.title_description.apply(lambda x: strip_punctuation(x))
test_df.title_description = test_df.title_description.apply(lambda x: strip_punctuation(x))

# Remove stopwords
df.title_description = df.title_description.apply(lambda x: remove_stopwords(x))
test_df.title_description = test_df.title_description.apply(lambda x: remove_stopwords(x))

# Remove non-alpha
df.title_description = df.title_description.apply(lambda x: strip_non_alphanum(x))
test_df.title_description = test_df.title_description.apply(lambda x: strip_non_alphanum(x))

  df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')


In [3]:
# Binning label
bins = (0, 3, 3.5, 4, 4.5, 5)
labels = np.arange(len(bins)-1)
df.rating = pd.cut(df.rating, bins=bins, labels=labels, include_lowest=True)
test_df.rating = pd.cut(test_df.rating, bins=bins, labels=labels, include_lowest=True)

In [4]:
train_df, valid_df = train_test_split(df, test_size=0.1, random_state=1, stratify=df.rating.values)

print('Train:', train_df.shape)
print('Valid:', valid_df.shape)
print('Test:', test_df.shape)

Train: (45576, 11)
Valid: (5064, 11)
Test: (51, 6)


In [5]:
# Word embedding
vectorizer = TfidfVectorizer()
vectorizer.fit(df.title_description)

X_train = vectorizer.transform(train_df.title_description)
X_valid = vectorizer.transform(valid_df.title_description)
X_test = vectorizer.transform(test_df.title_description)

y_train = train_df.rating.values
y_valid = valid_df.rating.values
y_test = test_df.rating.values

print('Input shape:', X_train[0].shape)

Input shape: (1, 108138)


In [6]:
# Training
model = LogisticRegression(multi_class='multinomial', max_iter=1000)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_train)
print('train acc:', accuracy_score(y_pred, y_train))

y_pred = model.predict(X_valid)
print('valid acc:', accuracy_score(y_pred, y_valid))

y_pred = model.predict(X_test)
print('test acc:', accuracy_score(y_pred, y_test))

train acc: 0.8377654906090926
valid acc: 0.6958925750394944
test acc: 0.5882352941176471


In [7]:
joblib.dump(model, 'LR_classify_model.abc')

['LR_classify_model.abc']