In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.metrics import mean_absolute_error as mae, r2_score, mean_squared_error as mse
from sklearn.feature_selection import SelectKBest, f_regression
from gensim.parsing.preprocessing import strip_punctuation, remove_stopwords, strip_non_alphanum

In [2]:
def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

In [3]:
# Load data from csv
df = pd.read_csv('../datasets/goodreads-300k-dataset/goodreads.csv')
test_df = pd.read_csv('../datasets/test.csv', index_col=False)

# Split rating count
rating_count_split = 1000
df = df[df.rating_count >= rating_count_split]

# Combine title and description into one
df['title_description'] = df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)
test_df['title_description'] = test_df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)

# Remove non-ascii rows
df = df[df.title_description.map(lambda x: x.isascii())]

# Convert to lowercase
df.title_description = df.title_description.str.lower()
test_df.title_description = test_df.title_description.str.lower()

# Remove punctuation
df.title_description = df.title_description.apply(lambda x: strip_punctuation(x))
test_df.title_description = test_df.title_description.apply(lambda x: strip_punctuation(x))

# Remove stopwords
df.title_description = df.title_description.apply(lambda x: remove_stopwords(x))
test_df.title_description = test_df.title_description.apply(lambda x: remove_stopwords(x))

# Remove non-alpha
df.title_description = df.title_description.apply(lambda x: strip_non_alphanum(x))
test_df.title_description = test_df.title_description.apply(lambda x: strip_non_alphanum(x))

  df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')


In [4]:
train_df, valid_df = train_test_split(df, test_size=0.1, random_state=1)

print('Train:', train_df.shape)
print('Valid:', valid_df.shape)
print('Test:', test_df.shape)

Train: (45576, 11)
Valid: (5064, 11)
Test: (100, 6)


In [5]:
# Word embedding
vectorizer = TfidfVectorizer()
vectorizer.fit(df.title_description)

X_train = vectorizer.transform(train_df.title_description)
X_valid = vectorizer.transform(valid_df.title_description)
X_test = vectorizer.transform(test_df.title_description)

y_train = train_df.rating.values
y_valid = valid_df.rating.values
y_test = test_df.rating.values

print('Input shape:', X_train[0].shape)

Input shape: (1, 108138)


In [6]:
# Feature selection
select = SelectKBest(score_func=f_regression, k=1500)
select.fit(X_train, y_train)

X_train_ = select.transform(X_train)
X_valid_ = select.transform(X_valid)
X_test_ = select.transform(X_test)

print('Input shape:', X_train_[0].shape)

# Training
model = LinearRegression()
model.fit(X_train_, y_train)

# Evaluation
y_pred = model.predict(X_train_)
# print('MAE:', mae(y_train, y_pred))
# print('MSE:', mse(y_train, y_pred))
# print('R2:', r2_score(y_train, y_pred))
# print('')

# y_pred = model.predict(X_valid_)
# print('MAE:', mae(y_valid, y_pred))
# print('MSE:', mse(y_valid, y_pred))
# print('R2:', r2_score(y_valid, y_pred))
# print('')

y_pred = model.predict(X_test_)
print('MAE:', mae(y_test, y_pred))
print('MSE:', mse(y_test, y_pred))
print('RMSE:', rmse(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Input shape: (1, 1500)
MAE: 0.23814821612663029
MSE: 0.08921124918963719
RMSE: 0.29868252240403553
R2: -0.27698232145918955


In [7]:
# Save model
joblib.dump(model, 'LR_1500_model.abc')
joblib.dump(vectorizer, 'tfidf_vectorizer.abc')
joblib.dump(select, 'selector_1500.abc')

['selector_1500.abc']