In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import mean_absolute_error as mae, r2_score
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from gensim import downloader as api
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords, split_on_space, strip_non_alphanum

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from tqdm import tqdm

%matplotlib inline

# Vectorizer

In [2]:
# Load data from csv
df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')
test_df = pd.read_csv('test.csv', index_col=False)

# Split rating count
rating_count_split = 1000
df = df[df.rating_count >= rating_count_split]

# Combine title and description into one
df['title_description'] = df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)
test_df['title_description'] = test_df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)

# Remove non-ascii rows
df = df[df.title_description.map(lambda x: x.isascii())]

# Convert to lowercase
df.title_description = df.title_description.str.lower()
test_df.title_description = test_df.title_description.str.lower()

# Remove punctuation
df.title_description = df.title_description.apply(lambda x: strip_punctuation(x))
test_df.title_description = test_df.title_description.apply(lambda x: strip_punctuation(x))

# Remove stopwords 
df.title_description = df.title_description.apply(lambda x: remove_stopwords(x))
test_df.title_description = test_df.title_description.apply(lambda x: remove_stopwords(x))

# Remove non-alpha
df.title_description = df.title_description.apply(lambda x: strip_non_alphanum(x))
test_df.title_description = test_df.title_description.apply(lambda x: strip_non_alphanum(x))

  df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')


In [3]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=1)

print('Train:', train_df.shape)
print('Valid:', valid_df.shape)
print('Test:', test_df.shape)

Train: (40512, 11)
Valid: (10128, 11)
Test: (51, 6)


In [4]:
# Word embedding
vectorizer = TfidfVectorizer()
vectorizer.fit(df.title_description)

X_train = vectorizer.transform(train_df.title_description)
X_valid = vectorizer.transform(valid_df.title_description)
X_test = vectorizer.transform(test_df.title_description)

y_train = train_df.rating.values
y_valid = valid_df.rating.values
y_test = test_df.rating.values

print('Input shape:', X_train[0].shape)

Input shape: (1, 108138)


In [5]:
# Feature selection
select = SelectKBest(score_func=f_regression, k=2000)
select.fit(X_train, y_train)

X_train = select.transform(X_train)
X_valid = select.transform(X_valid)
X_test = select.transform(X_test)

print('Input shape:', X_train[0].shape)

Input shape: (1, 2000)


In [6]:
# Training
model = LinearRegression()
model.fit(X_train, y_train)

In [7]:
# Evaluation
y_pred = model.predict(X_train)
print('mae loss:', mae(y_train, y_pred))
print('R2:', r2_score(y_train, y_pred))
# print('min:', (y_pred - y_train).min())
# print('max:', (y_pred - y_train).max())
print('')

y_pred = model.predict(X_valid)
print('mae loss:', mae(y_valid, y_pred))
print('R2:', r2_score(y_valid, y_pred))
# print('min:', (y_pred - y_valid).min())
# print('max:', (y_pred - y_valid).max())
print('')

y_pred = model.predict(X_test)
print('mae loss:', mae(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))
# print('min:', (y_pred - y_test).min())
# print('max:', (y_pred - y_test).max())

mae loss: 0.1557586006512277
R2: 0.3933373647381966

mae loss: 0.17194372037487718
R2: 0.23331070451753144

mae loss: 0.2983513485031425
R2: -0.784601424120958


In [16]:
# wv = KeyedVectors.load_word2vec_format('word2vec-google-news-300.bin', binary=True)
# wv = api.load('fasttext-wiki-news-subwords-300')
wv = api.load('glove-twitter-25')

In [17]:
def sentence_to_vector(doc):
    doc = [wv.get_vector(word) for word in doc if wv.has_index_for(word)]
    if len(doc) == 0:
        return wv.vectors[0]

    return np.mean(doc, axis=0)

In [18]:
# Load data from csv
df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')
test_df = pd.read_csv('test.csv', index_col=False)

# Split rating count
rating_count_split = 1000
df = df[df.rating_count >= rating_count_split]

# Combine title and description into one
df['title_description'] = df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)
test_df['title_description'] = test_df[['title', 'description']].apply(lambda x: ' '.join(x), axis=1)

# Remove non-ascii rows
df = df[df.title_description.map(lambda x: x.isascii())]

# Split token
df.title_description = df.title_description.apply(lambda x: preprocess_string(x, [lambda x: x.lower(), strip_punctuation, remove_stopwords, strip_non_alphanum]))
test_df.title_description = test_df.title_description.apply(lambda x: preprocess_string(x, [lambda x: x.lower(), strip_punctuation, remove_stopwords, strip_non_alphanum]))

  df = pd.read_csv('goodreads-300k-dataset/goodreads.csv')


In [19]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=1, shuffle=False)

print('Train:', train_df.shape)
print('Valid:', valid_df.shape)
print('Test:', test_df.shape)

Train: (40512, 11)
Valid: (10128, 11)
Test: (51, 6)


In [20]:
X_train = np.vstack([sentence_to_vector(sentence) for sentence in train_df.title_description.values])
X_valid = np.vstack([sentence_to_vector(sentence) for sentence in valid_df.title_description.values])
X_test = np.vstack([sentence_to_vector(sentence) for sentence in test_df.title_description.values])

y_train = train_df.rating.values
y_valid = valid_df.rating.values
y_test = test_df.rating.values

model = LinearRegression()
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_train)
print('mae loss:', mae(y_train, y_pred))
print('R2:', r2_score(y_train, y_pred))
print('min:', (y_pred - y_train).min())
print('max:', (y_pred - y_train).max())
print('')

y_pred = model.predict(X_valid)
print('mae loss:', mae(y_valid, y_pred))
print('R2:', r2_score(y_valid, y_pred))
print('min:', (y_pred - y_valid).min())
print('max:', (y_pred - y_valid).max())
print('')

y_pred = model.predict(X_test)
print('mae loss:', mae(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))
print('min:', (y_pred - y_test).min())
print('max:', (y_pred - y_test).max())
print('')

mae loss: 0.19213211407452396
R2: 0.06503635274770203
min: -0.8258985233306886
max: 1.4844633102416993

mae loss: 0.19917783800459585
R2: 0.03063698624730815
min: -0.8102610778808597
max: 1.3751559257507324

mae loss: 0.33520325604607076
R2: -0.9888052337905717
min: -0.7697260093688962
max: 0.572516164779663

