In [97]:
import pandas as pd
import numpy as np
import string

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords

%matplotlib inline

In [2]:
# Pretrained word2vec embeddings stored in binary format; the path is relative
# to the notebook's working directory.
word2vec_path = 'word2vec-google-news-300.bin'
wv = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [11]:
def sentence_to_vector(doc):
    """Embed a tokenised document as the mean of its word2vec vectors.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document. Tokens that are missing from the ``wv``
        vocabulary are skipped.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``: the element-wise mean of the
        vectors of the in-vocabulary tokens, or all zeros when none of the
        tokens is in the vocabulary (this includes an empty ``doc``).
    """
    known = [wv.get_vector(word) for word in doc if wv.has_index_for(word)]
    if not known:
        # Neutral fallback. Returning wv.vectors[0] instead would hand back the
        # embedding of whichever word is stored first in the model, so a
        # document without known tokens would look like it contained that word.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)

    return np.mean(known, axis=0)

In [72]:
# Load data from csv
df = pd.read_csv('goodreads-300k-dataset/keywords.csv')
test_df = pd.read_csv('test.csv', index_col=False)


def _join_title_description(frame):
    """Return a Series holding "<title> <description>" for every row of ``frame``.

    Missing values are treated as empty strings and non-string values are
    converted with ``str`` so the concatenation cannot raise a TypeError.
    """
    title = frame['title'].fillna('').astype(str)
    description = frame['description'].fillna('').astype(str)
    return title + ' ' + description


def _tokenize(text):
    """Apply the punctuation and stopword filters to ``text`` via preprocess_string."""
    return preprocess_string(text, [strip_punctuation, remove_stopwords])


# Combine title and description into one text column. No lowercasing filter is
# applied anywhere in this cell, so the original casing is kept.
df['title_description'] = _join_title_description(df)
test_df['title_description'] = _join_title_description(test_df)

# Remove non-ascii rows from the training data. The .copy() makes the result an
# independent frame, so the column assignment below does not target a slice.
df = df[df['title_description'].map(str.isascii)].copy()

# Remove punctuation and stopwords
df['title_description'] = df['title_description'].apply(_tokenize)
test_df['title_description'] = test_df['title_description'].apply(_tokenize)

  df = pd.read_csv('goodreads-300k-dataset/keywords.csv')


In [73]:
# Only rows with at least this many ratings are used for training/validation.
rating_count_split = 1000
has_enough_ratings = df['rating_count'] >= rating_count_split

# 80/20 split with a fixed seed so the partition is repeatable.
train_df, valid_df = train_test_split(
    df[has_enough_ratings],
    test_size=0.2,
    random_state=1,
)

for label, frame in (('Train:', train_df), ('Valid:', valid_df), ('Test:', test_df)):
    print(label, frame.shape)

# Disabled alternative: TF-IDF features instead of averaged word vectors.
# vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english')
# vectorizer.fit(df.title_description)

Train: (39910, 13)
Valid: (9978, 13)
Test: (51, 6)


In [100]:
# Disabled alternative: features from a fitted text vectorizer.
# X_train = vectorizer.transform(train_df.title_description)
# X_valid = vectorizer.transform(valid_df.title_description)
# X_test = vectorizer.transform(test_df.title_description)

# One averaged word-vector row per document, stacked into a 2-D matrix per split.
X_train, X_valid, X_test = (
    np.vstack([sentence_to_vector(tokens) for tokens in frame['title_description'].values])
    for frame in (train_df, valid_df, test_df)
)

# Regression targets: the rating column of each split.
y_train = train_df['rating'].values
y_valid = valid_df['rating'].values
y_test = test_df['rating'].values

# Ordinary least squares on the averaged embeddings.
model = LinearRegression()
model.fit(X_train, y_train)

In [99]:
def _report_errors(features, targets):
    """Predict with ``model`` and print the MAE plus the extreme signed errors.

    Prints the mean absolute error followed by the minimum and maximum of
    (prediction - target). Returns the predictions and the MAE.
    """
    predictions = model.predict(features)
    loss = mae(targets, predictions)
    residuals = predictions - targets
    print('mae loss:', loss)
    print('min:', residuals.min())
    print('max:', residuals.max())
    return predictions, loss


# Same report for each split, in train / valid / test order; y_pred ends up
# holding the test-set predictions.
y_pred, train_loss = _report_errors(X_train, y_train)
y_pred, valid_loss = _report_errors(X_valid, y_valid)
y_pred, test_loss = _report_errors(X_test, y_test)

mae loss: 0.17933610944692965
min: -0.868666286468506
max: 1.3581276035308836
mae loss: 0.18056245510696195
min: -0.7305306339263913
max: 1.3847862243652345
mae loss: 0.3234898503621419
min: -0.7175090026855466
max: 0.5639478778839111


In [76]:
# Percentage of fitted coefficients that are exactly zero.
coef_is_zero = model.coef_ == 0
n_coefs = model.coef_.shape[0]
zero_coef_pct = coef_is_zero.sum() / n_coefs * 100
print(zero_coef_pct)

0.0


In [77]:
# Re-read the test set from disk and rebuild its features and targets.
test_df = pd.read_csv('test.csv', index_col=False)

# Join title and description with a single space. Missing values become ''
# and non-string values are converted with str, so the concatenation cannot
# raise a TypeError.
test_df['title_description'] = (
    test_df['title'].fillna('').astype(str)
    + ' '
    + test_df['description'].fillna('').astype(str)
)

# Remove punctuation and stopwords
test_df['title_description'] = test_df['title_description'].apply(
    lambda text: preprocess_string(text, [strip_punctuation, remove_stopwords])
)

# One averaged word-vector row per test document, plus the rating targets.
X_test = np.vstack([sentence_to_vector(tokens) for tokens in test_df['title_description'].values])
y_test = test_df['rating'].values

In [94]:
# Predict on the training features again; this overwrites the y_pred left over
# from earlier cells so its range can be inspected next.
y_pred = model.predict(X_train)

In [95]:
# Smallest and largest predicted value.
# NOTE(review): the recorded output shows a maximum above 5; if ratings are
# bounded at 5, consider clipping predictions. Confirm the rating scale.
np.min(y_pred), np.max(y_pred)

(3.4451265, 5.0342426)