In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error as mae, r2_score, mean_squared_error as mse
from sklearn.decomposition import TruncatedSVD
from gensim.parsing.preprocessing import strip_punctuation, remove_stopwords, strip_non_alphanum
from tqdm import tqdm

%matplotlib inline

In [3]:
# Load data from csv
df = pd.read_csv('../datasets/goodreads-300k-dataset/goodreads.csv')

# Keep only rows whose rating_count reaches the threshold.
# .copy() detaches the filtered frame from the raw one so the column
# assignments below operate on a real frame, not on a slice
# (avoids pandas' SettingWithCopyWarning).
rating_count_split = 1000
df = df[df.rating_count >= rating_count_split].copy()

# Combine title and description into one text column.
# Missing values become empty strings first: joining a NaN with a str would
# raise TypeError. Rows without a description therefore keep their title only.
df['title_description'] = (
    df['title'].fillna('').astype(str)
    + ' '
    + df['description'].fillna('').astype(str)
)

# Remove non-ascii rows
df = df[df.title_description.map(str.isascii)].copy()

# Normalise the text in a fixed order:
# lowercase -> strip punctuation -> remove stopwords -> strip non-alphanumerics
df['title_description'] = (
    df['title_description']
    .str.lower()
    .map(strip_punctuation)
    .map(remove_stopwords)
    .map(strip_non_alphanum)
)

  df = pd.read_csv('../datasets/goodreads-300k-dataset/goodreads.csv')


In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, valid_df = train_test_split(df, random_state=1, test_size=0.1)

# Report the (rows, columns) shape of each split.
print(f'Train: {train_df.shape}')
print(f'Valid: {valid_df.shape}')

In [12]:
# TF-IDF features (sparse term weights, not dense word embeddings).
# The vocabulary and IDF weights are learned from the training split only:
# fitting on the full `df` would let validation text influence the features
# and make the validation metrics optimistic (data leakage).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df.title_description)

# Validation text is only transformed with the training vocabulary;
# terms unseen during fitting are ignored.
X_valid = vectorizer.transform(valid_df.title_description)

# Regression targets.
y_train = train_df.rating.values
y_valid = valid_df.rating.values

In [None]:
# Metric histories, one entry per candidate dimensionality in `components`.
train_mae, valid_mae = [], []
train_mse, valid_mse = [], []
train_r2, valid_r2 = [], []

# Candidate numbers of SVD components: 100, 200, ..., 9900.
components = np.arange(100, 10000, 100)

# Fit the decomposition ONCE at the largest size instead of refitting for every
# candidate. components_ are ordered by decreasing singular value, so the first
# k columns of the projection are the rank-k projection. The solver is
# approximate, so metrics may differ marginally from a dedicated rank-k fit.
# A fixed random_state makes the randomized solver reproducible.
svd = TruncatedSVD(n_components=int(components.max()), random_state=1)
svd.fit(X_train)

X_train_svd = svd.transform(X_train)
X_valid_svd = svd.transform(X_valid)

for i in tqdm(components):
    # Leading i components only (a view-like slice; no new decomposition).
    X_train_i = X_train_svd[:, :i]
    X_valid_i = X_valid_svd[:, :i]

    model = LinearRegression()
    model.fit(X_train_i, y_train)

    # Training-set metrics.
    y_pred = model.predict(X_train_i)
    train_mae.append(mae(y_train, y_pred))
    train_mse.append(mse(y_train, y_pred))
    train_r2.append(r2_score(y_train, y_pred))

    # Validation-set metrics.
    y_pred = model.predict(X_valid_i)
    valid_mae.append(mae(y_valid, y_pred))
    valid_mse.append(mse(y_valid, y_pred))
    valid_r2.append(r2_score(y_valid, y_pred))

  0%|                                                                                                                         | 0/99 [00:00<?, ?it/s]

In [None]:
# MAE against the number of SVD components, for both splits.
fig, ax = plt.subplots()
ax.plot(components, train_mae, label='train_mae')
ax.plot(components, valid_mae, label='valid_mae')

# Mark the component count with the lowest validation MAE.
best_idx = int(np.argmin(valid_mae))
x_best, y_best = components[best_idx], valid_mae[best_idx]
ax.plot(x_best, y_best, marker="o", color="green")

ax.set_xlabel('d')
ax.set_ylabel('MAE')
ax.legend()
fig.savefig('TruncatedSVD_MAE.png')

In [None]:
# MSE against the number of SVD components, for both splits.
fig, ax = plt.subplots()
ax.plot(components, train_mse, label='train_mse')
ax.plot(components, valid_mse, label='valid_mse')

# Mark the component count with the lowest validation MSE.
best_idx = int(np.argmin(valid_mse))
x_best, y_best = components[best_idx], valid_mse[best_idx]
ax.plot(x_best, y_best, marker="o", color="green")

ax.set_xlabel('d')
ax.set_ylabel('MSE')
ax.legend()
fig.savefig('TruncatedSVD_MSE.png')

In [None]:
# R-squared against the number of SVD components, for both splits.
fig, ax = plt.subplots()
ax.plot(components, train_r2, label='train_r2')
ax.plot(components, valid_r2, label='valid_r2')

# Mark the component count with the highest validation R-squared.
best_idx = int(np.argmax(valid_r2))
x_best, y_best = components[best_idx], valid_r2[best_idx]
ax.plot(x_best, y_best, marker="o", color="green")

ax.set_xlabel('d')
ax.set_ylabel('R-Squared')
ax.legend()
fig.savefig('TruncatedSVD_R2.png')