In [2]:
import pandas as pd
import numpy as np
import string

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from scipy.sparse import hstack
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords

%matplotlib inline

In [2]:
# Load pre-trained word2vec vectors from a binary-format file into the
# module-level `wv`, which sentence_to_vector() reads for its lookups.
# NOTE(review): the file name suggests the 300-dimensional Google News model — confirm.
wv = KeyedVectors.load_word2vec_format('word2vec-google-news-300.bin', binary=True)

In [3]:
def sentence_to_vector(doc, keyed_vectors=None):
    """Embed a tokenised document as the mean of its known word vectors.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document. Tokens that are not in the vocabulary are
        skipped.
    keyed_vectors : optional
        Object exposing ``has_index_for``, ``get_vector``, ``vector_size``
        and ``vectors`` (the gensim ``KeyedVectors`` interface). Defaults to
        the module-level ``wv``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``: the element-wise mean of the
        vectors of all in-vocabulary tokens, or an all-zero vector when no
        token is in the vocabulary (including an empty ``doc``).
    """
    kv = wv if keyed_vectors is None else keyed_vectors
    known = [kv.get_vector(word) for word in doc if kv.has_index_for(word)]
    if not known:
        # A neutral zero vector marks "no usable tokens". Returning
        # vectors[0] instead would hand back the embedding of whichever
        # word happens to be first in the vocabulary.
        return np.zeros(kv.vector_size, dtype=kv.vectors.dtype)

    return np.mean(known, axis=0)

In [4]:
# Load the training corpus and the held-out test set from CSV.
df = pd.read_csv('goodreads-300k-dataset/keywords.csv')
test_df = pd.read_csv('test.csv', index_col=False)


def _join_title_description(frame):
    """Return 'title description' per row; missing cells count as empty text."""
    # fillna/astype keep the join from raising on NaN or non-string cells.
    title = frame['title'].fillna('').astype(str)
    description = frame['description'].fillna('').astype(str)
    return title + ' ' + description


def _tokenize(text):
    """Strip punctuation and stopwords from ``text`` and return its tokens."""
    return preprocess_string(text, [strip_punctuation, remove_stopwords])


# Combine title and description into one text field.
# No lowercasing happens here: only the two filters in _tokenize are applied.
df['title_description'] = _join_title_description(df)
test_df['title_description'] = _join_title_description(test_df)

# Drop training rows containing non-ASCII text (test rows are not filtered).
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the original frame.
df = df[df['title_description'].map(str.isascii)].copy()

# Remove punctuation and stopwords; each cell becomes a list of tokens.
df['title_description'] = df['title_description'].apply(_tokenize)
test_df['title_description'] = test_df['title_description'].apply(_tokenize)

# Bin the continuous rating into ordinal class labels 0..len(bins)-2.
# include_lowest=True closes the first interval on the left, so a rating
# equal to bins[0] falls in bin 0 instead of becoming NaN.
bins = (0, 3, 3.5, 4, 4.5, 5)
labels = np.arange(len(bins)-1)
df['rating'] = pd.cut(df['rating'], bins=bins, labels=labels, include_lowest=True)
test_df['rating'] = pd.cut(test_df['rating'], bins=bins, labels=labels, include_lowest=True)

  df = pd.read_csv('goodreads-300k-dataset/keywords.csv')


In [6]:
# Minimum number of ratings a book needs to be used for training/validation.
rating_count_split = 1000

# Filter on the rating count, then make a seeded 80/20 train/validation split.
train_df, valid_df = train_test_split(
    df[df['rating_count'] >= rating_count_split],
    test_size=0.2,
    random_state=1,
)

# Report the size of every partition.
for tag, frame in (('Train:', train_df), ('Valid:', valid_df), ('Test:', test_df)):
    print(tag, frame.shape)

# Disabled alternative: TF-IDF features fitted on the full corpus.
# vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english')
# vectorizer.fit(df.title_description)

Train: (39910, 13)
Valid: (9978, 13)
Test: (51, 6)


In [7]:
# Disabled alternative: TF-IDF features from a fitted `vectorizer`.
# X_train = vectorizer.transform(train_df.title_description)
# X_valid = vectorizer.transform(valid_df.title_description)
# X_test = vectorizer.transform(test_df.title_description)


def _embed_frame(frame):
    """Stack one sentence_to_vector() row per document of ``frame``."""
    return np.vstack([sentence_to_vector(tokens) for tokens in frame.title_description.values])


# Feature matrices (one averaged embedding per document) and binned-rating targets.
X_train, X_valid, X_test = (_embed_frame(part) for part in (train_df, valid_df, test_df))
y_train, y_valid, y_test = (part.rating.values for part in (train_df, valid_df, test_df))

# One-vs-rest wrapper around an SVC with default hyper-parameters.
model = OneVsRestClassifier(SVC())
model.fit(X_train, y_train)

In [None]:
# Report plain accuracy on every split.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens
# to be symmetric, but passing them in the documented order keeps the cell
# correct if the metric is swapped for a non-symmetric one such as
# balanced_accuracy_score.
for split_label, X_split, y_true in (
    ('train acc:', X_train, y_train),
    ('valid acc:', X_valid, y_valid),
    ('test acc:', X_test, y_test),
):
    # y_pred is left holding the test-set predictions after the loop.
    y_pred = model.predict(X_split)
    print(split_label, accuracy_score(y_true, y_pred))

In [None]:
# Percentage of model coefficients that are exactly zero.
# Not every estimator exposes coef_ (an SVC with a non-linear kernel does
# not), so look it up defensively instead of raising AttributeError.
coef = getattr(model, 'coef_', None)
if coef is None:
    print('model has no coef_ attribute; coefficient sparsity is not available')
else:
    # Divide by the total number of coefficients (rows * columns); dividing
    # by shape[0] alone would not yield a percentage.
    print((coef == 0).sum() / np.prod(coef.shape) * 100)

In [None]:
# Reload the test set from disk and rebuild its features and labels.
test_df = pd.read_csv('test.csv', index_col=False)

# Join title and description; missing cells count as empty text so the
# concatenation cannot raise on NaN or non-string values.
test_df['title_description'] = (
    test_df['title'].fillna('').astype(str)
    + ' '
    + test_df['description'].fillna('').astype(str)
)

# Strip punctuation and stopwords; each cell becomes a list of tokens.
test_df['title_description'] = test_df['title_description'].apply(
    lambda text: preprocess_string(text, [strip_punctuation, remove_stopwords])
)
X_test = np.vstack([sentence_to_vector(sentence) for sentence in test_df['title_description'].values])

# Bin the ratings with the same `bins`/`labels` used for training, so y_test
# is on the same class-label scale as the model's predictions.
y_test = pd.cut(test_df['rating'], bins=bins, labels=labels, include_lowest=True).values

In [None]:
# Predict a label for every row of the current X_test with the fitted model.
y_pred = model.predict(X_test)

In [None]:
# Display the predictions as the cell's output.
y_pred

In [3]:
import cudf

In [5]:
# Read test.csv with cuDF.
# NOTE(review): this rebinds `df`, the name earlier cells use for the training
# frame loaded from keywords.csv; re-running those cells afterwards would
# pick up this object instead.
# NOTE(review): the recorded output for this cell is a fatal CUDA error
# (801 cudaErrorNotSupported), so the call did not succeed in that run —
# confirm the cell is still wanted.
df = cudf.read_csv('test.csv')

RuntimeError: Fatal CUDA error encountered at: /project/cpp/include/cudf/detail/utilities/vector_factories.hpp:103: 801 cudaErrorNotSupported operation not supported