#### Spotlight

The Quickstart Example

In [None]:
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [None]:
# Load the MovieLens 100K variant through Spotlight's dataset helper.
dataset = get_movielens_dataset(variant='100K')

# Random train/test split with the splitter's default arguments (no seed is
# passed here, so the split is not pinned between runs).
train, test = random_train_test_split(dataset)

In [None]:
# Display the repr of the training split object.
train

Not helpful... need to go deeper:

In [None]:
# Inspect the user ids stored on the training split.
train.user_ids

In [None]:
# Inspect the item ids stored on the training split.
train.item_ids

In [None]:
# Inspect the ratings stored on the training split.
train.ratings

And peek inside:

In [None]:
# Convert the split to CSR form and then to a dense matrix for eyeballing.
# NOTE(review): presumably rows are users and columns are items — confirm.
# Densifying is only reasonable for a small dataset like this one.
train.tocsr().todense()

The model:

In [None]:
# Implicit-feedback factorization model with the BPR loss, n_iter=3.
model = ImplicitFactorizationModel(n_iter=3,
                                   loss='bpr')
model.fit(train)

# Score the fitted model on the held-out split; the result is sliced in the
# next cell, so it is a sequence rather than a single number.
mrr = mrr_score(model, test)

In [None]:
# First ten entries of the MRR result.
# NOTE(review): presumably one score per test user — confirm in Spotlight docs.
mrr[:10]

### GitHub Stars

Data retrieved from scraping GitHub:

In [None]:
import pandas as pd 

# Load the scraped stars table; later cells read its 'user', 'repo' and
# 'language' columns. The path is relative to the notebook's working directory.
df = pd.read_csv('data/stars.csv')

In [None]:
# Peek at five random rows. The sample is seeded so the displayed rows are the
# same on every Restart & Run All (same seed as the train/test split below).
df.sample(5, random_state=42)

In [None]:
# How many rows there are per repository language.
df['language'].value_counts()

In [None]:
# Keep only rows for Python repositories, and drop two hand-picked repos.
is_python = df['language'] == 'Python'
is_excluded = df['repo'].isin(['maxhumber/gif', 'maxhumber/gazpacho'])
df = df[is_python & ~is_excluded]

In [None]:
# (rows, columns) remaining after the filters above.
df.shape

In [None]:
# Number of distinct repos; dropna=False keeps a missing value counted as one
# distinct entry, matching len(unique()).
df['repo'].nunique(dropna=False)

In [None]:
# Number of distinct users; dropna=False keeps a missing value counted as one
# distinct entry, matching len(unique()).
df['user'].nunique(dropna=False)

In [None]:
# First three rows of the filtered table.
df.head(3)

In [None]:
from spotlight.interactions import Interactions

In [None]:
# Deliberate failure: the 'user' and 'repo' columns hold strings, and
# Interactions cannot be built from them. The error is caught and printed so
# the demonstration stays visible without stopping Restart & Run All.
try:
    interactions = Interactions(df['user'], df['repo'])
except (TypeError, ValueError) as err:
    print(type(err).__name__, err, sep=': ')

"Everything must be a number"

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Separate encoders so users and repos each get their own integer code space.
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()

In [None]:
# Fit each encoder on its column and map the string labels to integer codes.
# The fitted encoders are reused later to translate names to ids and back.
users = user_encoder.fit_transform(df['user'])
items = item_encoder.fit_transform(df['repo'])

In [None]:
# Build the interaction set from the integer-encoded user and repo ids.
interactions = Interactions(user_ids=users, item_ids=items)

In [None]:
# Display the repr of the encoded interaction set.
interactions

Be a good Data Scientist:

In [None]:
import numpy as np
from spotlight.cross_validation import random_train_test_split

# Hold out 20% of the interactions for testing, with a fixed seed so the
# split is the same on every run.
split_rng = np.random.RandomState(42)
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=split_rng,
)

In [None]:
# Pointwise-loss implicit factorization model, n_iter=20.
# The model gets its own seeded RandomState so training (and the metrics
# computed below) are reproducible, consistent with the seeded split.
model = ImplicitFactorizationModel(
    loss='pointwise',
    n_iter=20,
    random_state=np.random.RandomState(42),
)

In [None]:
# Train on the training split only; the test split is kept for evaluation.
model.fit(train)

In [None]:
# Reminder of the table's columns before looking up a single user.
df.head(3)

Examining one user:

In [None]:
# All rows belonging to the user 'garrrychan'.
is_garrry = df['user'] == 'garrrychan'
garrry = df.loc[is_garrry]

In [None]:
# The repos in this user's rows, as a raw array.
garrry['repo'].values

Looking at the user_id:

In [None]:
# Map the username to its integer code with the already-fitted encoder.
# transform takes a list, so `u` is a one-element array rather than a scalar.
u = user_encoder.transform(['garrrychan'])

And all of the items:

In [None]:
# Repo names known to the encoder; position i is the label for item id i.
item_encoder.classes_

In [None]:
# Score every known repo (ids 0..n-1) for the selected user.
all_item_ids = np.arange(len(item_encoder.classes_))
preds = model.predict(u, all_item_ids)

In [None]:
# Pair each repo name with its predicted score and list highest scores first.
ranked = pd.DataFrame({'repo': item_encoder.classes_, 'pred': preds})
ranked.sort_values(by='pred', ascending=False)

Evaluating the model:

In [None]:
from spotlight.evaluation import precision_recall_score

In [None]:
# Precision and recall at k=10 on the held-out split; both results are
# averaged with .mean() in the following cells.
# NOTE(review): `train` is presumably passed so already-seen interactions are
# excluded from the ranking — confirm against the Spotlight docs.
precision, recall = precision_recall_score(model, test, train, k=10)

In [None]:
# Mean of the precision@10 values returned above.
precision.mean()

In [None]:
# Mean of the recall@10 values returned above.
recall.mean()

Serialize:

In [None]:
import torch 

# Serialize the whole fitted model object (not just its weights) to
# 'model.spot' in the current working directory.
torch.save(model, 'model.spot')

In [None]:
# Remove the in-memory model so the reload in the next cell is the only
# source of `model` from here on.
del model

In [None]:
# Restore the full model object written by torch.save above.
# NOTE(review): torch.load unpickles arbitrary objects, so only load files you
# created or trust. Newer PyTorch releases default to weights_only=True, which
# rejects whole-model pickles like this one; if the load fails there, pass
# weights_only=False — confirm against the installed torch version.
model = torch.load('model.spot')

Predict on another random user:

In [None]:
# Integer code for a second user; this rebinds `u` from the earlier lookup.
u = user_encoder.transform(['RandomOS'])

In [None]:
# Raw scores from the reloaded model for every known repo id, for this user.
model.predict(u, np.arange(len(item_encoder.classes_)))

In [None]:
# Score every known repo for this user, then show the 20 highest-scoring ones.
scores = model.predict(u, np.arange(len(item_encoder.classes_)))
top_20 = (
    pd.DataFrame({'repo': item_encoder.classes_, 'pred': scores})
    .sort_values(by='pred', ascending=False)
    .head(20)
)
top_20

Actual likes:

In [None]:
# Repos this user actually starred, for comparison with the predictions.
df.loc[df['user'] == 'RandomOS', 'repo']