#### Spotlight Quickstart

In [1]:
import numpy as np

from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [2]:
# Load the MovieLens 100K interactions.
dataset = get_movielens_dataset(variant='100K')

# Seed the split so that re-running the notebook reproduces the same
# train/test partition (the later GitHub-stars split is seeded the same way).
train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

In [3]:
# Show the repr of the training split (users x items x interactions).
train

<Interactions dataset (944 users x 1683 items x 80000 interactions)>

In [4]:
# Inspect the user_ids array of the training split.
train.user_ids

array([140, 465, 577, ..., 483, 833, 276], dtype=int32)

In [5]:
# Inspect the item_ids array of the training split.
train.item_ids

array([325, 202, 194, ..., 510, 429, 383], dtype=int32)

In [6]:
# Inspect the ratings array of the training split.
train.ratings

array([3., 4., 4., ..., 3., 3., 2.], dtype=float32)

In [7]:
# Convert the training split to a CSR sparse matrix and densify it for inspection.
train.tocsr().todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 3., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 5., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [8]:
# Fit an implicit factorization model with the BPR loss (n_iter=3).
# A fixed random_state makes the fit, and therefore the MRR below, repeatable.
model = ImplicitFactorizationModel(n_iter=3,
                                   loss='bpr',
                                   random_state=np.random.RandomState(42))
model.fit(train)

# mrr_score returns an array of scores (presumably one per test user -- confirm
# against the spotlight docs); the next cell looks at the first ten.
mrr = mrr_score(model, test)

In [9]:
# Peek at the first ten MRR values.
mrr[:10]

array([0.03674924, 0.01519986, 0.05784659, 0.04928692, 0.01246693,
       0.01152734, 0.01039997, 0.0712537 , 0.02532142, 0.00670062])

### GitHub stars

In [10]:
import pandas as pd 

# Load the stars table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/stars.csv')

In [11]:
# Show five random rows; the seed keeps the displayed sample stable across runs.
df.sample(5, random_state=42)

Unnamed: 0,user,repo,description,language,stargazers
37673,jocobtt,linkerd/linkerd2,A service mesh for Kubernetes and beyond. Main...,Go,5316
1346,feevos,lyakaap/VAT-pytorch,Virtual Adversarial Training (VAT) implementat...,Python,147
18418,MagicSword,myles/awesome-static-generators,A curated list of static web site generators.,,1085
34725,robcowie,JuliaStrings/utf8proc,a clean C library for processing UTF-8 Unicode...,C,507
2024,owen800q,jitsi/jitsi-meet,"Jitsi Meet - Secure, Simple and Scalable Video...",JavaScript,5172


In [12]:
# Number of rows per language, most frequent first.
df['language'].value_counts()

Python              25702
JavaScript           6459
Jupyter Notebook     3777
Go                   2571
C++                  2544
                    ...  
GAMS                    1
F*                      1
Squirrel                1
Mercury                 1
Vim Snippet             1
Name: language, Length: 178, dtype: int64

In [15]:
# Keep only Python repositories and drop two specific repos
# (NOTE(review): presumably the author's own projects -- confirm the reason).
EXCLUDED_REPOS = ['maxhumber/gif', 'maxhumber/gazpacho']
is_python = df['language'] == 'Python'
is_excluded = df['repo'].isin(EXCLUDED_REPOS)

# Build the result with a single .loc selection and take an explicit copy so
# that later column assignments on df do not raise SettingWithCopyWarning.
df = df.loc[is_python & ~is_excluded].copy()

In [16]:
# Shape of df as (rows, columns).
df.shape

(25380, 5)

In [17]:
# Count the distinct repositories in df.
df['repo'].unique().size

12222

In [18]:
# Count the distinct users in df.
df['user'].unique().size

326

In [21]:
# Preview the first rows of df.
df.head()

Unnamed: 0,user,repo,description,language,stargazers
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160
5,sbarman-mi9,micheles/plac,Plac: Parsing the Command Line the Easy Way,Python,171
6,sbarman-mi9,alontalmor/oLMpics,,Python,16


In [22]:
# Every row gets a constant rating of 1. assign() returns a new frame, which
# avoids SettingWithCopyWarning on the filtered df and keeps the cell idempotent.
df = df.assign(rating=1)

In [23]:
from spotlight.interactions import Interactions

In [25]:
# Demonstration: passing the raw string columns to Interactions raises a
# TypeError. The error is caught and printed so that "Restart & Run All" is not
# halted here; the ids are label-encoded to integers in the cells that follow.
try:
    interactions = Interactions(df['user'], df['repo'])
except TypeError as exc:
    print('TypeError:', exc)

TypeError: must be str, not int

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
# One encoder per id space: usernames and repo names are mapped independently.
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()

In [28]:
# Learn the label -> integer mapping for each column, then apply it.
users = user_encoder.fit(df['user']).transform(df['user'])
items = item_encoder.fit(df['repo']).transform(df['repo'])

In [29]:
# Build the interactions dataset from the integer-encoded user and repo ids.
interactions = Interactions(user_ids=users, item_ids=items)

In [30]:
# Show the repr of the interactions dataset (users x items x interactions).
interactions

<Interactions dataset (326 users x 12222 items x 25380 interactions)>

In [32]:
import numpy as np
from spotlight.cross_validation import random_train_test_split

# Hold out 20% of the interactions; the fixed seed keeps the split repeatable.
split_rng = np.random.RandomState(42)
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=split_rng,
)

In [57]:
# Pointwise-loss implicit factorization model for the star data (n_iter=20).
# The split above is seeded; seeding the model as well makes the fit and the
# downstream predictions/metrics repeatable.
model = ImplicitFactorizationModel(loss='pointwise',
                                   n_iter=20,
                                   random_state=np.random.RandomState(42))

In [58]:
# Fit the model on the training split.
model.fit(train)

In [37]:
# Preview df again; it now includes the constant rating column.
df.head()

Unnamed: 0,user,repo,description,language,stargazers,rating
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97,1
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21,1
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160,1
5,sbarman-mi9,micheles/plac,Plac: Parsing the Command Line the Easy Way,Python,171,1
6,sbarman-mi9,alontalmor/oLMpics,,Python,16,1


In [38]:
# Rows of df belonging to the user 'garrrychan'.
garrry = df.loc[df['user'].eq('garrrychan')]

In [40]:
# Repos recorded for this user in the stars data, as an array.
garrry['repo'].values

array(['mnielsen/neural-networks-and-deep-learning',
       'brendan-rius/jupyter-c-kernel', 'google-research/uda',
       'hhatto/autopep8', 'MaxHalford/prince',
       'scikit-learn-contrib/sklearn-pandas', 'modin-project/modin',
       'VikParuchuri/apartment-finder', 'uber-research/parallax',
       'prabhupant/python-ds', 'tensorflow/nmt',
       'huggingface/transformers', 'h5py/h5py', 'google-research/bert',
       'sloria/TextBlob', 'openai/gpt-2', 'matsui528/rii',
       'ResidentMario/missingno', 'lmcinnes/enstop',
       'phatpiglet/autocorrect', 'barrust/pyspellchecker',
       'seatgeek/fuzzywuzzy', 'maxhumber/chart', 'apache/airflow',
       'graphql-python/graphene', 'maxhumber/marc', 'spotify/luigi',
       'garrrychan/recipe_recommender_system',
       'CamDavidsonPilon/lifetimes', 'maciejkula/spotlight',
       'lyst/lightfm', 'practical-recommender-systems/moviegeek',
       'uwescience/TrafficCruising-DSSG2017'], dtype=object)

In [42]:
# Encode the username to the integer id assigned by user_encoder
# (transform on a one-element list yields a one-element array).
u = user_encoder.transform(['garrrychan'])

In [59]:
# Score every encoded item id for this user.
all_item_ids = np.arange(item_encoder.classes_.shape[0])
preds = model.predict(u, all_item_ids)

In [60]:
# Pair each repo name with its predicted score and list the best first.
ranked = pd.DataFrame({'repo': item_encoder.classes_, 'pred': preds})
ranked.sort_values(by='pred', ascending=False)

Unnamed: 0,repo,pred
11599,vinta/awesome-python,17.426773
9238,psf/black,13.547756
12129,zedr/clean-code-python,12.813071
11504,vaexio/vaex,12.159784
1982,TheAlgorithms/Python,11.827305
...,...,...
8408,negrinho/sane_tikz,-14.042694
4518,dmishin/tsp-solver,-14.309828
7247,leo-editor/leo-editor,-14.476953
122,AlessandroZ/LaZagne,-15.192022


In [53]:
from spotlight.evaluation import precision_recall_score

In [61]:
# Precision and recall at k=10 on the test split. The training split is also
# supplied (presumably so already-seen interactions do not count -- confirm
# against the spotlight docs).
precision, recall = precision_recall_score(model, test, train=train, k=10)

In [62]:
# Mean of the precision values computed with k=10.
precision.mean()

0.0303921568627451

In [63]:
# Mean of the recall values computed with k=10.
recall.mean()

0.021713329128783685

In [None]:
import torch 

# Serialize the fitted model object to 'model.spot' in the working directory.
torch.save(model, 'model.spot')

In [None]:
# Drop the in-memory model so the next cell demonstrates restoring it from disk.
del model

In [None]:
# Restore the model object saved above.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Newer PyTorch releases presumably default to weights_only=True, which would
# reject a fully pickled model object -- confirm against the installed version.
model = torch.load('model.spot')

In [None]:
# Look up the integer id that user_encoder assigned to this user.
user_encoder.transform(['RandomOS'])

In [None]:
# Resolve the user id through the encoder rather than hard-coding it, so the
# cell stays correct if the data (and therefore the encoding) changes.
random_os_id = int(user_encoder.transform(['RandomOS'])[0])
model.predict(random_os_id, np.arange(len(item_encoder.classes_)))

In [None]:
# Rank every repo for the user, highest predicted score first. The user id is
# resolved through the encoder instead of being hard-coded.
random_os_id = int(user_encoder.transform(['RandomOS'])[0])
pd.DataFrame({
    'repo': item_encoder.classes_,
    'pred': model.predict(random_os_id, np.arange(len(item_encoder.classes_)))
}).sort_values('pred', ascending=False)