#### Spotlight

The Quickstart Example

In [1]:
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [2]:
# Fetch the '100K' variant of the MovieLens dataset via Spotlight's helper.
dataset = get_movielens_dataset(variant='100K')

# Randomly split it into train and test sets using the helper's defaults;
# no random_state is passed here, so the split may differ between runs.
train, test = random_train_test_split(dataset)

In [3]:
train

<Interactions dataset (944 users x 1683 items x 80000 interactions)>

Not helpful... need to go deeper:

In [4]:
train.user_ids

array([416, 493, 654, ..., 293, 207, 415], dtype=int32)

In [5]:
train.item_ids

array([313, 881, 250, ..., 210, 997, 323], dtype=int32)

In [6]:
train.ratings

array([5., 1., 1., ..., 3., 1., 2.], dtype=float32)

And peek inside:

In [7]:
train.tocsr().todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 0., ..., 0., 0., 0.],
        [0., 4., 0., ..., 0., 0., 0.],
        ...,
        [0., 5., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

The model:

In [8]:
# Quickstart model: implicit-feedback factorization trained with the BPR loss
# for three iterations on the training split.
model = ImplicitFactorizationModel(loss='bpr', n_iter=3)
model.fit(train)

# Score the fitted model on the held-out split; the result is an array
# (its first ten entries are shown in the next cell).
mrr = mrr_score(model, test)

In [9]:
mrr[:10]

array([0.01792003, 0.07894809, 0.05453556, 0.00520833, 0.00964695,
       0.03831481, 0.01286256, 0.04568794, 0.12909755, 0.0457502 ])

### GitHub Stars

Data retrieved by scraping GitHub:

In [10]:
import pandas as pd 

df = pd.read_csv('data/stars.csv')

In [11]:
df.sample(5)

Unnamed: 0,user,repo,description,language,stargazers
45306,Kiphen,brigadecore/brigade,Event-based Scripting for Kubernetes.,Go,1901
5232,andreztz,brettvanderwerff/Flaskerizer,Automatically create Flask apps from Bootstrap...,JavaScript,116
41667,GarrettMooney,IRkernel/IRkernel,R kernel for Jupyter,Jupyter Notebook,1252
47792,viseshrp,agiliq/building-api-django,,Python,189
55794,StevenLOL,Tencent/PocketFlow,An Automatic Model Compression (AutoMC) framew...,Python,2416


In [12]:
df['language'].value_counts()

Python              25702
JavaScript           6459
Jupyter Notebook     3777
Go                   2571
C++                  2544
                    ...  
Limbo                   1
Vim Snippet             1
Oz                      1
Mercury                 1
Ada                     1
Name: language, Length: 178, dtype: int64

In [13]:
# Keep only Python repositories and drop two specific repos in a single pass.
# The combined mask selects the same rows (same order, same index) as applying
# the two filters one after the other.
df = df[
    (df['language'] == 'Python')
    & ~df['repo'].isin(['maxhumber/gif', 'maxhumber/gazpacho'])
]

In [14]:
df.shape

(25380, 5)

In [15]:
len(df['repo'].unique())

12222

In [16]:
len(df['user'].unique())

326

In [17]:
df.head(3)

Unnamed: 0,user,repo,description,language,stargazers
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160


In [18]:
from spotlight.interactions import Interactions

In [19]:
# won't work: the 'user' and 'repo' columns hold strings, and building
# Interactions from them raises TypeError. The failure is the point of this
# cell, so the expected error is caught and printed instead of being left
# uncaught, which would stop "Restart & Run All" at this cell.
try:
    interactions = Interactions(df['user'], df['repo'])
except TypeError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

TypeError: must be str, not int

"Everything must be a number"

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [24]:
# Encode the string user names and repo names as integer labels. The fitted
# encoders are kept around: later cells call user_encoder.transform(...) to
# look up a user's code and read item_encoder.classes_ to map codes back to
# repo names.
users = user_encoder.fit_transform(df['user'])
items = item_encoder.fit_transform(df['repo'])

In [25]:
interactions = Interactions(users, items)

In [26]:
interactions

<Interactions dataset (326 users x 12222 items x 25380 interactions)>

Be a good Data Scientist:

In [27]:
import numpy as np
from spotlight.cross_validation import random_train_test_split

# Hold out 20% of the interactions for evaluation. A RandomState seeded with
# 42 is passed so the split is the same on every run.
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=np.random.RandomState(42),
)

In [28]:
# Implicit-feedback factorization model with the pointwise loss, trained for
# 20 iterations. A seeded RandomState is passed (same seed as the train/test
# split in the previous cell) so that training is repeatable across
# "Restart & Run All"; without it every run yields different predictions and
# precision/recall numbers.
# NOTE(review): exact repeatability may still depend on the installed
# torch/Spotlight versions and on CPU vs GPU execution — confirm if it matters.
model = ImplicitFactorizationModel(
    loss='pointwise',
    n_iter=20,
    random_state=np.random.RandomState(42),
)

In [29]:
model.fit(train)

In [30]:
df.head(3)

Unnamed: 0,user,repo,description,language,stargazers
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160


Examining one user:

In [31]:
garrry = df[df['user'] == 'garrrychan']

In [32]:
garrry['repo'].values

array(['mnielsen/neural-networks-and-deep-learning',
       'brendan-rius/jupyter-c-kernel', 'google-research/uda',
       'hhatto/autopep8', 'MaxHalford/prince',
       'scikit-learn-contrib/sklearn-pandas', 'modin-project/modin',
       'VikParuchuri/apartment-finder', 'uber-research/parallax',
       'prabhupant/python-ds', 'tensorflow/nmt',
       'huggingface/transformers', 'h5py/h5py', 'google-research/bert',
       'sloria/TextBlob', 'openai/gpt-2', 'matsui528/rii',
       'ResidentMario/missingno', 'lmcinnes/enstop',
       'phatpiglet/autocorrect', 'barrust/pyspellchecker',
       'seatgeek/fuzzywuzzy', 'maxhumber/chart', 'apache/airflow',
       'graphql-python/graphene', 'maxhumber/marc', 'spotify/luigi',
       'garrrychan/recipe_recommender_system',
       'CamDavidsonPilon/lifetimes', 'maciejkula/spotlight',
       'lyst/lightfm', 'practical-recommender-systems/moviegeek',
       'uwescience/TrafficCruising-DSSG2017'], dtype=object)

Looking at the user_id:

In [33]:
u = user_encoder.transform(['garrrychan'])

And all of the items:

In [34]:
item_encoder.classes_

array(['00111000/Imports-in-Python', '05bit/peewee-async',
       '0Kee-Team/WatchAD', ..., 'zzw922cn/Automatic_Speech_Recognition',
       'zzzDavid/ICDAR-2019-SROIE', 'zzzeek/sqlalchemy'], dtype=object)

In [35]:
preds = model.predict(u, np.arange(len(item_encoder.classes_)))

In [36]:
pd.DataFrame({
    'repo': item_encoder.classes_,
    'pred': preds
}).sort_values('pred', ascending=False)

Unnamed: 0,repo,pred
9120,plotly/dash,14.197499
11599,vinta/awesome-python,11.208200
9238,psf/black,8.538667
11292,tqdm/tqdm,8.492222
5597,google/trax,8.418111
...,...,...
8,0voice/interview_internal_reference,-14.476551
11438,uiri/toml,-14.862925
2409,aidlearning/AidLearning-FrameWork,-14.865212
4236,davidteather/TikTok-Api,-15.087893


Evaluating the model:

In [37]:
from spotlight.evaluation import precision_recall_score

In [38]:
precision, recall = precision_recall_score(model, test, train, k=10)

In [39]:
precision.mean()

0.03431372549019608

In [40]:
recall.mean()

0.03234916539212286

Serialize:

In [41]:
import torch 

torch.save(model, 'model.spot')

In [42]:
del model

In [43]:
# Restore the whole model object from the file written by torch.save in the
# serialization cell, after the in-memory copy was deleted.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Newer PyTorch releases may require weights_only=False to load a full-object
# checkpoint like this one — confirm against the installed version.
model = torch.load('model.spot')

Predict on another random user:

In [44]:
u = user_encoder.transform(['RandomOS'])

In [45]:
model.predict(u, np.arange(len(item_encoder.classes_)))

array([-8.888058 , -3.227444 , -7.1882577, ..., -3.6611528, -7.4558845,
       -4.7885914], dtype=float32)

In [46]:
pd.DataFrame({
    'repo': item_encoder.classes_,
    'pred': model.predict(u, np.arange(len(item_encoder.classes_)))
}).sort_values('pred', ascending=False).head(20)

Unnamed: 0,repo,pred
9426,python-poetry/poetry,15.06713
11142,tiangolo/fastapi,14.133257
3927,cookiecutter/cookiecutter,13.093539
9348,pypa/pipenv,12.918298
9446,python/cpython,12.113222
1982,TheAlgorithms/Python,11.538029
10121,satwikkansal/wtfpython,11.168749
33,30-seconds/30-seconds-of-python,11.077389
3929,cool-RR/PySnooper,10.525791
9090,pipxproject/pipx,9.955636


Actual likes:

In [47]:
df[df['user'] == 'RandomOS']['repo']

58885               tortoise/tortoise-orm
58887                 mingrammer/diagrams
58905                       spulec/uncurl
58913         mozilla-iot/webthing-python
58922                       spotify/luigi
58924                    python-trio/trio
58925                       ranger/ranger
58928              googlefonts/noto-emoji
58937                Synss/python-mbedtls
58940                    pipxproject/pipx
58941                       linkedin/shiv
58942                      pantsbuild/pex
58946        PythonCharmers/python-future
58947     alan-turing-institute/CleverCSV
58948                 emeryberger/scalene
58949                 aouinizied/nfstream
58950               ionelmc/python-hunter
58951                      wolever/pip2pi
58956          fossasia/open-event-server
58994                python-poetry/poetry
58997                     thumbor/thumbor
59004                       holoviz/panel
59019             0xInfection/Awesome-WAF
59020    swisskyrepo/PayloadsAllTh

### But what if the user is brand new?

In [48]:
# 'maxhumber' was not among the users the encoder was fitted on, so transform
# raises ValueError. The error is the demonstration: catch and print it so the
# notebook keeps running under "Restart & Run All" and the next cell executes.
try:
    user_encoder.transform(['maxhumber'])
except ValueError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

ValueError: y contains previously unseen labels: ['maxhumber']

In [49]:
# User id 1993 is far beyond the 326 users the model was trained on, so
# predict raises ValueError. Catch and print the expected error rather than
# leaving the notebook ending on an uncaught exception.
try:
    model.predict(1993, np.arange(len(item_encoder.classes_)))
except ValueError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

ValueError: Maximum user id greater than number of users in model.