### The Data

In [10]:
import pandas as pd

In [11]:
# Load the starred-repo table from a path relative to the notebook's working directory.
# NOTE(review): looks like one row per (user, starred repo) pair — confirm against the CSV.
df = pd.read_csv('data/stars.csv')

In [12]:
# Peek at five random rows; no random_state is passed, so the rows shown differ between runs.
df.sample(5)

Unnamed: 0,description,language,repo,stargazers,user
6982,Clize: Turn Python functions into command-line...,Python,epsy/clize,409,DahlitzFlorian
3309,"Blazing fast, instant realtime GraphQL APIs on...",JavaScript,hasura/graphql-engine,12312,mjhea0
15001,Syphon plugin for Unity,Objective-C,keijiro/KlakSyphon,104,periode
4233,Analytic platform for real-time large-scale st...,C++,qminer/qminer,178,davidag
19181,Creates Wheel based archives to allow portable...,Python,cloudify-cosmo/wagon,80,robcowie


### The Cleanup

In [13]:
# Drop one specific repo by name, then keep only repos that occur in at least
# three rows of the remaining data.
df = df[df['repo'] != 'maxhumber/gazpacho']

# Name the counts column explicitly. Wrapping value_counts() in pd.DataFrame()
# relies on the Series name, which pandas 2.0 changed from the source column
# name ('repo') to 'count' -- so the popular['repo'] lookup below would raise
# KeyError on current pandas.
popular = df['repo'].value_counts().to_frame(name='repo')
select_repos = popular[popular['repo'] >= 3].index.tolist()
df = df[df['repo'].isin(select_repos)]

In [14]:
# Inspect a random (unseeded) sample of the rows that survived the filtering above.
df.sample(5)

Unnamed: 0,description,language,repo,stargazers,user
12587,"Context aware, pluggable and customizable data...",Python,microsoft/presidio,789,greed2411
14101,Google Drive CLI Client,Go,gdrive-org/gdrive,5936,kimthostrup
11290,Automatically generate a RESTful API service f...,Python,jeffknupp/sandman2,1401,aplgsd
1007,"A curated list of awesome Go frameworks, libra...",Go,avelino/awesome-go,48551,enzoftware
17891,:computer: An awesome & curated list of best ...,,Awesome-Windows/Awesome,10003,thejustin


### The Preparation

In [16]:
# Collapse to one row per user: each user's repos are concatenated into a
# single comma-separated string, held in a one-column ('repo') DataFrame
# indexed by user.
df = (
    df.groupby(['user'])['repo']
    .apply(','.join)
    .to_frame()
)

In [17]:
# Each row is now one user (the index) with their repos joined into one comma-separated string.
df.sample(5)

Unnamed: 0_level_0,repo
user,Unnamed: 1_level_1
shadymoses,"daumann/ECMAScript-new-features-list,lydiahall..."
AndreWohnsland,"plasticityai/supersqlite,mherrmann/fbs"
edujtm,"servo/servo,flask-restful/flask-restful,encode..."
mazharul-miraz,"taniarascia/takenote,instantpage/instant.page,..."
Liitle,"tiangolo/fastapi,idealo/imagededup,eon01/kuber..."


### The Model

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


class NNRecommender:
    """Nearest-neighbour recommender over comma-separated item strings.

    Every document is a single string of items (``"a,b,c"``). Documents are
    converted to item-count vectors with ``CountVectorizer`` and indexed with
    ``NearestNeighbors``. A prediction for a query document is the union of
    the items found in its nearest training documents.

    Parameters
    ----------
    n_neighbors : int, default 10
        Number of training documents looked up per query.
    max_features : int, default 1000
        Vocabulary size cap forwarded to ``CountVectorizer``.
    tokenizer : callable, default splits on ``','``
        Forwarded to ``CountVectorizer``. NOTE: ``predict`` always splits the
        stored training documents on ``','`` regardless of this argument.
    """

    def __init__(self, n_neighbors=10, max_features=1000, tokenizer=lambda x: x.split(',')):
        self.cv = CountVectorizer(tokenizer=tokenizer, max_features=max_features)
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, X):
        """Vectorise the documents in ``X`` and build the neighbour index.

        ``X`` is kept on the instance unchanged (``self.X``) because
        ``predict`` reads the raw strings back. It must therefore be
        re-iterable (Series, list, array), not a one-shot generator.

        Returns
        -------
        self
        """
        self.X = X
        counts = self.cv.fit_transform(X)
        self.nn.fit(counts)
        return self

    def predict(self, X):
        """Return, for each query document in ``X``, a list of unique items.

        Items are ordered by first appearance while walking the neighbours in
        the order ``kneighbors`` returns them, so the output is deterministic.
        De-duplicating through ``set`` would scramble that order and make it
        vary between interpreter runs.

        Returns
        -------
        list[list[str]]
            One list of items per query document.
        """
        Xt = self.cv.transform(X)
        _, neighbors = self.nn.kneighbors(Xt)
        # Positional lookup that works for a pandas Series (whatever its
        # index labels are) as well as for plain lists and arrays.
        train_docs = list(self.X)
        points = []
        for neighbor_row in neighbors:
            repos = []
            for ni in neighbor_row:
                repos.extend(train_docs[int(ni)].split(','))
            # dict.fromkeys keeps the first occurrence of each item, in order.
            points.append(list(dict.fromkeys(repos)))
        return points

In [24]:
# Train on every user's repo string, then show the recommendations produced
# for the user at position 14.
user_repos = df['repo']
model = NNRecommender().fit(user_repos)
model.predict(user_repos)[14]

['kubernetes/kubernetes',
 'facebook/react',
 'torvalds/linux',
 'burnash/gspread',
 'getify/You-Dont-Know-JS',
 'Avik-Jain/100-Days-Of-ML-Code',
 'firmai/industry-machine-learning',
 'froala/design-blocks',
 'github/gitignore',
 'mherrmann/fbs',
 'plasticityai/supersqlite',
 'vinta/awesome-python',
 'mrdoob/three.js',
 'TheAlgorithms/Python',
 'fastai/fastai',
 'wistbean/learn_python3_spider',
 'sloria/TextBlob',
 'andkret/Cookbook',
 'shengqiangzhang/examples-of-web-crawlers']

### Under the Hood

In [29]:
# Same vectoriser settings the recommender class uses by default: split each
# user's string on commas and cap the vocabulary at 1000 entries.
cv = CountVectorizer(tokenizer=lambda x: x.split(','), max_features=1000)
X = cv.fit_transform(df['repo'])
# Slice the sparse matrix first so only the five displayed rows are
# densified, instead of materialising the full users-by-repos matrix.
X[:5].todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
# Unfitted neighbour index, configured to return 10 neighbours per query.
nn = NearestNeighbors(n_neighbors=10)

In [35]:
# Index the count matrix and query it with no arguments, i.e. look up the
# neighbours of the fitted rows themselves. fit() returns the estimator, so
# the two calls chain.
dist, ind = nn.fit(X).kneighbors()
# Neighbour row positions for the first five fitted rows.
ind[:5]

array([[ 11,  93,  14,   3,  68,  22,  83, 111,   2,  51],
       [ 11,  22,   3,  14,  69,  93,  83,  68,   2,  51],
       [ 11,  93,  14,   3,  68,  22,  83,  69,  51,   4],
       [ 11,  14,  93,  22,   2,  68,  83,  69,  51,  12],
       [ 11,   3,  14,  93,   2,  83,  68,  22,  69,  51]])