In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

import mummify

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
STARS_PATH = '../data/stars.csv'
df = pd.read_csv(STARS_PATH)

In [3]:
# Drop one specific repo and keep only Python / Jupyter Notebook rows.
df = df[df['repo'] != 'maxhumber/gazpacho']
df = df[df['language'].isin(['Python', 'Jupyter Notebook'])]

# Count rows per repo. The column is named explicitly so `popular['repo']`
# resolves on every pandas version: since pandas 2.0 `value_counts()` returns
# a Series named 'count', so wrapping it in `pd.DataFrame(...)` no longer
# yields a 'repo' column and the filter below would raise KeyError.
popular = df['repo'].value_counts().to_frame(name='repo')

# Keep only repos that occur at least 5 times.
select_repos = popular[popular['repo'] >= 5].index.tolist()
df = df[df['repo'].isin(select_repos)]

# Collapse to one row per user: that user's repos joined into a single
# comma-separated string, in a one-column frame ('repo') indexed by user.
df = df.groupby(['user'])['repo'].apply(','.join).to_frame()

In [4]:
class NNRecommender:
    """Nearest-neighbour recommender over comma-separated item strings.

    Each training sample is one string of items joined by commas. Samples are
    vectorised with a ``CountVectorizer`` and indexed with ``NearestNeighbors``;
    a prediction is the union of the items found in the nearest training
    samples.

    Parameters
    ----------
    n_neighbors : int, default 10
        Passed to ``NearestNeighbors``.
    max_features : int, default 1000
        Passed to ``CountVectorizer``.
    tokenizer : callable, default splits on ','
        Passed to ``CountVectorizer`` to turn one sample string into tokens.
        NOTE(review): ``predict`` always splits stored samples on ',' and does
        not use this callable -- confirm that is intended for custom tokenizers.
    """

    def __init__(self, n_neighbors=10, max_features=1000, tokenizer=lambda x: x.split(',')):
        self.cv = CountVectorizer(tokenizer=tokenizer, max_features=max_features)
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, X):
        """Vectorise ``X`` and fit the neighbour index on it.

        ``X`` is kept on the instance as ``self.X`` because ``predict`` reads
        the raw strings back with ``.iloc``; it therefore has to support
        positional ``.iloc`` access (e.g. a pandas Series).

        Returns ``self`` so calls can be chained.
        """
        self.X = X
        X = self.cv.fit_transform(X)
        self.nn.fit(X)
        return self

    def predict(self, X):
        """Return, for each query string in ``X``, the items of its neighbours.

        Each result is a list of unique items collected from the neighbouring
        training samples. Items are ordered by first appearance while walking
        the neighbours in the order ``kneighbors`` returns them, so the output
        is identical from run to run.
        """
        Xt = self.cv.transform(X)
        _, neighbors = self.nn.kneighbors(Xt)
        points = []
        for n in neighbors:
            repos = []
            for ni in n:
                repos.extend(self.X.iloc[int(ni)].split(','))
            # dict.fromkeys removes duplicates but keeps insertion order;
            # list(set(...)) produced an order that changes between sessions.
            points.append(list(dict.fromkeys(repos)))
        return points

In [5]:
# Hyperparameters for this run; they stay module-level names because the
# logging cell further down formats them into its message.
n_neighbors = 10
max_features = 500

model = NNRecommender(n_neighbors=n_neighbors, max_features=max_features)
model.fit(df['repo'])

# A single query: one comma-separated string of repo names.
query = 'TheAlgorithms/Python,jackfrued/Python-100-Days,hzwer/ICCV2019-LearningToPaint,deepfakes/faceswap'
model.predict([query])

[['ytdl-org/youtube-dl',
  'sloria/TextBlob',
  'vinta/awesome-python',
  'minimaxir/big-list-of-naughty-strings',
  'PySimpleGUI/PySimpleGUI',
  'scikit-learn/scikit-learn',
  'donnemartin/system-design-primer',
  'donnemartin/interactive-coding-challenges',
  'fastai/fastai',
  'TheAlgorithms/Python',
  'deepfakes/faceswap',
  'jackfrued/Python-100-Days',
  'Avik-Jain/100-Days-Of-ML-Code',
  'hzwer/ICCV2019-LearningToPaint']]

In [6]:
# Hand this run's hyperparameter settings to mummify as the log message.
run_summary = f'n_neighbors={n_neighbors}, max_features={max_features}'
mummify.log(run_summary)

Nothing changed, nothing logged
