In [None]:
!pip install --quiet hdmedians

In [10]:
# File locations used throughout the notebook.
SS_MODEL = "ss/sentencespace"                   # model path handed to the `embed_doc` command
REPO_FILE = "repos.txt"                         # text file read line by line to build `repos`
GITHUB_EMBEDDINGS = "./ss/ss_repo_embeds.txt"   # embeddings file handed to get_embeddings()

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
from lib.utils import *
from copy import deepcopy

import matplotlib.pyplot as plt
# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': [10, 7]})

# Embedding



In [12]:
# Half of 5472100, as an integer (2736050).
# NOTE(review): 5472100 is presumably the total number of rows in GITHUB_EMBEDDINGS and
# `size` the number of rows get_embeddings should load — confirm against lib.utils.
size = 5472100 // 2
embeddings = get_embeddings(GITHUB_EMBEDDINGS, size)

## Get & Embed Readmes

In [13]:
# Read one repository name per line; blank lines are skipped so that no empty
# repository name is handed to get_ai_readmes.
with open(REPO_FILE, 'r') as f:
    repos = [line.strip() for line in f if line.strip()]

# NOTE(review): the meaning of the second argument ('foo') is not visible in this
# notebook — confirm against get_ai_readmes in lib.utils.
readmes = get_ai_readmes(repos, 'foo')

REPO 404: hakimsd9/ksupLogistic
REPO 404: hongyusu/SPIN
REPO 404: lmccalman/reverend
REPO 404: radioactive1014/ode_cmake_PBP
REPO 404: scand109/XPEB
REPO 404: rokon1014/Optimization-Particle-Belief-Propagation
REPO 404: lopezpaz/domain_adaptation_with_copulasREPO 404: luliu31415926/Reinforcement-Learning-for-Optimized-trade-execution

REPO 404: pengsun/ohnn-text-cls
REPO 404: danielhomola/mifs
REPO 404: pengsun/ohnn
REPO 404: AnOtterGithubUser/MVA_project_graph_in_machine_learning
REPO 404: EigenLab/DeepEmbeding
REPO 404: davidandrzej/cvbLDA
REPO 404: hannawallach/uniform-process
REPO 404: sbos/seqddcrp.jl
REPO 404: tapilab/active-prior
REPO 404: markrogersjr/mlds
REPO 404: ffminx/swell-master
REPO 404: svivek/distro
REPO 404: paramveerdhillon/swell
REPO 404: DrSkippy/Python-DP-Means-Clustering
REPO 404: jhalcrow/random_stuff
REPO 404: LFY/ssmt
REPO 404: kastnerkyle/vrnn-samples
REPO 404: xlou/BOT
REPO 404: ffagan/Analytic_EPESS_for_TMG
REPO 404: Maltliquor/LNP
REPO 404: jiminsong/Spec

KeyboardInterrupt: 

In [None]:
import subprocess

# Feed the README text to the external `embed_doc` command on stdin and collect
# everything it writes to stdout.
# NOTE(review): the pipes are opened in text mode (encoding='utf-8'), so `readmes`
# must be a single str — confirm what get_ai_readmes returns.
with subprocess.Popen(["embed_doc", SS_MODEL],
                      stdin=subprocess.PIPE,
                      stdout=subprocess.PIPE, encoding='utf-8') as p:
    # stderr is not piped, so `e` is always None; it is kept so the cell still
    # defines the same names as before.
    o, e = p.communicate(input=readmes)

# Fail loudly instead of parsing partial or empty output when the command errors out.
if p.returncode != 0:
    raise subprocess.CalledProcessError(p.returncode, p.args, output=o)

vecs = np.array(get_ss_embed(o))

## Visualize Embedded Space

In [None]:
from scipy.spatial.distance import euclidean, cosine
from hdmedians import geomedian, medoid

def distance_from(arr, v):
    """Return the Euclidean distance from every row of ``arr`` to the vector ``v``.

    Parameters
    ----------
    arr : array-like of shape (n, d)
        One point per row.
    v : array-like of shape (d,)
        Reference point.

    Returns
    -------
    numpy.ndarray of shape (n,)
        ``out[i]`` is the Euclidean distance between ``arr[i]`` and ``v``.
        An input with zero rows yields an empty array.
    """
    # Row-wise norm of the difference: same value as calling scipy's `euclidean`
    # on each row, but computed in one vectorised step and defined for n == 0.
    diff = np.asarray(arr) - np.asarray(v)
    return np.linalg.norm(diff, axis=1)

def distances_from_med(vecs, embeddings, med = None):
    """ Generate distance of each embedding from medoid of vecs

    Parameters
    ----------
    vecs : numpy.ndarray
        Reference vectors; only used when ``med`` is not supplied.
        NOTE(review): ``vecs`` is transposed before being passed to ``medoid``,
        presumably because hdmedians expects one observation per column — confirm.
    embeddings : array-like
        Points whose distance to the reference point is computed (one per row).
    med : array-like, optional
        Precomputed reference point. When ``None`` it is computed as
        ``medoid(vecs.T)``.

    Returns
    -------
    The result of ``distance_from(embeddings, med)``: one distance per row of
    ``embeddings``.
    """
    # Explicit None check: `med or ...` raises ValueError for a multi-element
    # ndarray because its truth value is ambiguous.
    if med is None:
        med = np.asarray(medoid(vecs.T))
    return distance_from(embeddings, med)

def pick_local_sample(distances, embedding, sample_size, thresh):
    """Randomly pick rows of ``embedding`` whose distance is below ``thresh``.

    Parameters
    ----------
    distances : numpy.ndarray of shape (n,)
        One distance per row of ``embedding``.
    embedding : numpy.ndarray of shape (n, d)
        Candidate rows to sample from.
    sample_size : int
        Number of rows to draw, without replacement.
    thresh : float
        Only rows with ``distances < thresh`` are eligible.

    Returns
    -------
    numpy.ndarray of shape (sample_size, d)

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when fewer than ``sample_size`` rows are
        eligible.
    """
    # Use the argument that was passed in, not the notebook-level `embeddings` global.
    s = embedding[distances < thresh]
    idx = np.random.choice(s.shape[0], sample_size, replace=False)
    return s[idx]

In [None]:
# Distance of every loaded embedding from the medoid of the README vectors.
distances = distances_from_med(vecs=vecs, embeddings=embeddings)

## One-Class SVM

In [None]:
from sklearn.svm import OneClassSVM

# One-class SVM with an RBF kernel and fixed nu / gamma values.
svm = OneClassSVM(kernel="rbf", nu=0.5, gamma=1e-3)

In [None]:
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding, TSNE

def project(embeddings, vecs, random_state=None):
    """Project ``vecs`` and ``embeddings`` together onto two dimensions with MDS.

    Parameters
    ----------
    embeddings : numpy.ndarray of shape (m, d)
        Background points.
    vecs : numpy.ndarray of shape (k, d)
        Reference points; they are placed first in the combined matrix.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``MDS`` so that the projection can be made reproducible.
        The default (``None``) keeps the previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``k + m`` rows with columns ``x`` and ``y`` (the 2-D coordinates) and
        ``source`` (1.0 for rows that came from ``vecs``, 0.0 for rows that came
        from ``embeddings``).
    """
    mds = MDS(n_components=2, random_state=random_state)
    # Fit both groups in one call so they share the same 2-D space.
    X = np.concatenate([vecs, embeddings])
    manifold = mds.fit_transform(X)
    labels = np.concatenate([np.ones(vecs.shape[0]), np.zeros(embeddings.shape[0])])
    return pd.DataFrame(manifold, columns=['x', 'y']).assign(source=labels)
    
def plot_predictions(model, df):
    """Run ``predict_ai`` on the projected points and draw a labelled scatter plot.

    Parameters
    ----------
    model :
        Estimator handed to ``predict_ai``.
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y`` and ``source`` (``source == 1``
        marks the reference points). The frame is modified in place: the columns
        ``support_vector`` and ``label`` are added or overwritten.

    Returns
    -------
    None. The plot is drawn on the current matplotlib axes.
    """
    X = df[['x', 'y']].values
    vecs = X[df.source == 1]
    # NOTE(review): predict_ai comes from lib.utils. From its use below, `pos` is
    # presumably a 2-D array whose first column holds row positions in X, and `mod`
    # a fitted model exposing `support_` — confirm.
    pos,mod = predict_ai(model, vecs, X)

    # Add support vectors
    df['support_vector'] = False
    # NOTE(review): `.loc` is label based, so this assumes df has the default
    # 0..n-1 index and that `support_` indexes rows of X — confirm.
    df.loc[mod.support_ ,'support_vector'] = True

    # Add labels for source
    # dtype=object is required: a plain string array would be '<U6' and silently
    # truncate 'Classified' to 'Classi' on assignment.
    lab = np.array(['Papers' if i == 1 else 'Random' for i in df.source], dtype=object)
    lab[pos[:,0]] = 'Classified'
    df['label'] = lab
    sns.scatterplot(x = 'x', y = 'y', style='source', hue = 'label', data = df, markers = ['P', ','], palette="bright")

In [None]:
# Draw 500 rows (without replacement) among those whose distance is below 1.2.
sample = pick_local_sample(
    distances,
    embeddings,
    sample_size=500,
    thresh=1.2,
)

In [None]:
# 2-D projection of the README vectors together with the local sample.
df = project(embeddings=sample, vecs=vecs)

In [None]:
# Classify the projected points with the SVM and plot the result.
plot_predictions(model=svm, df=df)

In [None]:
# Run the classifier against the full set of loaded embeddings rather than the local sample.
# NOTE(review): predict_ai comes from lib.utils; it returns two values, of which
# only the first is kept here — presumably the positively classified entries; confirm.
pos,_ = predict_ai(svm, vecs, embeddings)
# Show the number of reference vectors next to the number of entries in `pos`.
len(vecs), len(pos)

In [None]:
# NOTE(review): print_random is a lib.utils helper; presumably it prints a random
# subset of the entries in `pos` — confirm.
print_random(pos)

## Evaluation

In [None]:
# Decision-function values of the SVM for each row of the local sample.
# NOTE(review): this requires `svm` to be fitted already, presumably as a side
# effect of the predict_ai calls above — confirm.
svm.decision_function(sample)