In [1]:
'''
Build k-nearest neighbor clustering model to help identify similarities in groups.

Ex: Cluster wiki articles based on topic to present to user as 'simiar' 
articles.

used basic word count vector and tf-idf methodologies to model text
'''

import os
import numpy as np
import pandas as pd
#import dask.dataframe as dd

from scipy.sparse import csr_matrix
#from scipy.sparse import csc_matrix

from sklearn.neighbors import NearestNeighbors

In [2]:
wiki_data = pd.read_csv('../data/Wikipedia/people_wiki.csv')
#wiki_data = dd.read_csv('../data/Wikipedia/people_wiki.csv')

wiki_data.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
#word_count = np.load(r'../data/Wikipedia/people_wiki_word_count.npz')
tf_idf = np.load(r'../data/Wikipedia/people_wiki_tf_idf.npz')

#word_count.files
tf_idf.files

['indices', 'indptr', 'shape', 'data']

In [4]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    data = loader['data']
    indices = loader['indices']
    indptr = loader['indptr']
    shape = loader['shape']
    
    return csr_matrix((data, indices, indptr), shape, dtype= 'int32')

In [5]:
#word_count = load_sparse_csr(r'../data/Wikipedia/people_wiki_word_count.npz').toarray()
tf_idf = load_sparse_csr(r'../data/Wikipedia/people_wiki_tf_idf.npz').toarray()

In [6]:
#word_count_sample = word_count[:8000]
tf_idf_sample = tf_idf[:8000]

In [7]:
model = NearestNeighbors(metric= 'euclidean', algorithm= 'brute')

#model.fit(word_count_sample)
model.fit(tf_idf_sample)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [8]:
wiki_data[wiki_data['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [9]:
#word_count_sample.shape

In [10]:
#distances, indicies = model.kneighbors(word_count_sample[[816]], n_neighbors= 10)
distances, indicies = model.kneighbors(tf_idf_sample[[816]], n_neighbors= 10)

In [11]:
distances

array([[  0.        ,  65.77993615,  65.88626564,  67.50555533,
         67.867518  ,  68.08817812,  68.08817812,  68.70953355,
         68.9710084 ,  69.0869018 ]])

In [12]:
indicies

array([[ 816, 3116, 7653, 3765, 7419, 1699, 3990, 4118, 5620, 6707]])

In [13]:
neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indicies.flatten()})

In [14]:
neighbors.head()

Unnamed: 0,distance,id
0,0.0,816
1,65.779936,3116
2,65.886266,7653
3,67.505555,3765
4,67.867518,7419


In [15]:
full_data = pd.merge(neighbors, 
                     wiki_data, 
                     how= 'inner', 
                     left_on='id', 
                     right_index=True)

In [16]:
full_data[['id', 'distance', 'name', 'text']].head()

Unnamed: 0,id,distance,name,text
0,816,0.0,Bruce Reynolds (TV personality),for the great train robber see bruce reynoldsb...
1,3116,65.779936,Bruce Parker,bruce parker born 20 july 1941 is a british jo...
2,7653,65.886266,Rodney Marsh (footballer),rodney william marsh born 11 october 1944 is a...
3,3765,67.505555,Tessa Ross,tessa sarah ross cbe born 1961 is a bafta awar...
4,7419,67.867518,Ken Brown (golfer),ken brown born 9 january 1957 is a scottish fo...
