In [1]:
'''
Build nearest neighbor clustering model to help identify similarities in groups.

Ex: Cluster wiki articles based on topic to present to user as 'simiar' 
articles.
'''

import os
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
#from scipy.sparse import csc_matrix

from sklearn.neighbors import NearestNeighbors

In [2]:
wiki_data = pd.read_csv('../data/Wikipedia/people_wiki.csv')
wiki_data.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
word_count = np.load(r'../data/Wikipedia/people_wiki_word_count.npz')
word_count.files

['indices', 'indptr', 'shape', 'data']

In [4]:
word_count['indices']

array([  5877,  92219, 227191, ..., 547976, 547977, 547978], dtype=int32)

In [5]:
word_count['indptr']

array([       0,      139,      286, ..., 10379022, 10379155, 10379283], dtype=int32)

In [6]:
word_count['shape']

array([ 59071, 547979])

In [7]:
word_count['data']

array([ 1,  1,  1, ...,  6,  9, 14])

In [8]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    data = loader['data']
    indices = loader['indices']
    indptr = loader['indptr']
    shape = loader['shape']
    
    return csr_matrix((data, indices, indptr), shape, dtype= 'int32')

In [9]:
word_count = load_sparse_csr(r'../data/Wikipedia/people_wiki_word_count.npz').toarray()

In [10]:
word_count_sample = word_count[:500]

In [11]:
model = NearestNeighbors(metric= 'euclidean', algorithm= 'brute')

model.fit(word_count_sample)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [12]:
wiki_data[wiki_data['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [13]:
word_count_sample.shape

(500, 547979)

In [14]:
word_count_sample.dtype

dtype('int32')

In [15]:
word_count_sample.size

273989500

In [16]:
distances, indicies = model.kneighbors(word_count_sample[[42]], n_neighbors= 10)

In [17]:
distances

array([[  0.        ,  22.95648057,  24.20743687,  24.81934729,
         24.8997992 ,  24.93992783,  25.05992817,  25.0998008 ,
         25.17935662,  25.21904043]])

In [18]:
indicies

array([[ 42, 142,  55, 480, 496, 198, 438, 365, 111, 156]])

In [19]:
neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indicies.flatten()})

In [20]:
neighbors

Unnamed: 0,distance,id
0,0.0,42
1,22.956481,142
2,24.207437,55
3,24.819347,480
4,24.899799,496
5,24.939928,198
6,25.059928,438
7,25.099801,365
8,25.179357,111
9,25.21904,156


In [21]:
wiki_data.join

<bound method DataFrame.join of                                                      URI  \
0            <http://dbpedia.org/resource/Digby_Morrell>   
1           <http://dbpedia.org/resource/Alfred_J._Lewy>   
2            <http://dbpedia.org/resource/Harpdog_Brown>   
3      <http://dbpedia.org/resource/Franz_Rottensteiner>   
4                   <http://dbpedia.org/resource/G-Enka>   
5            <http://dbpedia.org/resource/Sam_Henderson>   
6            <http://dbpedia.org/resource/Aaron_LaCrate>   
7          <http://dbpedia.org/resource/Trevor_Ferguson>   
8             <http://dbpedia.org/resource/Grant_Nelson>   
9             <http://dbpedia.org/resource/Cathy_Caruth>   
10            <http://dbpedia.org/resource/Sophie_Crumb>   
11           <http://dbpedia.org/resource/Jenn_Ashworth>   
12        <http://dbpedia.org/resource/Jonathan_Hoefler>   
13     <http://dbpedia.org/resource/Anthony_Gueterboc...   
14      <http://dbpedia.org/resource/David_Chernushenko>   
15      