In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances
from sklearn.neighbors import NearestNeighbors
from collections import Counter

In [2]:
# Load the people-wiki dataset from the working directory. The preview below
# shows the columns used later in the notebook: URI, name and text.
wiki = pd.read_csv('people_wiki.csv')
# Show the first rows as a quick sanity check of the load.
wiki.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
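# Disabled experiment: per-article word counts built with collections.Counter.
# Nothing below reads a 'word_count' column; word counts come from CountVectorizer instead.
# counter = Counter()
# wiki['word_count'] = [dict(Counter(x)) for x in wiki['text'].str.split()]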

In [4]:
# Bag-of-words over the whole corpus: `f` is the sparse document-term count
# matrix (one row per article in `wiki`, one column per vocabulary term).
cv = CountVectorizer()
f = cv.fit_transform(wiki.text)

# Summing over axis 0 gives the total count of each term across all articles;
# .tolist()[0] unwraps the single resulting row into a flat list.
features_count = f.sum(axis=0).tolist()[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms in column order.
features_names = cv.get_feature_names_out()

# Term/count table, most frequent terms first.
features = pd.DataFrame(
    {'features': features_names, 'count': features_count}
).sort_values(by=['count'], ascending=False)

In [5]:
# Word counts for the Elton John article only, using a vectorizer fitted on
# just that article's text.
# NOTE: this cell rebinds `features_count`, `features_names` and `features`
# from the corpus-wide cell above; re-run that cell if the corpus table is needed.
cv_ej = CountVectorizer()
ej = cv_ej.fit_transform(wiki.loc[wiki.name=='Elton John', 'text'])

# Column totals, unwrapped from the single resulting row into a flat list.
features_count = ej.sum(axis=0).tolist()[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms in column order.
features_names = cv_ej.get_feature_names_out()

# Term/count table, most frequent terms first.
features = pd.DataFrame(
    {'features': features_names, 'count': features_count}
).sort_values(by=['count'], ascending=False)
features

Unnamed: 0,features,count
227,the,27
135,in,18
42,and,15
176,of,13
118,has,9
121,he,7
146,john,7
177,on,6
209,since,5
106,for,5


In [6]:
# TF-IDF representation of every article, with English stop words removed.
tf = TfidfVectorizer(stop_words='english', smooth_idf=True, use_idf=True)
tfidf_matrix  = tf.fit_transform(wiki.text)

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it
# and still supports the `feature_names[i]` lookups used below.
feature_names = tf.get_feature_names_out()

In [7]:
# Locate the Elton John article and take its row of the TF-IDF matrix.
elton_mask = wiki.name == 'Elton John'
doc = wiki.loc[elton_mask].index.values[0]
doc_row = tfidf_matrix[doc, :]

# Column indices of the terms that carry a non-zero weight in this article.
feature_index = doc_row.nonzero()[1]

# Pair every such column with its weight, then key the weights by term.
tfidf_scores = zip(feature_index, (tfidf_matrix[doc, col] for col in feature_index))
d = {feature_names[col]: [score] for col, score in tfidf_scores}

# One row per term, heaviest terms first.
df = pd.DataFrame.from_dict(d, orient='index', columns=['tfidf'])
df.sort_values(by='tfidf', ascending=False)

Unnamed: 0,tfidf
billboard,0.220815
john,0.217082
elton,0.212174
furnish,0.208194
songwriters,0.137278
award,0.136446
aids,0.127077
million,0.124955
100,0.123607
palace,0.122427


In [8]:
# Cosine similarity between Elton John's TF-IDF vector and two other articles.
elton_vec = tfidf_matrix[wiki.loc[wiki.name=='Elton John'].index.values[0], :]
for other_name in ('Victoria Beckham', 'Paul McCartney'):
    other_vec = tfidf_matrix[wiki.loc[wiki.name==other_name].index.values[0], :]
    print(cosine_similarity(elton_vec, other_vec).flatten())

[0.03407023]
[0.18991373]


In [9]:
# Cosine distance between Elton John's TF-IDF vector and two other articles.
elton_vec = tfidf_matrix[wiki.loc[wiki.name=='Elton John'].index.values[0], :]
for other_name in ('Victoria Beckham', 'Paul McCartney'):
    other_vec = tfidf_matrix[wiki.loc[wiki.name==other_name].index.values[0], :]
    print(cosine_distances(elton_vec, other_vec).flatten())

[0.96592977]
[0.81008627]


In [10]:
def get_closest_neighs(name, matrix, n_neighbors=5):
    """Return the articles closest to `name` under cosine distance.

    Parameters
    ----------
    name : str
        Value of the ``name`` column of the global ``wiki`` frame identifying
        the query article. If several rows match, the first one is used.
    matrix : sparse matrix
        Feature matrix whose rows are aligned positionally with the rows of
        ``wiki`` (e.g. the count or TF-IDF matrix fitted on ``wiki.text``).
    n_neighbors : int, default 5
        Number of neighbours to return. The query article is part of the
        fitted data, so it normally appears first with a distance of ~0.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` (cosine distance to the query) and ``name``,
        one row per neighbour, ordered as returned by ``kneighbors``.

    Raises
    ------
    IndexError
        If no article in ``wiki`` has the given name.
    """
    article_names = wiki['name'].values

    # Work with row *positions* throughout: the matrix rows line up with the
    # physical row order of `wiki`, not with its index labels, so this stays
    # correct even when `wiki` does not carry a default 0..n-1 index.
    positions = np.flatnonzero(article_names == name)
    if positions.size == 0:
        raise IndexError('no article named {!r} in wiki'.format(name))
    row = int(positions[0])

    model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(matrix)
    distances, indices = model.kneighbors(matrix.getrow(row))

    # kneighbors returns 2-D arrays with one row per query; flatten the single row.
    names_similar = article_names[indices.ravel()]
    result = pd.DataFrame({'distance': distances.ravel(), 'name': names_similar})
    return result

In [11]:
# Nearest neighbours of Elton John: raw word counts first, then TF-IDF.
for doc_term_matrix in (f, tfidf_matrix):
    print(get_closest_neighs('Elton John', doc_term_matrix))

       distance            name
0  2.442491e-15      Elton John
1  1.687792e-01   Cliff Richard
2  1.718410e-01  Sandro Petrone
3  1.744907e-01     Rod Stewart
4  1.840130e-01   Roger Daltrey
   distance              name
0  0.000000        Elton John
1  0.704178       Rod Stewart
2  0.715818  Sting (musician)
3  0.724673    George Michael
4  0.724830      Phil Collins


In [12]:
# Nearest neighbours of Victoria Beckham: raw word counts first, then TF-IDF.
for doc_term_matrix in (f, tfidf_matrix):
    print(get_closest_neighs('Victoria Beckham', doc_term_matrix))

       distance                      name
0  3.330669e-16          Victoria Beckham
1  2.115428e-01  Mary Fitzgerald (artist)
2  2.185431e-01            Adrienne Corri
3  2.218932e-01          Beverly Jane Fry
4  2.224486e-01             Raman Mundair
   distance                 name
0  0.000000     Victoria Beckham
1  0.575420        David Beckham
2  0.799198  Stephen Dow Beckham
3  0.812301        Caroline Rush
4  0.814231  Angelique Westerhof
