# Fire up GraphLab

In [29]:
import graphlab

# Load the data

In [30]:
profiles = graphlab.SFrame('profiles.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [31]:
burak = profiles[profiles['Name'] == "Burak Ergenc"]

In [32]:
burak

Date,Name,Hours of Study,Topic
8/12/16,Burak Ergenc,92.5,"Clustring and Similarity, KNN, Clustring, TF-ITF, ..."


# Create Word Count Vector

In [33]:
profiles['word_count'] = graphlab.text_analytics.count_words(profiles['Topic'])

# Normalize the vector and create Term Frequency * Inverse Document Frequency

In [34]:
tfidf = graphlab.text_analytics.tf_idf(profiles['word_count'])

In [35]:
profiles['tfidf'] = tfidf

In [36]:
burak = profiles[profiles['Name'] == "Burak Ergenc"]

In [37]:
burak[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf']).sort('tfidf')

word,tfidf
+,1.09861228867
data,1.09861228867
"data,",1.38629436112
",",1.38629436112
of,1.38629436112
working,1.38629436112
to,1.79175946923
blog,1.79175946923
"regression,",1.79175946923
algorithms,2.19722457734


# Create the k-Nearest Neighbors Algorithm with tfidf

In [38]:
knn_model = graphlab.nearest_neighbors.create(profiles, features=['tfidf'], label='Name', distance='cosine')

# Find the profiles that are closest to the given person

In [39]:
knn_model.query(burak)

query_label,reference_label,distance,rank
0,Burak Ergenc,1.11022302463e-16,1
0,Hafsa Naciye Celebi,0.892211572682,2
0,Enes Kemal Ergin,0.89271294866,3
0,Almaz Tukenov,0.923932199058,4
0,Behic Guven,0.927440394253,5


In [40]:
almaz = profiles[profiles['Name'] == 'Almaz Tukenov']
knn_model.query(almaz)

query_label,reference_label,distance,rank
0,Almaz Tukenov,0.0,1
0,M. Bedir Tapkan,0.824117182107,2
0,Burak Ergenc,0.923932199058,3
0,Nadide Pasali,0.925850620744,4
0,,1.0,5


In [41]:
aysenur = profiles[profiles['Name'] == 'Aysenur Erdem']
knn_model.query(aysenur)

query_label,reference_label,distance,rank
0,Aysenur Erdem,0.0,1
0,Inamullah Rasuna,0.825398164092,2
0,Hafsa Naciye Celebi,0.862112518475,3
0,Burak Ergenc,0.960655577795,4
0,Enes Kemal Ergin,0.974547118497,5


In [42]:
behic = profiles[profiles['Name'] == 'Behic Guven']
knn_model.query(behic)

query_label,reference_label,distance,rank
0,Behic Guven,-2.22044604925e-16,1
0,Burak Ergenc,0.927440394253,2
0,,1.0,3
0,Almaz Tukenov,1.0,4
0,Hafsa Naciye Celebi,1.0,5


In [43]:
enes = profiles[profiles['Name'] == 'Enes Kemal Ergin']
knn_model.query(enes)

query_label,reference_label,distance,rank
0,Enes Kemal Ergin,1.11022302463e-16,1
0,Medina Colic,0.870719927769,2
0,Burak Ergenc,0.89271294866,3
0,Hafsa Naciye Celebi,0.908833324693,4
0,Aysenur Erdem,0.974547118497,5


In [44]:
hafsa = profiles[profiles['Name'] == 'Hafsa Naciye Celebi']
knn_model.query(hafsa)

query_label,reference_label,distance,rank
0,Hafsa Naciye Celebi,0.0,1
0,Aysenur Erdem,0.862112518475,2
0,Burak Ergenc,0.892211572682,3
0,Inamullah Rasuna,0.895282256364,4
0,Enes Kemal Ergin,0.908833324693,5


In [45]:
bedir = profiles[profiles['Name'] == 'M. Bedir Tapkan']
knn_model.query(bedir)

query_label,reference_label,distance,rank
0,M. Bedir Tapkan,1.11022302463e-16,1
0,Nadide Pasali,0.656463345418,2
0,Almaz Tukenov,0.824117182107,3
0,Islam Kamilov,0.952049083945,4
0,Medina Colic,0.953157817212,5


In [46]:
medina = profiles[profiles['Name'] == 'Medina Colic']
knn_model.query(medina)

query_label,reference_label,distance,rank
0,Medina Colic,0.0,1
0,Enes Kemal Ergin,0.870719927769,2
0,Hafsa Naciye Celebi,0.91784026922,3
0,M. Bedir Tapkan,0.953157817212,4
0,Nadide Pasali,0.960089612056,5


In [47]:
nadide = profiles[profiles['Name'] == 'Nadide Pasali']

In [48]:
knn_model.query(nadide)

query_label,reference_label,distance,rank
0,Nadide Pasali,0.0,1
0,M. Bedir Tapkan,0.656463345418,2
0,Islam Kamilov,0.870852057621,3
0,Almaz Tukenov,0.925850620744,4
0,Medina Colic,0.960089612056,5


# Average Hours of Study

In [49]:
profiles['Hours of Study'].mean()

39.75