In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE 
from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler 

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
df_raw = pd.read_csv("data.csv")
df_ = df_raw.drop(['url','Fitness', 'Sports', 'Gaming', 'Entertainment', 'Gadgets & Tech', 'Finance', 'Education', 'Animal/Pet', 'Health', 'Self Improvement', 'Art', 'Parenting', 'Books', 'url', 'genuinity_score', 'top keywords'], axis=1)
print(df_.head())

df_info = pd.read_csv("data.csv", usecols= ['user_id', 'url'])
print(df_info.head())

   user_id  Food  Fashion  Make-up  Beauty  Lifestyle  Luxury  Travel  \
0      127   2.0     12.0      6.0     4.0        4.0     3.0    15.0   
1      128   3.0      8.0      5.0     5.0        6.0     3.0     5.0   
2      132   2.0      7.0      5.0     4.0        7.0     4.0    12.0   
3      134   2.0     11.0      6.0     9.0        7.0     3.0    12.0   
4      135   3.0      5.0      5.0     9.0        9.0     7.0    16.0   

   Photography  
0          2.0  
1          1.0  
2          2.0  
3          2.0  
4          2.0  
   user_id                                          url
0      127   https://www.instagram.com/abhiruchipandey/
1      128  https://www.instagram.com/thebasic_culture/
2      132      https://www.instagram.com/vipul_juneja/
3      134      https://www.instagram.com/theleggylass/
4      135     https://www.instagram.com/akanksharedhu/


In [3]:
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df_))
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.587887,-0.517096,0.762015,0.022361,-0.485286,-0.319008,-0.214583,1.301202,-0.348429
1,-1.582266,-0.431983,0.093993,-0.255758,-0.242643,0.328773,-0.214583,-0.602763,-0.70769
2,-1.559784,-0.517096,-0.073012,-0.255758,-0.485286,0.652664,0.547953,0.730013,-0.348429
3,-1.548542,-0.517096,0.595009,0.022361,0.72793,0.652664,-0.214583,0.730013,-0.348429
4,-1.542922,-0.431983,-0.407023,-0.255758,0.72793,1.300444,2.835561,1.491599,-0.348429
5,-1.537301,-0.431983,-0.240018,0.856719,0.72793,-0.319008,0.547953,-0.22197,-0.348429
6,-1.526059,1.3554,-0.741034,-0.255758,-0.72793,-1.290679,-1.739655,-0.793159,-0.348429
7,-1.509197,-0.517096,0.260998,0.022361,-0.242643,2.272115,0.547953,-0.412366,0.729354
8,-1.503577,-0.431983,-0.741034,3.081671,2.669076,-0.966788,-0.214583,-0.602763,-0.348429
9,-1.475473,3.483236,-0.90804,-1.090115,-0.72793,-1.290679,-0.977119,-0.983556,-0.348429


In [4]:
kmeans = KMeans(n_clusters= 6, verbose=1)
kmeans.fit(df)

Initialization complete
Iteration 0, inertia 1277.0539878919703
Iteration 1, inertia 1008.3662561154744
Iteration 2, inertia 984.9937446312504
Iteration 3, inertia 972.1381555512064
Iteration 4, inertia 966.6589018876366
Iteration 5, inertia 963.5066258362134
Iteration 6, inertia 960.7554963240565
Iteration 7, inertia 950.2960392109206
Iteration 8, inertia 921.758119245862
Iteration 9, inertia 906.7413510312919
Iteration 10, inertia 899.7825065079032
Iteration 11, inertia 898.917951948931
Iteration 12, inertia 898.3880853259177
Iteration 13, inertia 898.051767873378
Iteration 14, inertia 897.4737139414916
Iteration 15, inertia 897.3204997951896
Converged at iteration 15: strict convergence.
Initialization complete
Iteration 0, inertia 1416.5388916496609
Iteration 1, inertia 1010.1284847997983
Iteration 2, inertia 955.5421134479016
Iteration 3, inertia 938.7572045586861
Iteration 4, inertia 936.33299295775
Iteration 5, inertia 936.1802770422626
Iteration 6, inertia 935.68246084136
Itera

KMeans(n_clusters=6, verbose=1)

In [5]:
clusters = kmeans.predict(df)
df["Cluster"] = clusters

df.to_csv("results.csv", index= False)

In [6]:
plotX = pd.DataFrame(np.array(df.sample(150)))

plotX.columns = df.columns

In [7]:
pca_2d = PCA(n_components=2)
pca_3d = PCA(n_components=3)

#Nobody likes doing PCA with categorical tags ://
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(["Cluster"], axis=1)))
PCs_3d = pd.DataFrame(pca_3d.fit_transform(plotX.drop(["Cluster"], axis=1)))

PCs_2d.columns = ["PC1_2d", "PC2_2d"]
PCs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]

plotX = pd.concat([plotX,PCs_2d,PCs_3d], axis=1, join='inner')
plotX["dummy"] = 0

cluster0 = plotX[plotX["Cluster"] == 0]
cluster1 = plotX[plotX["Cluster"] == 1]
cluster2 = plotX[plotX["Cluster"] == 2]
cluster3 = plotX[plotX["Cluster"] == 3]
cluster4 = plotX[plotX["Cluster"] == 4]
cluster5 = plotX[plotX["Cluster"] == 5]

In [11]:
#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_2d"],
                    y = cluster0["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_2d"],
                    y = cluster1["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["PC1_2d"],
                    y = cluster2["PC2_2d"],
                    mode = "markers",
                    name = "Photography",
                    marker = dict(color = 'rgba(0, 135, 200, 0.8)'),
                    text = None)

#trace4 is for 'Cluster 3'
trace4 = go.Scatter(
                    x = cluster3["PC1_2d"],
                    y = cluster3["PC2_2d"],
                    mode = "markers",
                    name = "Food",
                    marker = dict(color = 'rgba(0, 105, 50, 0.8)'),
                    text = None)

#trace5 is for 'Cluster 4'
trace5 = go.Scatter(
                    x = cluster4["PC1_2d"],
                    y = cluster4["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 4",
                    marker = dict(color = 'rgba(50, 25, 210, 0.8)'),
                    text = None)

#trace6 is for 'Cluster 5'
trace6 = go.Scatter(
                    x = cluster5["PC1_2d"],
                    y = cluster5["PC2_2d"],
                    mode = "markers",
                    name = "Fashion",
                    marker = dict(color = 'rgba(70, 195, 100, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3, trace4, trace5, trace6]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [14]:
#trace1 is for 'Cluster 0'
trace1 = go.Scatter3d(
                    x = cluster0["PC1_3d"],
                    y = cluster0["PC2_3d"],
                    z = cluster0["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter3d(
                    x = cluster1["PC1_3d"],
                    y = cluster1["PC2_3d"],
                    z = cluster1["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter3d(
                    x = cluster2["PC1_3d"],
                    y = cluster2["PC2_3d"],
                    z = cluster2["PC3_3d"],
                    mode = "markers",
                    name = "Photography",
                    marker = dict(color = 'rgba(0, 135, 200, 0.8)'),
                    text = None)

#trace4 is for 'Cluster 3'
trace4 = go.Scatter3d(
                    x = cluster3["PC1_3d"],
                    y = cluster3["PC2_3d"],
                    z = cluster3["PC3_3d"],
                    mode = "markers",
                    name = "Food",
                    marker = dict(color = 'rgba(0, 105, 50, 0.8)'),
                    text = None)

#trace5 is for 'Cluster 4'
trace5 = go.Scatter3d(
                    x = cluster4["PC1_3d"],
                    y = cluster4["PC2_3d"],
                    z = cluster4["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 4",
                    marker = dict(color = 'rgba(50, 25, 210, 0.8)'),
                    text = None)

#trace6 is for 'Cluster 5'
trace6 = go.Scatter3d(
                    x = cluster5["PC1_3d"],
                    y = cluster5["PC2_3d"],
                    z = cluster5["PC3_3d"],
                    mode = "markers",
                    name = "Fashion",
                    marker = dict(color = 'rgba(70, 195, 100, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3, trace4, trace5, trace6]

title = "Visualizing Clusters in Three Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)