In [90]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import text
import tldextract
from matplotlib import pyplot as plt
from fcmeans import FCM

In [91]:
pd.set_option('display.max_rows', None)

In [92]:
df = pd.read_csv('history.csv', parse_dates = True)

In [93]:
# Extract domains from URL
domains = df.url.apply(lambda x: tldextract.extract(x).domain)
df = pd.concat([df, domains.rename('domains')], axis = 1)

def f(x):
    t = str(x.title).lower().strip()
    if t != x.domains:
        x.title = t.replace(x.domains, "")
    if x.title == 'nan':
        x.title = x.domains
    return x

df = df.apply(f, axis = 1)

In [94]:
my_stop_words = text.ENGLISH_STOP_WORDS.union(["suche", "search"])
vectorizer = TfidfVectorizer(stop_words=my_stop_words, min_df = 0.001, max_df = 0.9)

In [95]:
X = vectorizer.fit_transform(df.title[~pd.isnull(df.title)])

In [96]:
max_n = 10

In [97]:
# wcss = []
# for i in range(1, max_n):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
#     kmeans.fit(X)
#     wcss.append(kmeans.inertia_)

# plt.plot(range(1, max_n), wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

In [98]:
n_init = np.random.randint(0,100)
km = KMeans(n_clusters=max_n, init='k-means++', max_iter=300, n_init=n_init)
km.fit(X)

KMeans(n_clusters=10, n_init=48)

In [99]:
pred_y = pd.Series(km.predict(X))
pred_y.index = df.index

In [100]:
df = pd.concat([df, pred_y.rename('Cluster')], axis = 1)

In [101]:
# for i in range(1, 10):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
#     kmeans.fit(X)
#     wcss.append(kmeans.inertia_)

# plt.plot(range(1, max_n), wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

In [102]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(max_n):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: python overflow stack list string convert data user pandas sort
Cluster 1: season premium episode watch disney sunny philadelphia louie 10 11
Cluster 2: learning university data machine discussions neural ai networks india home
Cluster 3: netflix remove indian comedy error online watch für garden garching
Cluster 4: maps lamma island jaipur cyberport information garden hyderabad google different
Cluster 5: algebra linear opencourseware mathematics unit matrices exchange applications positive stack
Cluster 6: gmail account com kshitijkapoor08 kshitijkapoor96 attention security linkedin scratch game
Cluster 7: stories plancker level german web com youtube games game für
Cluster 8: gmail kshitijkapoor08 com results kshitijkapoor96 inbox 2020 new security fwd
Cluster 9: algs stanford beaunus master txt testcases wiki test home cases


In [103]:
df.Cluster.value_counts()

2    10306
0      833
3      588
4      585
9      215
1      214
8      173
7      138
5      107
6       13
Name: Cluster, dtype: int64

In [104]:
df[df.Cluster == 2].sample(1000)

Unnamed: 0,id,lastVisitTime,lastVisitTimeTimestamp,title,typedCount,url,visitCount,domains,Cluster
8650,104774,"26/04/2020, 00:54:01",1587843000000.0,hashar meaning in hindi - search,0,https://www.google.com/search?q=hashar+meaning...,2,google,2
3909,111952,"03/06/2020, 15:14:38",1591177000000.0,chi-squared test -,0,https://en.wikipedia.org/wiki/Chi-squared_test,1,wikipedia,2
7006,107373,"12/05/2020, 21:06:02",1589298000000.0,neural networks and deep learning - discussion...,0,https://www.coursera.org/learn/neural-networks...,1,coursera,2
3096,113070,"07/06/2020, 06:15:09",1591491000000.0,tiddlywiki - -suche,0,https://www.google.com/search?q=tiddlywiki&oq=...,2,google,2
7107,107185,"12/05/2020, 15:21:25",1589277000000.0,(1) bhala hua mori gagri phooti -,0,https://www.youtube.com/results?search_query=b...,2,youtube,2
1694,107400,"11/06/2020, 22:49:21",1591896000000.0,improving deep neural networks: hyperparameter...,0,https://www.coursera.org/learn/deep-neural-net...,26,coursera,2
1840,114643,"11/06/2020, 15:28:09",1591869000000.0,how to export pandas dataframe to csv - toward...,0,https://towardsdatascience.com/how-to-export-p...,1,towardsdatascience,2
8429,105134,"27/04/2020, 20:52:47",1588001000000.0,chaap tilak sab chheeni ~ amir khusrau -,0,https://www.youtube.com/watch?v=wuxSFZV51W8&t=7s,1,youtube,2
6360,108501,"15/05/2020, 05:09:44",1589500000000.0,convolutional model: step by step |,0,https://www.coursera.org/learn/convolutional-n...,1,coursera,2
9660,103095,"17/04/2020, 20:18:26",1587135000000.0,bew_antrag_8785465_20200417164814.pdf,0,https://campus.tum.de/tumonline/wbCallReport.s...,2,tum,2
