In [233]:
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import tldextract
import scipy.sparse
from sklearn.feature_extraction import text
from sklearn import metrics
from sklearn.metrics import pairwise_distances

In [234]:
# Never truncate the number of rows pandas renders for a displayed object.
pd.options.display.max_rows = None

# Read the raw export from the working directory.
# NOTE(review): parse_dates=True only asks pandas to parse the *index* as
# dates; no index_col is given here — confirm that is intended.
df = pd.read_csv('history.csv', parse_dates=True)

In [235]:
# Extract domains from URL: keep only the `.domain` part that tldextract
# reports for each entry of the `url` column, then attach it to the frame
# as a new `domains` column.
domains = df['url'].map(lambda url: tldextract.extract(url).domain)
df = pd.concat((df, domains.rename('domains')), axis='columns')

def f(x):
    """Normalise one row's ``title`` and remove the row's own domain from it.

    Parameters
    ----------
    x : pandas.Series
        A row holding at least a ``title`` and a ``domains`` entry.

    Returns
    -------
    pandas.Series
        A copy of ``x`` whose ``title`` is

        * the domain, when the original title is missing (NaN / None);
        * the lower-cased, stripped title, when that already equals the
          domain (nothing is removed, so the title does not end up empty);
        * otherwise the lower-cased, stripped title with every occurrence
          of the domain removed.
    """
    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the row object they are handed.
    row = x.copy()
    domain = row['domains']

    # Detect a missing title *before* touching the text. Checking the
    # stringified value afterwards is unreliable: str(NaN) is 'nan', and
    # stripping a domain such as 'na' from it leaves 'n', which would no
    # longer be recognised as missing.
    if pd.isna(row['title']):
        row['title'] = domain
        return row

    t = str(row['title']).lower().strip()
    row['title'] = t if t == domain else t.replace(domain, "")
    return row

# Run the title clean-up `f` over every row of the frame.
df = df.apply(f, axis='columns')

In [236]:
# Model Feature Selection
# Columns that are already numeric.
numeric_cols = []
# Categorical columns; each one has an integer-encoded "<name>_int" twin.
categorical_cols = []
categorical_cols_int = [col + "_int" for col in categorical_cols]
# Build a *new* list. Binding useful_cols to numeric_cols and extending it
# would silently add the "_int" names to numeric_cols as well.
useful_cols = numeric_cols + categorical_cols_int

In [237]:
# Encode categorical cols as numeric
# Codes are shifted by +1; pd.factorize marks missing values with -1 by
# default, so they end up as 0.
encoded = {
    c + "_int": pd.factorize(df[c])[0] + 1
    for c in categorical_cols
}
if encoded:
    # Drop "_int" columns left over from an earlier run of this cell so that
    # re-running it does not create duplicate column names.
    df = df.drop(columns=list(encoded), errors='ignore')
    # Give the codes df's own index: a default RangeIndex would be aligned
    # by label in concat and misplace rows whenever df is indexed otherwise.
    # One concat for all columns instead of one per column.
    df = pd.concat([df, pd.DataFrame(encoded, index=df.index)], axis=1)

In [238]:
# useful_cols.extend(list(title_df.columns))

In [239]:
def cluster_df(df_param, iteration, n_clusters, random_state=None):
    """Cluster rows by the TF-IDF representation of their ``title`` with K-Means.

    Prints, for every cluster, its size and the ten highest-weighted terms of
    its centroid, followed by the silhouette coefficient of the clustering.

    Parameters
    ----------
    df_param : pandas.DataFrame
        Input frame with a ``title`` column. It is copied, not modified.
    iteration : int
        Used only to name the output column ``'Cluster' + str(iteration)``.
    n_clusters : int
        Number of K-Means clusters.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so a run can be made reproducible. The default
        (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_param`` with the cluster label column appended. Rows
        whose title is null are not clustered and get NaN in that column.
    """
    # Preprocessing
    local_df = df_param.copy()

    # Only rows with a title can be vectorised. Keep this filtered Series so
    # the labels can be attached to exactly the rows they were computed for.
    titles = local_df['title'].dropna()

    # Current scikit-learn accepts stop_words only as 'english', a list or
    # None, so the frozenset union is converted to a list.
    my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(["suche", "search"]))
    vectorizer = TfidfVectorizer(stop_words=my_stop_words, max_features=100)

    X = vectorizer.fit_transform(titles)

    km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300,
                n_init=10, random_state=random_state)
    labels = km.fit_predict(X)

    cluster_col = 'Cluster' + str(iteration)
    # Index by the filtered titles, not by local_df: the two differ in length
    # as soon as a single title is null.
    pred_y = pd.Series(labels, index=titles.index, name=cluster_col)
    local_df = pd.concat([local_df, pred_y], axis=1)

    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # fall back to it only when the replacement is unavailable.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    # Count once, outside the loop.
    cluster_sizes = pred_y.value_counts()
    for i in range(n_clusters):
        print("Cluster %d:" % i, end=' | ')
        # .get avoids a KeyError for a cluster that received no rows.
        print("Cluster size = ", cluster_sizes.get(i, 0), end=' | ')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end=' | ')
        print()
    print('Silhouette Coefficient = ', metrics.silhouette_score(X, labels, metric='euclidean'))

    return local_df

In [240]:
def evaluate(local_df, n_clusters = 5, iteration = 0, max_iteration = 10, max_fraction = 0.1):
    """Repeatedly re-cluster the largest cluster until it is small enough.

    The first pass clusters the whole frame with ``cluster_df``. Every later
    pass keeps only the rows of the largest cluster from the previous pass
    and clusters those again. The loop stops once the largest cluster holds
    at most ``max_fraction`` of the rows of ``local_df``, or once
    ``iteration`` reaches ``max_iteration``.

    Parameters
    ----------
    local_df : pandas.DataFrame
        Frame to cluster; passed on to ``cluster_df``.
    n_clusters : int, optional
        Number of clusters requested on every pass.
    iteration : int, optional
        Starting pass number; it selects the ``'Cluster<iteration>'`` column
        names that are written and read.
    max_iteration : int, optional
        Upper bound (exclusive) on the pass number.
    max_fraction : float, optional
        Largest share of the input rows a cluster may hold before the loop
        stops. Defaults to the 10% that used to be hard-coded.

    Returns
    -------
    None
        Results are only reported through what ``cluster_df`` prints.
    """
    # Measure the threshold against the frame that was passed in, not against
    # whatever notebook-level `df` happens to be defined.
    size_limit = local_df.shape[0] * max_fraction
    max_cluster_size = local_df.shape[0]
    df_rev = local_df
    while max_cluster_size > size_limit and iteration < max_iteration:
        prev_col = 'Cluster' + str(iteration - 1)
        if prev_col in df_rev.columns:
            # Drill into the largest cluster found on the previous pass.
            largest_label = df_rev[prev_col].value_counts().idxmax()
            df_rev = cluster_df(df_rev[df_rev[prev_col] == largest_label], iteration, n_clusters)
        else:
            # No previous pass: cluster everything.
            df_rev = cluster_df(df_rev, iteration, n_clusters)
        max_cluster_size = df_rev['Cluster' + str(iteration)].value_counts().max()
        iteration = iteration + 1

In [244]:
# Cluster the whole frame, asking for 10 clusters on every pass.
evaluate(df, n_clusters=10)

Top terms per cluster:
Cluster 0: | Cluster size =  458 |  overflow |  stack |  python |  list |  pandas |  string |  data |  using |  algorithm |  learn | 
Cluster 1: | Cluster size =  8449 |  online |  india |  stories |  courses |  chrome |  mathematics |  pandas |  web |  algorithms |  com | 
Cluster 2: | Cluster size =  160 |  paths |  shortest |  discussions |  structures |  graph |  data |  home |  stanford |  university |  course | 
Cluster 3: | Cluster size =  593 |  netflix |  best |  learning |  lamma |  graph |  edu |  episode |  exchange |  explore |  fateh | 
Cluster 4: | Cluster size =  330 |  python |  string |  list |  time |  best |  set |  pandas |  using |  learn |  neural | 
Cluster 5: | Cluster size =  324 |  maps |  com |  world |  linear |  video |  using |  documentation |  edu |  episode |  exchange | 
Cluster 6: | Cluster size =  386 |  learning |  machine |  titanic |  deep |  jobs |  mathematics |  neural |  home |  data |  opencourseware | 
Cluster 7: | Cl