# Clustering for UpLabel

In [39]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import confusion_matrix, classification_report, silhouette_score
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [40]:
# Load the tab-separated input file and preview the first rows
df = pd.read_csv('input.txt', delimiter="\t")
df.head()

Unnamed: 0,label,text,tag
0,Sport,21-Jähriger fällt wohl bis Saisonende aus. Wie...,
1,Kultur,"'Erfundene Bilder zu Filmen, die als verloren ...",
2,Web,Der frischgekürte CEO Sundar Pichai setzt auf ...,
3,Wirtschaft,"Putin: ""Einigung, dass wir Menge auf Niveau vo...",
4,Inland,Estland sieht den künftigen österreichischen P...,


In [41]:
def get_cluster_complexity(data, random_state=None):
    """Cluster the texts with TF-IDF + KMeans and print a silhouette-based score.

    One cluster is requested per distinct value of ``data.label``. The texts
    are vectorised once (German stop words removed), and that same feature
    matrix is used both to fit KMeans and to compute the silhouette score, so
    the score describes the clustering that was actually fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``label`` and a ``text`` column.
    random_state : int or None, optional
        Forwarded to ``KMeans``. KMeans runs a single initialisation
        (``n_init=1``), so pass an integer to get reproducible clusters.
        The default ``None`` leaves the run unseeded.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an added ``cluster`` column holding the
        predicted cluster of each row; rows whose label is the empty string
        are excluded.
    """
    n_clusters = len(data.label.drop_duplicates())
    german_stopwords = stopwords.words("german")

    # Fit cluster model. TfidfVectorizer already applies TF-IDF weighting, so
    # no additional TfidfTransformer step is chained after it.
    text_clf = Pipeline([
        ('vect', TfidfVectorizer(max_df=0.5, min_df=2, stop_words=german_stopwords)),
        ('clf', KMeans(n_clusters=n_clusters, init='k-means++', max_iter=1000,
                       n_init=1, random_state=random_state)),
    ])
    text_clf.fit(data.text)

    # Reuse the fitted vectorizer so the prediction and the silhouette score
    # share the feature space the clusters were fitted in.
    features = text_clf.named_steps['vect'].transform(data.text)
    pred = text_clf.named_steps['clf'].predict(features)

    # TODO: create mapping table for labels
    # Attach the predictions while the frame still has one row per prediction,
    # and only then drop the rows with an empty label.
    clustered = data.assign(cluster=pred)
    clustered = clustered[clustered.label != ''].copy()

    # Calculate complexity: rescale the silhouette score from [-1, 1] to [0, 1]
    silscore = silhouette_score(features, pred)
    complexity = (silscore + 1) / 2
    print(f'\t[INFO] Complexity Score -> {complexity}')
    return clustered

In [None]:
# Run the clustering on the loaded frame; the complexity score is printed
# and the frame with the assigned clusters is kept for the ranking below
returnobject = get_cluster_complexity(data=df)

In [None]:
# Rank table: count rows per (cluster, label) pair, order by the text count
# and keep the last (i.e. highest-count) row of every cluster
rank = (
    returnobject
    .groupby(["cluster", "label"])
    .count()
    .sort_values("text")
    .groupby(level=0)
    .tail(1)
)
print(rank)

In [None]:
# Keep an untouched copy in rank_list, then strip the count columns
# 'text' and 'tag' from rank itself
rank_list = rank.copy()
del rank['text']
del rank['tag']

In [None]:
# Turn the index levels back into ordinary columns
rank = rank.reset_index(drop=False)

In [None]:
# Create ranking: map each cluster id to its retained (highest-count) label.
# Indexing by 'cluster' makes the dict keys the cluster ids themselves; a
# groupby/apply identity would key the dict by the row index instead.
rank.set_index('cluster')['label'].to_dict()