# MBD Project - Stack Overflow - Tags clustering using affinity propagation

**Input Data:** top_tags_90.csv: unique tags accounting for 90% of tags in SO questions <br>
**Two settings:** Precomputed affinity and Euclidean affinity <br>
**Reference:** <br>
- Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation
- Tutorial on using AffinityPropagation to cluster strings: https://stats.stackexchange.com/questions/123060/clustering-a-long-list-of-strings-words-into-similarity-groups




In [1]:
!pip install distance

Defaulting to user installation because normal site-packages is not writeable
Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m500.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25ldone
[?25h  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16257 sha256=075ed9ab6782b4e58939a6f34a0efd3c302d2a86025c5ad83697d668d6b252a1
  Stored in directory: /home/jovyan/.cache/pip/wheels/06/e2/ef/ea4693333706cddb28606cbdcf670a12d5b13f02372937fdf6
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;4

In [2]:
import numpy as np
import importlib
from sklearn.cluster import AffinityPropagation
import distance

In [3]:
import pandas as pd

In [4]:
# Load the top-tags table. pandas' default NA parsing would turn legitimate
# tag names such as "null", "nan" or "NA" into missing values, so only empty
# fields are treated as missing here.
top_tags90 = pd.read_csv(
    'top_tags_90.csv',
    keep_default_na=False,
    na_values=[''],
)
top_tags90.head()

Unnamed: 0,Tag,tag_frequency,med_score,med_answer_count,percent_answer,rel_frequency,cum_frequency
0,javascript,2453713,0.0,1.0,0.528266,0.035504,0.035504
1,python,2071327,0.0,1.0,0.513938,0.029971,0.065474
2,java,1878116,0.0,1.0,0.499181,0.027175,0.09265
3,c#,1571198,0.0,1.0,0.559132,0.022734,0.115384
4,php,1451348,0.0,1.0,0.519951,0.021,0.136384


In [5]:
# Pull the Tag column out as a plain Python list.
tags = top_tags90["Tag"].tolist()

In [6]:
# Number of tags taken from the Tag column.
len(tags)

4748

In [7]:
# Pairwise tag similarity: the negated Levenshtein edit distance, so identical
# strings score 0 and more negative means less similar.
words = np.asarray(tags)  # ndarray so that indexing with a mask/list works later
n_words = len(words)
lev_similarity = np.zeros((n_words, n_words), dtype=int)
# Edit distance is symmetric, so each pair is computed once and mirrored;
# the diagonal stays 0 (distance of a word to itself).
for i in range(n_words):
    for j in range(i + 1, n_words):
        pair_distance = distance.levenshtein(words[i], words[j])
        lev_similarity[i, j] = lev_similarity[j, i] = -pair_distance

## Setting 1: Affinity: Precomputed

In [9]:
# Cluster directly on the similarity matrix: "precomputed" tells the estimator
# that lev_similarity already holds pairwise similarities.
affprop = AffinityPropagation(
    affinity="precomputed",
    damping=0.5,
    random_state=100,
)
affprop.fit(lev_similarity)

### Check output

In [10]:
# How many distinct cluster labels the precomputed-affinity model assigned
np.unique(affprop.labels_).size

530

In [11]:
# List every cluster as "*exemplar:* member, member, ..." with members sorted
# and de-duplicated.
exemplar_indices = affprop.cluster_centers_indices_
for label in np.unique(affprop.labels_):
    member_mask = affprop.labels_ == label
    members = np.unique(words[member_mask])
    print(" - *%s:* %s" % (words[exemplar_indices[label]], ", ".join(members)))

 - *python:* biopython, boost-python, cpython, cython, iphone, ipython, ironpython, jython, methods, mysql-python, pentaho, phalcon, plotly-python, pyodbc, pytest, python, pytorch, rethinkdb, wxpython
 - *android:* anaconda, android, android-tv, android-ui, android-xml, androidx, appium-android, background, endpoint, osmdroid, palindrome, rx-android, sandbox, sendgrid, undefined
 - *jquery:* binary, celery, frequency, jquery, jquery-ui, jruby, liferay, querydsl, subquery, xquery
 - *c++:* c#, c++, c++11, c++14, c++17, c++20, c99, clang++, cmd, coq, css, csv, cut, cv2, cvs, echo, g++, pcre, x++
 - *ios:* aop, blogs, bots, cors, dom, dos, ejs, fs, go, gps, host, i2c, icons, idioms, iis, io, ios, ios10, ios11, ios13, ios4, ios5, ios6, ios7, ios8, ios9, iot, ivy, jboss, jms, jobs, kivy, loops, ls, oop, ribbon, roles, ros, row, rss, ssas, tfs, tvos, views, vps, winjs, xor
 - *json:* bison, bson, cpan, cron, geojson, getjson, gson, hudson, jackson, jboss7.x, jnlp, join, jsch, jsdoc, jsf-2, j

In [12]:
# Sorted distinct cluster labels assigned by the precomputed-affinity model.
np.unique(affprop.labels_)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

## Setting 2: Affinity: Euclidean

In [13]:
# Euclidean setting: the same matrix is passed in, but with affinity="euclidean"
# scikit-learn treats each row of lev_similarity as a feature vector and derives
# its own similarities from the distances between rows.
af_euclidean = (
    AffinityPropagation(affinity="euclidean", damping=0.5, random_state=123)
    .fit(lev_similarity)
)

### Check output

In [14]:
# How many distinct cluster labels the Euclidean-affinity model assigned
np.unique(af_euclidean.labels_).size

119

In [15]:
# List every cluster as "*exemplar:* member, member, ..." with members sorted
# and de-duplicated.
exemplar_indices = af_euclidean.cluster_centers_indices_
for label in np.unique(af_euclidean.labels_):
    member_mask = af_euclidean.labels_ == label
    members = np.unique(words[member_mask])
    print(" - *%s:* %s" % (words[exemplar_indices[label]], ", ".join(members)))

 - *xcode:* adobe, adodb, arcore, bokeh, bower, bundle, chmod, clone, cloud, cocoa, codec, conda, corda, core, count, cplex, cycle, decode, encode, gnome, hover, local, locale, model, models, module, moodle, mouse, move, nodes, ode, oledb, oozie, pickle, plone, pydev, roles, scope, tcpdf, xcode, xcode4, xcode5, xcode6, xcode7, xcode8, xcode9
 - *linq:* bind, build, cicd, cin, clang, click, egit, elisp, file, fill, final, find, fluid, glib, glibc, indy, ini, init, int, jlist, jndi, join, julia, limit, line, linq, lint, linux, lisp, list, live, logic, midi, min, mingw, nginx, owin, ping, plist, send, shiny, sign, slick, slim, swing, unix, using, view, xilinx, zlib, zxing
 - *google-maps-api-3:* google-admin-sdk, google-analytics, google-api-client, google-app-engine, google-apps-script, google-chrome-app, google-cloud-run, google-cloud-sql, google-data-studio, google-drive-api, google-maps-api-2, google-maps-api-3, google-places-api, google-play-games, google-sheets-api, google-tag-manag

In [None]:
# Sorted distinct cluster labels assigned by the Euclidean-affinity model
# (this section inspects af_euclidean, not the precomputed-affinity affprop).
np.unique(af_euclidean.labels_)