# Spectral Clustering
The goal is to assign unlabeled data to groups, where similar data points hopefully get assigned to the same group.

In [2]:
import pandas as pd
import numpy as np
import multiprocessing
import matplotlib.pyplot as plt
%matplotlib inline

from preprocessing import PreProcessor

import warnings
warnings.filterwarnings("ignore") 

pp = PreProcessor()
df = pd.read_csv('darkweb/Balanced_Sample_500.csv') #500 records of all categories
df.columns = ['recordID','Category','Item','categoryID']
df['Tokens'] = df.apply(lambda d: pp.preprocess(str(d['Item']), tokenize = True), axis = 1)
df.head()

Unnamed: 0,recordID,Category,Item,categoryID,Tokens
0,40127,Counterfeits/Watches,Emporio Armani - AR1610 Shell Case ceramic bra...,0,"[emporio, armani, ar, shell, case, ceram, brac..."
1,40126,Counterfeits/Watches,Cartier-Tank Ladies Brand: Cartier Series: Tan...,0,"[cartiertank, ladi, brand, cartier, seri, tank..."
2,40125,Counterfeits/Watches,Patek Philippe watch box ★ Patek Philippe - Wa...,0,"[patek, philipp, watch, box, patek, philipp, w..."
3,40130,Counterfeits/Watches,Breitling - NAVITIMER COSMONAUTE 【Replica】 Wat...,0,"[breitl, navitim, cosmonaut, replica, watch, i..."
4,40129,Counterfeits/Watches,Emporio Armani Men's AR0397 Dial color Gary Wa...,0,"[emporio, armani, men, ar, dial, color, gari, ..."


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf = True)
features = vectorizer.fit_transform(df['Item'])
labels = df['Category']

In [4]:
from sklearn.model_selection import train_test_split

random = 42
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=random)

In [5]:
from sklearn.cluster import KMeans
true_k = 30
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X_train, y_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=30, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [6]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [7]:
for i in range(true_k):
    print("Cluster %d:" % i),
    for j in order_centroids[i, :30]:
        print('    %s' % terms[j])

Cluster 0:
    the
    of
    is
    and
    for
    in
    100
    99
    10
    to
    mg
    powder
    purity
    50
    aaa
    gram
    with
    tablets
    this
    from
    10mg
    shipping
    by
    white
    quality
    free
    are
    high
    ritalin
    5mg
Cluster 1:
    the
    you
    to
    is
    for
    this
    and
    of
    will
    we
    are
    in
    listing
    on
    with
    from
    have
    it
    your
    be
    as
    if
    please
    all
    not
    our
    my
    or
    only
    quality
Cluster 2:
    lsd
    blotter
    blotters
    tabs
    laid
    the
    of
    25
    100ug
    and
    hits
    to
    please
    100
    are
    you
    150ug
    crystal
    clean
    with
    our
    10
    for
    very
    is
    ug
    on
    200ug
    from
    vendor
Cluster 3:
    hash
    hashish
    the
    of
    pollen
    is
    quality
    soft
    and
    this
    it
    good
    moroccan
    polm
    very
    from
    smoke
    high
    grade
    

In [9]:
# fig, ax = plt.subplots(figsize=(9,7))
# ax.set_title('Data', fontsize=18, fontweight='demi')
# ax.scatter(X_train[:, 0], X_train[:, 1])

## Eigenvectors and eigenvalues
For a matrix A, if there exists a vector x which isn’t all 0’s and a scalar λ such that Ax = λx, then x is said to be an eigenvector of A with corresponding eigenvalue λ.

In [10]:
float_formatter = lambda x: "%.3f" % x
np.set_printoptions(formatter={'float_kind':float_formatter})
from sklearn.datasets.samples_generator import make_circles
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import pairwise_distances
from matplotlib import pyplot as plt
import networkx as nx
import seaborn as sns
sns.set()

In [12]:
# a 2x2 matrix
A = np.array([[0,1],[-2,-3]])

# find eigenvalues and eigenvectors
vals, vecs = np.linalg.eig(A)

# print results
for i, value in enumerate(vals):
    print("Eigenvector:", vecs[:,i], ", Eigenvalue:", value)

Eigenvector: [0.707 -0.707] , Eigenvalue: -1.0
Eigenvector: [-0.447 0.894] , Eigenvalue: -2.0
