In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import jaccard_score


# grab my k-means and euclidean distance functions from Q1
def euclidean(vec1, vec2):
    return np.linalg.norm(vec1-vec2)

### K-means function
def my_kmeans(image_data, K, max_iter = 300):
    
    print('Max Iterations: {}'.format(max_iter))
    
    ### initialize initial assignment
    labels_prev = np.random.randint(0, K, image_data.shape[0])
    centroids = np.array([np.mean(image_data[np.equal(labels_prev, i)], axis=0) for i in range(K)])
    
    
    ### initialize large difference
    difference = 0
    
    ### initialize iteration
    iteration = 1
    
    ### Repeat algorithm until convergence (when jaccard similarity = 1)
    while difference < 1 or iteration == max_iter:
        
        print('Iteration {}'.format(iteration))
        
        # assign each point to the cluster with the nearest centroid
        distances = np.zeros((image_data.shape[0], K))
        for i in range(image_data.shape[0]):
            for j in range(K):
                distances[i, j] = euclidean(centroids[j], image_data[i, :])
                
        # assign each pixel to closer centroid
        labels_new = np.array([np.argmin(centroid) for centroid in distances])
        
        # Calculate new cluster centers
        centroids = np.array([np.mean(image_data[np.equal(labels_new, i)], axis=0) for i in range(K)])
        
        # calculate difference between old cluster centers and new cluster centers
        if K == 2:
            difference = jaccard_score(labels_prev, labels_new)
        else:
            difference = jaccard_score(labels_prev, labels_new, average='macro')
        
        print('Current Jaccard Similarity: {}'.format(difference))
        
        # pass on labels (if we terminate, these will be equal in the end)
        labels_prev = labels_new
        
        # increment iteration
        iteration += 1
        
        if iteration > 5:
            clusters = np.array([image_data[np.where(labels_new == k)] for k in range(K)])
            empty_clusters = np.array([np.linalg.norm(cluster) == 0 for cluster in clusters])
            if np.sum(empty_clusters > 0):
                print('Found empty clusters. Reducing K to {}'.format(K))
                K-=1
                # reinitialize
                labels_prev = np.random.randint(0, K, image_data.shape[0])
                centroids = np.array([np.mean(image_data[np.equal(labels_prev, i)], axis=0) for i in range(K)])
        print(centroids)
    
    
    ### return final labels and centroids
    return labels_new, centroids

In [2]:
# Read in the dataset
raw_nodes = pd.read_csv('homework2_data_code/nodes_fixed.txt', delimiter='\t', header=None)
raw_nodes.columns = ['node', 'name', 'label', 'source']

raw_edges = pd.read_csv('homework2_data_code/edges.txt', delimiter='\t', header=None)
raw_edges.columns = ['node_1', 'node_2']

print(raw_nodes)
print(raw_edges)

      node                            name  label                 source
0        1            100monkeystyping.com      0              Blogarama
1        2      12thharmonic.com/wordpress      0            BlogCatalog
2        3           40ozblog.blogspot.com      0  Blogarama,BlogCatalog
3        4                 4lina.tblog.com      0              Blogarama
4        5           750volts.blogspot.com      0              Blogarama
...    ...                             ...    ...                    ...
1485  1486  youngconservative.blogspot.com      1              Blogarama
1486  1487                zebrax.blogs.com      1            BlogCatalog
1487  1488             zeke01.blogspot.com      1  Blogarama,BlogCatalog
1488  1489              zeke01.typepad.com      1              Blogarama
1489  1490          zeph1z.tripod.com/blog      1              Blogarama

[1490 rows x 4 columns]
       node_1  node_2
0         267    1394
1         267     483
2         267    1051
3         9

In [3]:
### Create adjacency and degree matrix, then Laplacian

# adjancency matrix
A = np.zeros((raw_nodes.shape[0], raw_nodes.shape[0]))
for i in range(raw_edges.shape[0]):
    A[raw_edges.node_1.values[i]-1, raw_edges.node_2.values[i]-1] = 1

# degree matrix
# D = np.diag(np.array([np.sum(a) for a in A]))
D = np.zeros((raw_nodes.shape[0], raw_nodes.shape[0]))
for i, j in zip(raw_edges.node_1.values, raw_edges.node_2.values):
    D[i-1, i-1] += 1
    D[j-1, j-1] += 1

# Laplacian
L = D - A

In [8]:
### Calculate eigenvalues and eigenvectors of the laplacian
eigenvals, eigenvecs = np.linalg.eig(L)

# grab m smallest eigenvectors corresponding to non-zero eigenvalues
nonzero_eigenvecs = np.squeeze(eigenvecs[:, np.where(eigenvals > 0)])
nonzero_eigenvals = eigenvals[np.where(eigenvals > 0)]

m = 25

m_smallest_eigenvecs = nonzero_eigenvecs[:, nonzero_eigenvals.argsort()[:m]]

In [9]:
# run k-means with k = 2 on non-zero eigenvalues
labels, centroids = my_kmeans(m_smallest_eigenvecs, K = 2)

Max Iterations: 300
Iteration 1
Current Jaccard Similarity: 0.015503875968992248
[[4.04642795e-05+0.j 0.00000000e+00+0.j 0.00000000e+00+0.j
  0.00000000e+00+0.j 4.04642795e-05+0.j 4.04642795e-05+0.j
  4.04642795e-05+0.j 0.00000000e+00+0.j 1.82856491e-04+0.j
  6.77048070e-04+0.j 1.86465634e-03+0.j 5.52127873e-05+0.j
  6.77048070e-04+0.j 1.37419727e-05+0.j 4.04642795e-05+0.j
  1.63531693e-04+0.j 1.32332560e-04+0.j 2.89668557e-06+0.j
  3.18538146e-05+0.j 4.04642795e-05+0.j 6.91247972e-04+0.j
  0.00000000e+00+0.j 2.20120565e-03+0.j 6.77048070e-04+0.j
  0.00000000e+00+0.j]
 [9.77573022e-02+0.j 7.69230769e-02+0.j 7.69230769e-02+0.j
  1.03203137e-01+0.j 9.77573022e-02+0.j 9.77573022e-02+0.j
  9.77573022e-02+0.j 7.69230769e-02+0.j 1.41071276e-01+0.j
  0.00000000e+00+0.j 3.16910122e-03+0.j 1.06340936e-01+0.j
  0.00000000e+00+0.j 8.33802724e-02+0.j 9.77573022e-02+0.j
  1.37695688e-01+0.j 1.13381471e-01+0.j 7.93889818e-02+0.j
  7.96358462e-02+0.j 9.77573022e-02+0.j 2.87523764e-06+0.j
  7.69230769

In [10]:
### calculate false positive rate
print(np.bincount(labels))
# first determine which cluster is left-leaning and which is right-leaning
nodes_assigned = raw_nodes
nodes_assigned['cluster'] = labels

print(nodes_assigned[nodes_assigned['label'] == 0])
print(nodes_assigned[nodes_assigned['label'] == 1])

[1489    1]
     node                          name  label                       source  \
0       1          100monkeystyping.com      0                    Blogarama   
1       2    12thharmonic.com/wordpress      0                  BlogCatalog   
2       3         40ozblog.blogspot.com      0        Blogarama,BlogCatalog   
3       4               4lina.tblog.com      0                    Blogarama   
4       5         750volts.blogspot.com      0                    Blogarama   
..    ...                           ...    ...                          ...   
753   754            xnerg.blogspot.com      0  LeftyDirectory,eTalkingHead   
754   755      yarsrevenge.blogspot.com      0     Blogarama,LeftyDirectory   
755   756  yglesias.typepad.com/matthew      0                 eTalkingHead   
756   757                      yoder.ru      0                 eTalkingHead   
757   758                 younglibs.com      0                    Blogarama   

     cluster  correct  
0          0   

In [11]:
# looks like cluster zero is liberals and cluster 1 is conservative
nodes_assigned['correct'] = np.where(nodes_assigned['label'] == nodes_assigned['cluster'], 1, 0)

# get false classification rate
np.mean(nodes_assigned.correct.values)

0.5093959731543625