In [2]:
import csv
import numpy as np
from sklearn.cluster import KMeans

In [8]:
from numpy import random
def _perturb(centroid):
    """Return a copy of `centroid` with uniform noise in [-0.01, 0.01) added per element."""
    return [value + random.uniform(-1, 1) / 100 for value in centroid]


def generate_centroids(centroid_type, k, dim=1000):
    """Build initial k-means centroids, save them to 'Centroids.txt' and return them.

    Parameters
    ----------
    centroid_type : str
        "uniform"   -- k centroids whose coordinates are drawn from U(0, 1).
        "trained"   -- one centroid per true class: the mean of the normalized
                       rows of 'topUsers_Apr-Jul_2014_1000-words.txt', where
                       column 1 is read as the class index, column 2 as the
                       row total and columns 3+ as the counts divided by it.
        "perturbed" -- k == 2: the mean of all rows of 'topUsers_normalized.txt'
                       plus a noisy copy of that mean.
                       k == 4: the first two centroids currently stored in
                       'Centroids.txt' plus a noisy copy of each.
    k : int
        Number of centroids to produce.
    dim : int, optional
        Length of the generated / accumulated vectors (default 1000, the value
        that used to be hard-coded). Rows longer than `dim` are truncated to it.
        Not used by the "perturbed", k == 4 branch, which takes its dimension
        from the file.

    Returns
    -------
    list of list of float
        The centroids, in the same order as they are written to the file.

    Raises
    ------
    ValueError
        If the (centroid_type, k) combination is not supported, if
        'topUsers_normalized.txt' has no data rows, or if 'Centroids.txt'
        holds fewer than two centroids for the k == 4 case. In all of these
        cases 'Centroids.txt' is left untouched.
    """
    centroid_points = []

    # We just want a uniform centroid for each class in k
    if centroid_type == "uniform":
        centroid_points = [[random.uniform(0, 1) for _ in range(dim)]
                           for _ in range(k)]

    # We want to find the centroid for each true class
    elif centroid_type == "trained":
        class_sums = [[0.0] * dim for _ in range(k)]
        class_counts = [0] * k

        # Normalize each row by its total and add it element-wise to the
        # running sum of its true class.
        with open('topUsers_Apr-Jul_2014_1000-words.txt', 'r') as myfile:
            for line in csv.reader(myfile):
                if not line:
                    continue  # a blank row would otherwise raise on line[1]
                true_class = int(line[1])
                total = float(line[2])
                class_counts[true_class] += 1
                class_sums[true_class] = [
                    acc + float(count) / total
                    for acc, count in zip(class_sums[true_class], line[3:])
                ]

        # Divide each sum by the class size. A class with no rows keeps its
        # all-zero vector instead of raising ZeroDivisionError.
        for vector, count in zip(class_sums, class_counts):
            if count:
                centroid_points.append([value / count for value in vector])
            else:
                centroid_points.append(vector)

    # We need to find the data-wide centroid first
    # Add random noise to this data-wide centroid to get the second centroid
    elif centroid_type == "perturbed" and k == 2:
        total_centroid = [0.0] * dim
        row_count = 0

        # Loop through normalized data to find the element-wise sum
        with open('topUsers_normalized.txt', 'r') as myfile:
            for line in csv.reader(myfile):
                if not line:
                    continue  # zip() with an empty row would wipe the sum
                row_count += 1
                total_centroid = [acc + float(value)
                                  for acc, value in zip(total_centroid, line)]

        if row_count == 0:
            raise ValueError("'topUsers_normalized.txt' contains no data rows")

        # Divide each element by number of observations to get the centroid
        total_centroid = [value / row_count for value in total_centroid]
        centroid_points.append(total_centroid)

        # Perturb the original centroid
        centroid_points.append(_perturb(total_centroid))

    # We use the final centroids of the k = 2 perturbed method
    # Add random noise to these centroids to get the third and fourth centroids
    elif centroid_type == "perturbed" and k == 4:
        with open('Centroids.txt', 'r') as myfile:
            # Real lists (not map objects) so the rows can be iterated more
            # than once on Python 3; blank lines are skipped.
            stored = [[float(value) for value in row.strip().split(',')]
                      for row in myfile if row.strip()]

        if len(stored) < 2:
            raise ValueError("'Centroids.txt' must hold at least two centroids "
                             "to build the k == 4 perturbed set")

        # Only the first two rows are the base centroids. This function rewrites
        # the file with four rows, so using every row would yield six centroids
        # when the call is repeated.
        centroid_points = stored[:2]
        centroid_points.extend([_perturb(base) for base in stored[:2]])

    else:
        # Fail before touching the file: falling through used to overwrite
        # 'Centroids.txt' with an empty file.
        raise ValueError("unsupported centroid_type/k combination: %r, k=%r"
                         % (centroid_type, k))

    with open('Centroids.txt', 'w') as f:
        f.writelines(','.join(str(value) for value in centroid) + '\n'
                     for centroid in centroid_points)

    return centroid_points

In [5]:
# Build the feature matrix X and the label vector Y from the raw word-count file.
# Per row: field 1 is kept as the label (as a string), field 2 is the divisor,
# and every field from index 3 on is divided by it.
Y = []
X = []

with open('topUsers_Apr-Jul_2014_1000-words.txt','r') as myfile:
    reader = csv.reader(myfile)
    for record in reader:
        denominator = float(record[2])
        Y.append(record[1])
        features = [float(count) / denominator for count in record[3:]]
        X.append(features)

# Convert both containers to numpy arrays for scikit-learn.
X = np.array(X)
Y = np.array(Y)

In [18]:
# Uncomment to (re)generate Centroids.txt before loading it:
# generate_centroids("uniform", 4)

# Read one centroid per non-blank line. Building real lists (instead of map
# objects) keeps the array numeric on Python 3, and the `with` block closes the
# file even if a value fails to parse.
with open('Centroids.txt', 'r') as myfile:
    centroids = [[float(value) for value in row.strip().split(',')]
                 for row in myfile if row.strip()]

centroids = np.array(centroids)

In [19]:
# The starting centroids are given explicitly, so there is nothing to gain from
# repeated initialisations: request exactly one run instead of relying on
# scikit-learn to override its default (and warn about it).
km = KMeans(n_clusters=4, init=centroids, n_init=1)
km.fit(X)
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(km.cluster_centers_)

[[ 0.          0.00823094  0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.02993239  0.04154374  0.02406348 ...,  0.00010714  0.00025228
   0.00026425]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [13]:
import csv
from numpy import random

k = 4

# Load every row of the normalized data set as a list of floats.
data = []
with open('topUsers_normalized.txt', 'r') as myfile:
    for line in csv.reader(myfile):
        if not line:
            continue  # ignore blank rows
        fields = [float(i) for i in line]
        data.append(fields)

# Pick k distinct rows as the initial centroids.
# The index range follows the data actually loaded instead of a hard-coded
# 0..999 (numpy's randint upper bound is exclusive, so row 999 could never be
# chosen). replace=False rules out two identical starting centroids; it raises
# ValueError if the file holds fewer than k rows.
seeds = [int(s) for s in random.choice(len(data), size=k, replace=False)]

centroid_points = [data[s] for s in seeds]

print(centroid_points)

[[0.00859986076416, 0.0786272984152, 0.0196158728859, 0.0295671403415, 0.0237519963962, 0.0346861050821, 0.0138826323764, 0.0070027437651, 0.0, 0.0188377902453, 0.0190835005528, 0.0250624513698, 0.0121626602236, 0.0, 0.00765797125189, 0.0106474466604, 0.0151521356321, 0.0187558868095, 0.0114664810189, 0.0207215692698, 0.0, 0.00565133707359, 0.00573324050944, 0.00761701953397, 0.00741226094435, 0.00896842622548, 0.00823129530284, 0.00638846799623, 0.0024980547934, 0.00597895081699, 4.09517179246e-05, 0.00925508825095, 0.00536467504812, 0.00864081248208, 0.00536467504812, 0.00372660633114, 0.0070027437651, 4.09517179246e-05, 0.00511896474057, 0.00687988861133, 0.000327613743397, 0.00380850976698, 0.00712559891887, 0.00151521356321, 0.00298947540849, 0.00303042712642, 0.00409517179246, 0.0, 0.00307137884434, 0.000286662025472, 0.00417707522831, 0.00212948933208, 0.00237519963962, 0.00507801302265, 0.0, 0.00348089602359, 0.000614275768869, 0.00425897866415, 0.00425897866415, 0.001310454973

In [19]:
# Collect one vector per 'CODE' row of the summaries file: every field from
# index 3 on, divided by the value in field 2.
centroid_points = []

with open('topUsers_Apr-Jul_2014_1000-words_summaries.txt','r') as myfile:
    for row in csv.reader(myfile):
        # Only the rows tagged 'CODE' contribute a vector.
        if row[0] != 'CODE':
            continue
        row_total = float(row[2])
        centroid_points.append([int(count) / row_total for count in row[3:]])


[[0.012807130299849048, 0.04749921984805603, 0.0260213372375041, 0.027248829430505165, 0.026089596084959435, 0.02315617353881163, 0.011894716164597415, 0.0050678636122189254, 8.539472158716223e-08, 0.016790395553189424, 0.01375899679647395, 0.017788887567800917, 0.00936099784529192, 5.692981439144149e-08, 0.009078768290446348, 0.010208398132508527, 0.011564494776219859, 0.012124427965666882, 0.012339394944808964, 0.012239568515273571, 0.0, 0.009992975714851311, 0.00787783385585889, 0.006591960138199402, 0.009589571050073558, 0.007788681766521893, 0.007324931498489211, 0.007398456353775757, 0.0024776709170371207, 0.006573913387037314, 0.007532953040275538, 0.0058577363219929806, 0.004409527238596296, 0.006956481739747802, 0.006586921849625759, 0.006430364860049295, 0.0060850286059508105, 2.447982018831984e-06, 0.00412647220144205, 0.00485377904519991, 0.005288665897336131, 0.005039199450672835, 0.005154539254629896, 0.003807437521592411, 0.0050074610791496065, 0.001979221927132855, 0.00

In [25]:
# Read the 'ALL_CODES' row of the summaries file and scale its fields (index 3
# onward) by the value in field 2.
# `point` is reset first so that a missing row is reported instead of silently
# reusing a value left in the kernel by another cell.
point = None
with open('topUsers_Apr-Jul_2014_1000-words_summaries.txt','r') as myfile:
    for line in csv.reader(myfile):
        if line and line[0] == "ALL_CODES":
            total = float(line[2])
            point = [int(i) / total for i in line[3:]]

if point is None:
    raise ValueError("no 'ALL_CODES' row found in the summaries file")

# Perturb the original centroid k times (k comes from an earlier cell) and
# rescale each result so that its elements sum to 1.
# Every result is kept in `normed_centroids`; `perturbed_centroid` and `normed`
# still hold the last one after the loop, as before.
# NOTE(review): the noise is +/-0.01 per element, so elements can become
# negative; confirm that is acceptable for the downstream clustering.
normed_centroids = []
for _ in range(k):
    perturbed_centroid = [value + random.uniform(-1, 1) / 100 for value in point]
    # Compute the normaliser once instead of once per element.
    norm = sum(perturbed_centroid)
    normed = [value / norm for value in perturbed_centroid]
    normed_centroids.append(normed)


0.0340896109418


In [29]:
# Sanity check: reload the stored centroids and print the sum of each of the
# first four.
centroids = []
with open('Centroids.txt','r') as myfile:
    centroids.extend([float(value) for value in row] for row in csv.reader(myfile))

for idx in range(4):
    print(sum(centroids[idx]))

1.0
1.0
1.0
1.0
