# MidTerm DATASCI W261: Machine Learning at Scale

**Name:** Megan Jasek  
**Email:** meganjasek@ischool.berkeley.edu  
**Class Name:** W261-2  
**Week Number:** 8, MidTerm Exam  
**Date:** 6/29/16  

### MRJob for KMeans

In [30]:
%%writefile Kmeans.py
from numpy import argmin, array, random
from mrjob.job import MRJob
from mrjob.step import MRStep
from itertools import chain

#Calculate find the nearest centroid for data point 
def MinDist(datapoint, centroid_points):
    D = datapoint
    sqx1 = D[0]*D[0]
    sqx2 = D[1]*D[1]
    size_x = pow((sqx1+sqx2), 0.5)
    weight = 1.0 / size_x
    datapoint = array(datapoint)
    centroid_points = array(centroid_points)
    diff = datapoint - centroid_points 
    diffsq = weight*diff**2
    
    distances = (diffsq.sum(axis = 1))**0.5
    # Get the nearest centroid for each instance
    min_idx = argmin(distances)
    return min_idx

#Check whether centroids converge
def stop_criterion(centroid_points_old, centroid_points_new,T):
    oldvalue = list(chain(*centroid_points_old))
    newvalue = list(chain(*centroid_points_new))
    Diff = [abs(x-y) for x, y in zip(oldvalue, newvalue)]
    Flag = True
    for i in Diff:
        if(i>T):
            Flag = False
            break
    return Flag


class MRKmeans(MRJob):
    centroid_points=[]
    k=3    
    def steps(self):
        return [
            MRStep(mapper_init = self.mapper_init, mapper=self.mapper,combiner = self.combiner,reducer=self.reducer)
               ]
    #load centroids info from file
    def mapper_init(self):
        self.centroid_points = [map(float,s.split('\n')[0].split(',')) for s in open("Centroids.txt").readlines()]
        #open('Centroids.txt', 'w').close()
    
    #Weight each example as follows using the inverse vector length (Euclidean norm): 
    #weight(X)= 1/||X||, 
    #where ||X|| = SQRT(X.X)= SQRT(X1^2 + X2^2)
    #Here X is vector made up of X1 and X2.
    #load data and output the nearest centroid index and data point 
    def mapper(self, _, line):
        #read in the data point
        D = (map(float,line.split(',')))
        # calculate the weights
        sqx1 = D[0]*D[0]
        sqx2 = D[1]*D[1]
        size_x = pow((sqx1+sqx2), 0.5)
        weight = 1.0 / size_x
        weighted_D = [D[0], D[1]*weight]
        #print D
        #print weight
        #print weighted_D
        idx = MinDist(D,self.centroid_points)
        #idx = MinDist(weighted_D, self.centroid_points)
        yield int(idx), (D[0],D[1],1)
    #Combine sum of data points locally
    def combiner(self, idx, inputdata):
        sumx = sumy = num = 0
        for x,y,n in inputdata:
            num = num + n
            sumx = sumx + x
            sumy = sumy + y
        yield int(idx),(sumx,sumy,num)
    #Aggregate sum for each cluster and then calculate the new centroids
    def reducer(self, idx, inputdata): 
        centroids = []
        num = [0]*self.k 
        distances = 0
        for i in range(self.k):
            centroids.append([0,0])
        for x, y, n in inputdata:
            num[idx] = num[idx] + n
            centroids[idx][0] = centroids[idx][0] + x
            centroids[idx][1] = centroids[idx][1] + y
        centroids[idx][0] = centroids[idx][0]/num[idx]
        centroids[idx][1] = centroids[idx][1]/num[idx]
        yield idx,(centroids[idx][0],centroids[idx][1])
        
    # from HW4 for reference
    #def reducer2(self, idx, inputdata): 
    #    centroids = []
    #    num = [0]*self.k 
    #    for i in range(self.k):
    #        centroids.append([0.0]*1000)
    #    for D, n in inputdata:
    #        num[idx] = num[idx] + n
    #        centroids[idx] = [x + y for x, y in zip(centroids[idx],D)]
    #    centroids[idx] = [i / float(num[idx]) for i in centroids[idx]]        
    #    yield idx, centroids[idx]

if __name__ == '__main__':
    MRKmeans.run()

Overwriting Kmeans.py


**Driver:**  
Generate random initial centroids  

New Centroids = initial centroids  

While(1):  

* Cacluate new centroids
* stop if new centroids close to old centroids
* Updates centroids

In [32]:
%reload_ext autoreload
%autoreload 2

from numpy import random, array
from Kmeans import MRKmeans, stop_criterion
mr_job = MRKmeans(args=['Kmeandata.csv', '--file=Centroids.txt'])
#mr_job = MRKmeans(args=['Kmeandata_small.csv', '--file=Centroids.txt'])

#Geneate initial centroids
centroid_points = [[0,0],[6,3],[3,6]]
k = 3
with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)

# Update centroids iteratively
for i in range(10):
    # save previous centoids to check convergency
    centroid_points_old = centroid_points[:]
    print "iteration"+str(i+1)+":"
    with mr_job.make_runner() as runner: 
        runner.run()
        # stream_output: get access of the output 
        for line in runner.stream_output():
            key,value =  mr_job.parse_output_line(line)
            print key, value
            centroid_points[key] = value
    with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)
    print "\n"
    i = i + 1
print "Centroids\n"
print centroid_points

iteration1:
2 [0.24288276270220563, 5.350519186138149]
0 [-3.344726378997632, 0.3375985510805805]
1 [5.379067911319121, 0.15446805295171434]


iteration2:
2 [0.08609737928171646, 5.025145679728709]
0 [-4.938524015701949, 0.0432165878871746]
1 [5.0402327160888465, -0.026294229978289455]


iteration3:
2 [0.053065423788147964, 4.987793423944292]
0 [-4.98580568889943, 0.0009376094363626959]
1 [5.0402327160888465, -0.026294229978289455]


iteration4:
2 [0.053065423788147964, 4.987793423944292]
0 [-4.98580568889943, 0.0009376094363626959]
1 [5.0402327160888465, -0.026294229978289455]


iteration5:
2 [0.053065423788147964, 4.987793423944292]
0 [-4.98580568889943, 0.0009376094363626959]
1 [5.0402327160888465, -0.026294229978289455]


iteration6:
2 [0.053065423788147964, 4.987793423944292]
0 [-4.98580568889943, 0.0009376094363626959]
1 [5.0402327160888465, -0.026294229978289455]


iteration7:
2 [0.053065423788147964, 4.987793423944292]
0 [-4.98580568889943, 0.0009376094363626959]
1 [5.040232716

## Using the MRJob Class below calculate the KL divergence of the following two objects.

In [33]:
%%writefile kltext.txt
1.Data Science is an interdisciplinary field about processes and systems to extract knowledge or insights from large volumes of data in various forms (data in various forms, data in various forms, data in various forms), either structured or unstructured,[1][2] which is a continuation of some of the data analysis fields such as statistics, data mining and predictive analytics, as well as Knowledge Discovery in Databases.
2.Machine learning is a subfield of computer science[1] that evolved from the study of pattern recognition and computational learning theory in artificial intelligence.[1] Machine learning explores the study and construction of algorithms that can learn from and make predictions on data.[2] Such algorithms operate by building a model from example inputs in order to make data-driven predictions or decisions,[3]:2 rather than following strictly static program instructions.


Writing kltext.txt


In [34]:
import numpy as np
np.log(3)

1.0986122886681098

In [43]:
%%writefile kldivergence.py
from mrjob.job import MRJob
import re
import math
import numpy as np
class kldivergence(MRJob):
    def mapper1(self, _, line):
        index = int(line.split('.',1)[0])
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            if count.has_key(l):
                count[l] += 1
            else:
                count[l] = 1
        for key in count:
            print key
            print [index, count[key]*1.0/len(letter_list)]
            yield key, [index, count[key]*1.0/len(letter_list)]


    def reducer1(self, key, values):
        #Fill in your code
        # (P(i) log (P(i) / Q(i))
        # a [1, 0.11078717201166181]
        kl_sum = 0
        for value in values:
            v1 = value[0]
            v2 = value[1]
            kl_sum += v1 * math.log(v1/1.0*v2)
        yield key, kl_sum
    
    def reducer2(self, key, values):
        kl_sum = 0
        for value in values:
            kl_sum = kl_sum + value
        yield None, kl_sum
            
    def steps(self):
        return [self.mr(mapper=self.mapper1,
                        reducer=self.reducer1),
                self.mr(reducer=self.reducer2)]

if __name__ == '__main__':
    kldivergence.run()

Overwriting kldivergence.py


In [44]:
%reload_ext autoreload
%autoreload 2

from kldivergence import kldivergence
mr_job = kldivergence(args=['kltext.txt'])
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access of the output 
    for line in runner.stream_output():
        print mr_job.parse_output_line(line)



a
[1, 0.11078717201166181]
c
[1, 0.04081632653061224]
b
[1, 0.0058309037900874635]
e
[1, 0.07580174927113703]
d
[1, 0.05539358600583091]
g
[1, 0.014577259475218658]
f
[1, 0.029154518950437316]
i
[1, 0.09620991253644315]
h
[1, 0.01749271137026239]
k
[1, 0.0058309037900874635]
m
[1, 0.026239067055393587]
l
[1, 0.03206997084548105]
o
[1, 0.06997084548104957]
n
[1, 0.0641399416909621]
p
[1, 0.008746355685131196]
s
[1, 0.11078717201166181]
r
[1, 0.06705539358600583]
u
[1, 0.037900874635568516]
t
[1, 0.08163265306122448]
w
[1, 0.011661807580174927]
v
[1, 0.02040816326530612]
y
[1, 0.014577259475218658]
x
[1, 0.0029154518950437317]
a
[2, 0.08483290488431877]
c
[2, 0.04884318766066838]
b
[2, 0.007712082262210797]
e
[2, 0.08997429305912596]
d
[2, 0.04113110539845758]
g
[2, 0.02570694087403599]
f
[2, 0.02313624678663239]
i
[2, 0.09254498714652956]
h
[2, 0.030848329048843187]
k
[2, 0.005141388174807198]
m
[2, 0.03598971722365039]
l
[2, 0.04884318766066838]
o
[2, 0.07969151670951156]
n
[2, 0.08997