In [1]:
from pyspark.sql import SQLContext

# Load the raw data file as an RDD with one text line per record.
# NOTE(review): `sc` is not defined in the visible cells -- presumably the
# SparkContext injected by the PySpark notebook kernel; confirm before running
# on a plain Python kernel.
HABERMAN_PATH = 'file:///home/cloudera/Desktop/final/haberman.data'
trainingSet = sc.textFile(HABERMAN_PATH)

# Peek at the first five records to check the comma-separated layout.
trainingSet.take(5)


['30,64,1,1', '30,62,3,1', '30,65,0,1', '31,59,2,1', '31,65,4,1']

In [2]:
def getData(line):
    """Parse one comma-separated record into ``[x, y, colour]``.

    Fields 0 and 2 are converted to ``float`` and become the two plotted
    coordinates (age and positive-node count according to the UCI Haberman's
    Survival description -- confirm against the data source). Field 3 is the
    class label: ``'1'`` maps to ``"green"``, every other value to ``"red"``.

    Each field is stripped before use so that a trailing ``\\r`` (CRLF files)
    or stray spaces around the label do not silently turn every record
    into ``"red"``.

    Args:
        line: One text record with at least four comma-separated fields.

    Returns:
        A three-element list ``[float, float, str]``.

    Raises:
        IndexError: If the record has fewer than four fields.
        ValueError: If field 0 or field 2 is not numeric.
    """
    attrs = [field.strip() for field in line.split(',')]
    colour = "green" if attrs[3] == '1' else "red"
    return [float(attrs[0]), float(attrs[2]), colour]

# Parse every raw text record into a [x, y, colour] row (see getData).
data = trainingSet.map(getData)

In [3]:
# Split the parsed rows by their true label, stored at index 2 of each row.
# Rows labelled "green" form the cure group ...
cure_group = data.filter(lambda sample: sample[2] == "green")

# ... and rows labelled "red" form the fatal group.
fatal_group = data.filter(lambda sample: sample[2] == "red")

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Scatter every sample coloured by its true label.
# The rows are collected to the driver once and drawn with a single scatter
# call (one artist) instead of one call per sample; the colour list keeps the
# original per-row colour choice and draw order.
rows = data.collect()
if rows:  # nothing to draw for an empty data set
    xs = [row[0] for row in rows]
    ys = [row[1] for row in rows]
    colours = ["green" if row[2] == "green" else "red" for row in rows]
    plt.scatter(xs, ys, c=colours, alpha=0.1)

In [5]:
def findMeanPair(RDD):
    """Return ``[mean of element 0, mean of element 1]`` over all rows.

    Args:
        RDD: A collection exposing ``map(fn)`` and ``mean()`` whose rows are
            indexable and hold numeric values at positions 0 and 1.

    Returns:
        A two-element list with the per-column means, in column order.
    """
    def column_mean(index):
        # Project a single column, then let the RDD compute its mean.
        return RDD.map(lambda row: row[index]).mean()

    return [column_mean(0), column_mean(1)]
    
# Centroid (mean of both plotted coordinates) of each true-label group,
# computed in the order cure, fatal.
cure_mean, fatal_mean = [
    findMeanPair(group) for group in (cure_group, fatal_group)
]

# Show both centroids side by side.
print(cure_mean, fatal_mean)

[52.0177777777778, 2.7911111111111127] [53.67901234567903, 7.4567901234567895]


In [8]:
# Mark the two class centroids: '+' for the cure group, 'x' for the fatal group.
plt.scatter(cure_mean[0],cure_mean[1], marker='+',s=80,c="black",label="cured centeroid")
plt.scatter(fatal_mean[0], fatal_mean[1],marker='x',s=50,c="black", label="fatal centeroid")

# The label= arguments above are only displayed once a legend is drawn.
plt.legend()
plt.show()

In [9]:
# Clear the current figure so the next plot starts from an empty canvas.
plt.clf()

import math

def is_cure(line):
    """Re-label a row by its nearest class centroid.

    Despite the name, this does not return a boolean: it returns a new row
    ``[line[0], line[1], colour]`` where ``colour`` is ``"green"`` when the
    point is strictly closer (Euclidean distance) to the module-level
    ``cure_mean`` than to ``fatal_mean``, and ``"red"`` otherwise (ties
    included).

    Args:
        line: An indexable row whose first two elements are numeric coordinates.

    Returns:
        A three-element list ``[x, y, predicted_colour]``.
    """
    px, py = line[0], line[1]

    def distance_to(center):
        # Straight-line distance from the point to a [x, y] centroid.
        return math.sqrt((px-center[0])**2+(py-center[1])**2)

    predicted = "green" if distance_to(cure_mean) < distance_to(fatal_mean) else "red"
    return [px, py, predicted]

# Despite the name, this RDD holds re-labelled rows [x, y, predicted_colour]
# produced by is_cure, not distance values.
distances = data.map(is_cure)

In [25]:
# Scatter every sample coloured by its predicted label (index 2 of each row).
# The original if/else had two identical branches, so the row's own colour is
# used directly, and all points are drawn with a single scatter call instead
# of one call per sample.
predicted_rows = distances.collect()
if predicted_rows:  # nothing to draw for an empty data set
    xs = [row[0] for row in predicted_rows]
    ys = [row[1] for row in predicted_rows]
    colours = [row[2] for row in predicted_rows]
    plt.scatter(xs, ys, c=colours, alpha=0.3)

# Overlay the two class centroids: '+' for the cure group, 'x' for the fatal group.
plt.scatter(cure_mean[0],cure_mean[1], marker='+',s=80,c="black",label="cured centeroid")

plt.scatter(fatal_mean[0], fatal_mean[1],marker='x',s=50,c="black", label="fatal centeroid")

# The label= arguments above are only displayed once a legend is drawn.
plt.legend()
plt.show()

In [48]:
def get_pair(line):
    """Turn a row into a ``(label, 1)`` tuple.

    Args:
        line: An indexable row whose element at index 2 is the label.

    Returns:
        A 2-tuple of the label and the constant ``1``.
    """
    label = line[2]
    return label, 1

# (label, 1) pairs for the true labels ...
data_pair = data.map(get_pair)

# ... and for the labels predicted by is_cure.
# NOTE(review): neither RDD is referenced again in the visible cells.
distance_pair = distances.map(get_pair)

In [75]:
# Ground truth: rows whose original label is "green"
# (same predicate as cure_group defined earlier).
actual_cure = data.filter(lambda sample: sample[2] == "green")

# Debug aid: uncomment to inspect the rows on the driver.
#actual_cure.collect()

In [80]:
# Ground truth: rows whose original label is "red"
# (same predicate as fatal_group defined earlier).
actual_fatal = data.filter(lambda sample: sample[2] == "red")

# Debug aid: uncomment to inspect the rows on the driver.
#actual_fatal.collect()

In [83]:
# Prediction: rows that is_cure re-labelled "green".
predict_cure = distances.filter(lambda sample: sample[2] == "green")

# Debug aid: uncomment to inspect the rows on the driver.
#predict_cure.collect()

In [86]:
# Prediction: rows that is_cure re-labelled "red".
predict_fatal = distances.filter(lambda sample: sample[2] == "red")

# Debug aid: uncomment to inspect the rows on the driver.
#predict_fatal.collect()

In [89]:
# (true "green" count, predicted "green" count).
# Only the totals are compared; matching totals would not imply that the same
# rows were selected.
actual_cure.count(), predict_cure.count()

(225, 213)

In [90]:
# (true "red" count, predicted "red" count).
# Only the totals are compared; matching totals would not imply that the same
# rows were selected.
actual_fatal.count(), predict_fatal.count()

(81, 93)