In [1]:
from pyspark import SparkContext
from pyspark import SparkConf

import pandas as pd
import numpy as np

sc = SparkContext.getOrCreate()
sc

In [2]:
# load data
inputRDD = sc.textFile("data/case_transpose.csv")
inputRDD2 = sc.textFile("data/ctrl_transpose.csv")

adRDD = inputRDD.map(lambda line: line.split(","))
addCtrl = inputRDD2.map(lambda line: line.split(","))

genes = adRDD.first()
genes_test = addCtrl.first()

# combine the two csv files into 1 rdd

adRDD = adRDD.union(addCtrl)


# each row corresponds to 1 person & all their genes
# first we want to separate gene names from rdd
adRDD = adRDD.filter(lambda line: line != genes).cache()
# each column represents 1 gene
# making sure they're in the same order
#print(genes_test==genes)

#adRDD = adRDD.map(lambda line: array())

In [3]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# put into familiar pandas dataframe
df = adRDD.toDF(genes).toPandas()

# binary classification, we want to have 1 vs 0
df.Classes = df.Classes.replace('1', '0')

df.Classes = df.Classes.replace('2', '1')

df = df.replace('?', np.nan)

# need to change the dtypes from object to float
df = df.apply(pd.to_numeric, errors='coerce')

df.head(10)

# our dependent var is Classes; 1:case, 0:ctrl


Unnamed: 0,Classes,GI_10047091-S,GI_10047093-S,GI_10047103-S,GI_10047133-A,GI_10092596-S,GI_10092600-S,GI_10092616-S,GI_10092618-S,GI_10092672-S,...,GI_9257221-S,GI_9257224-S,GI_9257226-S,GI_9257237-S,GI_9257239-A,GI_9257241-S,GI_9257243-S,GI_9257244-A,GI_9257245-I,GI_9257247-S
0,1,0.459128,0.037793,-0.005021,0.396483,0.294007,0.264442,0.435411,-0.535314,0.160456,...,-0.704773,-0.375528,-0.512366,0.240621,-0.030798,0.222405,-0.003962,0.357551,0.229804,-0.022661
1,1,-1.984835,,0.076412,,-0.165882,0.422382,-0.248215,1.018655,-0.348808,...,1.432851,1.489629,0.039887,-0.444673,-0.648194,,0.09779,0.434897,0.520602,-0.003774
2,1,1.004902,0.425738,-0.099454,-0.170769,-0.731453,-0.253671,-0.320739,-0.520905,0.544893,...,-0.665363,-0.70933,0.202728,0.147759,0.366219,-0.256029,-0.091443,0.111414,-0.234035,-0.631229
3,1,0.362306,0.170206,-0.120559,-0.103302,0.411346,-0.304337,-0.326257,-0.914469,0.181231,...,-0.722825,-0.98498,-0.160732,-0.066951,0.096363,0.278873,0.026118,-0.002167,-0.123767,-0.103869
4,1,-1.503368,-0.474718,0.193406,-0.469004,-0.345926,0.610511,-0.066067,0.441405,-0.357782,...,0.824125,1.240139,1.094079,-0.156186,-0.416933,-1.045043,-0.023955,0.41968,-0.109643,0.438731
5,1,0.797264,-0.391986,0.156093,-0.571157,0.19226,0.061127,-0.236569,-0.174161,0.238236,...,-0.809506,-0.161834,-0.571864,0.479861,0.610897,-0.450917,-0.311439,-0.001413,0.392068,-0.845085
6,1,0.154576,0.621624,-0.368632,0.282403,-0.612163,0.00151,-1.028594,0.365417,-0.037596,...,0.469806,0.073777,-0.658792,-0.367517,0.004368,0.968082,-0.096755,0.137465,-0.349002,0.099728
7,1,0.413116,0.418259,-0.190405,0.483131,-0.151788,-0.207195,-0.169778,-1.411016,0.423462,...,-1.179746,-1.183298,-1.047216,0.093976,0.079612,1.231731,0.352244,0.129158,-0.109624,-0.190399
8,1,0.169026,0.1287,0.194367,-0.141389,0.299015,0.018593,-0.617532,0.253561,0.017431,...,-0.108731,-0.668293,-0.11433,-0.096561,0.071076,-0.333713,-0.380944,-0.13423,0.218116,-0.037978
9,1,-0.247461,-0.106443,-0.143937,0.108251,-0.243365,-0.444874,-0.103435,0.500709,-0.037531,...,0.344601,0.255346,0.227162,-0.317532,-0.172908,0.113848,0.01084,-0.233653,-0.316777,0.436412


In [4]:
# Deal with NaN values:

from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values="NaN", strategy="mean", axis=1)

imputer = imputer.fit(df)

imputed_df = imputer.transform(df.values)

imputed_df = pd.DataFrame(imputed_df)

imputed_df.columns = genes
imputed_df.head(10)


Unnamed: 0,Classes,GI_10047091-S,GI_10047093-S,GI_10047103-S,GI_10047133-A,GI_10092596-S,GI_10092600-S,GI_10092616-S,GI_10092618-S,GI_10092672-S,...,GI_9257221-S,GI_9257224-S,GI_9257226-S,GI_9257237-S,GI_9257239-A,GI_9257241-S,GI_9257243-S,GI_9257244-A,GI_9257245-I,GI_9257247-S
0,1.0,0.459128,0.037793,-0.005021,0.396483,0.294007,0.264442,0.435411,-0.535314,0.160456,...,-0.704773,-0.375528,-0.512366,0.240621,-0.030798,0.222405,-0.003962,0.357551,0.229804,-0.022661
1,1.0,-1.984835,0.159142,0.076412,0.159142,-0.165882,0.422382,-0.248215,1.018655,-0.348808,...,1.432851,1.489629,0.039887,-0.444673,-0.648194,0.159142,0.09779,0.434897,0.520602,-0.003774
2,1.0,1.004902,0.425738,-0.099454,-0.170769,-0.731453,-0.253671,-0.320739,-0.520905,0.544893,...,-0.665363,-0.70933,0.202728,0.147759,0.366219,-0.256029,-0.091443,0.111414,-0.234035,-0.631229
3,1.0,0.362306,0.170206,-0.120559,-0.103302,0.411346,-0.304337,-0.326257,-0.914469,0.181231,...,-0.722825,-0.98498,-0.160732,-0.066951,0.096363,0.278873,0.026118,-0.002167,-0.123767,-0.103869
4,1.0,-1.503368,-0.474718,0.193406,-0.469004,-0.345926,0.610511,-0.066067,0.441405,-0.357782,...,0.824125,1.240139,1.094079,-0.156186,-0.416933,-1.045043,-0.023955,0.41968,-0.109643,0.438731
5,1.0,0.797264,-0.391986,0.156093,-0.571157,0.19226,0.061127,-0.236569,-0.174161,0.238236,...,-0.809506,-0.161834,-0.571864,0.479861,0.610897,-0.450917,-0.311439,-0.001413,0.392068,-0.845085
6,1.0,0.154576,0.621624,-0.368632,0.282403,-0.612163,0.00151,-1.028594,0.365417,-0.037596,...,0.469806,0.073777,-0.658792,-0.367517,0.004368,0.968082,-0.096755,0.137465,-0.349002,0.099728
7,1.0,0.413116,0.418259,-0.190405,0.483131,-0.151788,-0.207195,-0.169778,-1.411016,0.423462,...,-1.179746,-1.183298,-1.047216,0.093976,0.079612,1.231731,0.352244,0.129158,-0.109624,-0.190399
8,1.0,0.169026,0.1287,0.194367,-0.141389,0.299015,0.018593,-0.617532,0.253561,0.017431,...,-0.108731,-0.668293,-0.11433,-0.096561,0.071076,-0.333713,-0.380944,-0.13423,0.218116,-0.037978
9,1.0,-0.247461,-0.106443,-0.143937,0.108251,-0.243365,-0.444874,-0.103435,0.500709,-0.037531,...,0.344601,0.255346,0.227162,-0.317532,-0.172908,0.113848,0.01084,-0.233653,-0.316777,0.436412


In [5]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

# back to spark df
imputed_df = sqlContext.createDataFrame(imputed_df)

In [6]:
from pyspark.mllib.regression import LabeledPoint

# put into LabeledPoints for mllib classification models
temp = imputed_df.rdd.map(lambda line: LabeledPoint(line[0], line[1:]))

# divide into training and testing sets
train, test = temp.randomSplit([0.7, 0.3], seed=666)

train.cache()
test.cache()


#test.take(1)



PythonRDD[26] at RDD at PythonRDD.scala:48

In [7]:
############# DECISION TREE - Classification #################


from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

model = DecisionTree.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=30, maxBins=32)

predictions = model.predict(test.map(lambda x: x.features))

labels_preds = test.map(lambda lp: lp.label).zip(predictions)

test_err = labels_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(test.count())
print("Decision Tree Classification Model Accuracy: %0.3f" % (1 - test_err))



Decision Tree Classification Model Accuracy: 0.640


In [37]:
############# DECISION TREE - Regression #################

model2 = DecisionTree.trainRegressor(train, categoricalFeaturesInfo={}, impurity='variance', maxDepth=30, maxBins=32)

preds2 = model2.predict(test.map(lambda x: x.features))

labels_preds2 = test.map(lambda lp: lp.label).zip(preds2)

mse = labels_preds2.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(test.count())

print("Decision Tree Regression Model Accuracy: %0.3f" % (1 - mse))


Decision Tree Regression Model Accuracy: 0.640


In [42]:
from pyspark.mllib.tree import RandomForest, RandomForestModel


model3 = RandomForest.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={}, numTrees=15, featureSubsetStrategy="auto", impurity="gini", maxDepth=5, maxBins=32)

preds3 = model3.predict(test.map(lambda x: x.features))

labels_preds3 = test.map(lambda lp: lp.label).zip(preds3)

mse2 = labels_preds3.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(test.count())

print("Random Forest Classification Model Accuracy: %0.3f" % (1 - mse2))

Random Forest Classification Model Accuracy: 0.658


In [8]:
##################### PCA ############################
# put the data (excluding labels) into vectors
features = imputed_df.rdd.map(lambda row: (Vectors.dense(row[1:]),))


# features.count()
#features.take(3)

vec_df = features.toDF(["features"])

vec_df.take(1)

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
#model = pca.fit(vec_df)

# java.lang.OutOfMemoryError: Java heap space


In [16]:
from numpy import array
from numpy import sqrt

from pyspark.mllib.clustering import KMeans, KMeansModel

parsed = imputed_df.rdd.map(lambda line: array(line[1:]))

clusters = KMeans.train(parsed, 2, maxIterations=10, initializationMode="random")

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsed.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print(WSSSE)


13308.8773496
