In [1]:
import numpy as nop
import pandas as pd
import os
from pyspark.sql.functions import lit, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

- age - age in years
- sex - (1 = male; 0 = female)
- cp - chest pain type
- trestbps - resting blood pressure (in mm Hg on admission to the hospital)
- chol - serum cholestoral in mg/dl
- fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg - resting electrocardiographic results
- thalach - maximum heart rate achieved
- exang - exercise induced angina (1 = yes; 0 = no)
- oldpeak - ST depression induced by exercise relative to rest
- slope - the slope of the peak exercise ST segment
- ca - number of major vessels (0-3) colored by flourosopy
- thal - 3 = normal; 6 = fixed defect; 7 = reversable defect
- target - have disease or not (1=yes, 0=no)

In [2]:
_trainDf = spark.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true')\
    .load(os.path.join("data","kaggle","heartattack","heart.csv"))
df = _trainDf
df = df.withColumn("target1",_trainDf['target'].cast("double")).drop('target') #target column double형으로 변경
df = df.drop('cp','thal','slope') # 건강지표와 필요 없는 column drop
train,test = df.randomSplit([0.5,0.5],seed=11) #데이터 반으로 랜덤하게 train, test 데이터로 나눔

In [3]:
_trainDf.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [6]:
train.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- ca: integer (nullable = true)
 |-- target1: double (nullable = true)
 |-- testOrtrain: string (nullable = false)



In [10]:
train = train.withColumn('testOrtrain',lit('train')) #train data인것 알려주기 위해서 새로운 column 생성해서 train이라고 알려줌 
test = test.withColumn('testOrtrain',lit('test')) #test data인것 알려주기 위해서 새로운 column 생성해서 test라고 알려줌

In [12]:
df = train.select('age','sex','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','ca','target1','testOrtrain')\
        .union(test.select('age','sex','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','ca','target1','testOrtrain'))
df.groupBy('testOrtrain').count().show()

+-----------+-----+
|testOrtrain|count|
+-----------+-----+
|      train|  153|
|       test|  150|
+-----------+-----+



In [13]:
va = VectorAssembler(inputCols=["age","sex","trestbps","chol","fbs","restecg","thalach",\
                                "exang","oldpeak","ca"], outputCol = "features") # feature 벡터 구성
pipeline = Pipeline(stages=[va]) #파이프라인 구성
model = pipeline.fit(df) #fit 이용해서 모델 만들기
myDf = model.transform(df)

In [14]:
train=myDf.filter(myDf['testOrtrain']=='train')
testDf=myDf.filter(myDf['testOrtrain']=='test')
trainDf,validateDf = train.randomSplit([0.7,0.3],seed=11)

In [15]:
lr = LogisticRegression().\
    setLabelCol('target1').\
    setFeaturesCol('features').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)

In [16]:
lrModel=lr.fit(trainDf)

In [17]:
lrDf = lrModel.transform(validateDf)

In [18]:
lrDf.groupBy('prediction','target1').count().show()

+----------+-------+-----+
|prediction|target1|count|
+----------+-------+-----+
|       1.0|    1.0|   20|
|       0.0|    1.0|    2|
|       1.0|    0.0|    8|
|       0.0|    0.0|   12|
+----------+-------+-----+



In [19]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'prediction',labelCol='target1')
print(evaluator.evaluate(lrDf)*100 ,"%")

75.45454545454547 %
