In [3]:
import numpy as np
import pandas as pd
import os
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import lit, col
from pyspark.sql import functions as F

In [1]:
import pyspark
myConf = pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# 1. Logistic Regression(로지스틱 회귀)

- 로지스틱 회귀는 발생할 결과 값이 이진인 경우의 분류에 적용합니다.
- 결과값이 이항분포이므로 GLM(Generalized Linear Model)의 한 종류이고, 이 경우 Logit 함수를 link function이라고 합니다.
- 로지스틱 회귀식은 입력 값을 받아서 0 ~ 1 사이의 확률을 반환합니다.
    - sigmoid 함수를 사용해서 0 ~ 1 사이의 확률 반화
- ex) 
    - 환자의 데이터로부터 병 유무
    - 환자의 데이터로부터 사망 또는 생존
    - 이메일이 스팸인지 아닌지
 

# 2. 예제 : 건강 지표를 통해 심장병 유무 판별하기 

- age - age in years
- sex - (1 = male; 0 = female)
- cp - chest pain type
- trestbps - resting blood pressure (in mm Hg on admission to the hospital)
- chol - serum cholestoral in mg/dl
- fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg - resting electrocardiographic results
- thalach - maximum heart rate achieved
- exang - exercise induced angina (1 = yes; 0 = no)
- oldpeak - ST depression induced by exercise relative to rest
- slope - the slope of the peak exercise ST segment
- ca - number of major vessels (0-3) colored by flourosopy
- thal - 3 = normal; 6 = fixed defect; 7 = reversable defect
- target - have disease or not (1=yes, 0=no)

## 2.0 데이터 불러오기

In [59]:
# csv 파일 불러오기
trainDf = spark.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true')\
    .load(os.path.join("../data/kaggle/heartattack"))

In [60]:
trainDf.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [61]:
# 형 변환 해주고, 건강 지표와 필요 없는 column drop
df = trainDf.withColumn("target1",trainDf['target'].cast("double")).drop('target') #target column double형으로 변경
df = df.drop('cp','thal','slope') # 건강지표와 필요 없는 column drop

In [62]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- ca: integer (nullable = true)
 |-- target1: double (nullable = true)



In [79]:
# train 데이터와 test 데이터로 나누기
train,test = df.randomSplit([0.7,0.3],seed=11) #데이터 반으로 랜덤하게 train, test 데이터로 나눔

In [22]:
#trainDf.printSchema()

In [23]:
#train.printSchema()

In [24]:
#test.printSchema()

In [80]:
train = train.withColumn('testOrtrain',lit('train')) #train data인것 알려주기 위해서 새로운 column 생성해서 train이라고 알려줌 
test = test.withColumn('testOrtrain',lit('test')) #test data인것 알려주기 위해서 새로운 column 생성해서 test라고 알려줌

In [81]:
df = (train.select('age','sex','trestbps','chol','fbs','restecg','thalach',
                   'exang','oldpeak','ca','target1','testOrtrain')
        .union(test.select('age','sex','trestbps','chol','fbs','restecg','thalach',
                           'exang','oldpeak','ca','target1','testOrtrain'))
     )
df.groupBy('testOrtrain').count().show()

+-----------+-----+
|testOrtrain|count|
+-----------+-----+
|      train|  219|
|       test|   84|
+-----------+-----+



In [82]:
df.show(5)

+---+---+--------+----+---+-------+-------+-----+-------+---+-------+-----------+
|age|sex|trestbps|chol|fbs|restecg|thalach|exang|oldpeak| ca|target1|testOrtrain|
+---+---+--------+----+---+-------+-------+-----+-------+---+-------+-----------+
| 29|  1|     130| 204|  0|      0|    202|    0|    0.0|  0|    1.0|      train|
| 34|  0|     118| 210|  0|      1|    192|    0|    0.7|  0|    1.0|      train|
| 34|  1|     118| 182|  0|      0|    174|    0|    0.0|  0|    1.0|      train|
| 35|  0|     138| 183|  0|      1|    182|    0|    1.4|  0|    1.0|      train|
| 35|  1|     120| 198|  0|      1|    130|    1|    1.6|  0|    0.0|      train|
+---+---+--------+----+---+-------+-------+-----+-------+---+-------+-----------+
only showing top 5 rows



In [83]:
va = VectorAssembler(inputCols=["age","sex","trestbps","chol","fbs","restecg","thalach",\
                                "exang","oldpeak","ca"], outputCol = "features") # feature 벡터 구성

In [84]:
pipeline = Pipeline(stages=[va]) #파이프라인 구성

In [85]:
model = pipeline.fit(df) #fit 이용해서 모델 만들기

In [86]:
myDf = model.transform(df)

In [93]:
myDf.show(5)

+---+---+--------+----+---+-------+-------+-----+-------+---+-------+-----------+--------------------+
|age|sex|trestbps|chol|fbs|restecg|thalach|exang|oldpeak| ca|target1|testOrtrain|            features|
+---+---+--------+----+---+-------+-------+-----+-------+---+-------+-----------+--------------------+
| 29|  1|     130| 204|  0|      0|    202|    0|    0.0|  0|    1.0|      train|(10,[0,1,2,3,6],[...|
| 34|  0|     118| 210|  0|      1|    192|    0|    0.7|  0|    1.0|      train|[34.0,0.0,118.0,2...|
| 34|  1|     118| 182|  0|      0|    174|    0|    0.0|  0|    1.0|      train|(10,[0,1,2,3,6],[...|
| 35|  0|     138| 183|  0|      1|    182|    0|    1.4|  0|    1.0|      train|[35.0,0.0,138.0,1...|
| 35|  1|     120| 198|  0|      1|    130|    1|    1.6|  0|    0.0|      train|[35.0,1.0,120.0,1...|
+---+---+--------+----+---+-------+-------+-----+-------+---+-------+-----------+--------------------+
only showing top 5 rows



In [94]:
train=myDf.filter(myDf['testOrtrain']=='train')
trainDf,validateDf = train.randomSplit([0.7,0.3],seed=11)

In [95]:
testDf=myDf.filter(myDf['testOrtrain']=='test')

## logisticRegression 모델링

In [88]:
lr = (LogisticRegression().
    setLabelCol('target1').
    setFeaturesCol('features').
    setRegParam(0.0).
    setMaxIter(100).
    setElasticNetParam(0.)
     )

In [89]:
lrModel=lr.fit(trainDf)

## 예측
- transform() 함수에 데이터 넣어주면 예측값 생성함

In [90]:
lrDf = lrModel.transform(validateDf)

In [91]:
lrDf.groupBy('prediction','target1').count().show()

+----------+-------+-----+
|prediction|target1|count|
+----------+-------+-----+
|       1.0|    1.0|   31|
|       0.0|    1.0|    4|
|       1.0|    0.0|   11|
|       0.0|    0.0|   21|
+----------+-------+-----+



In [92]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'prediction',labelCol='target1')
print(evaluator.evaluate(lrDf)*100 ,"%")

77.09821428571428 %


# 3. Titanic
- 타아타닉 사고에서 2224명의 승객 중 1502명이 사망했습니다.
- 사고 당시의 탑승객 위치, 상황 등을 분석하여 생존 여부를 예측 해 보도록 하겠습니다.
- 생존 했는지 안했는지 이진분류이기 때문에 logistic regression을 적용하도록 하겠습니다.

## 3.0 데이터 불러오기

### 3.0.1 Train 데이터 불러오기

In [49]:
# train.csv 파일 불러오기
trainDf = spark.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true')\
    .load(os.path.join("../data/kaggle/titanic/train.csv"))

In [50]:
trainDf.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [51]:
trainDf.show(3, truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 3 rows



### 3.0.2 Test 데이터 불러오기

In [52]:
# test.csv 파일 불러오기
testDf = spark.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true')\
    .load(os.path.join("../data/kaggle/titanic/test.csv"))

In [53]:
testDf.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [54]:
testDf.show(3, truncate=True)

+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|  Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0|330911|7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0|363272|   7.0| null|       S|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0|240276|9.6875| null|       Q|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
only showing top 3 rows



## 3.1 train, test 데이터 하나로 합치기

In [55]:
# 컬럼 testOrtrain 추가해서 train 데이터인지 test 데이터인지 구별해 주기
trainDf = trainDf.withColumn('testOrtrain',lit('train'))
testDf = testDf.withColumn('testOrtrain',lit('test'))

In [56]:
# testDf 에는 Survived 없으므로 임의로 99 추가해주기
testDf = testDf.withColumn('Survived',lit(99))

**`union`** 기능을 이용해서 trainDf 와 testDf를 하나로 합쳐주도록 하겠습니다.  
두 DataFrame의 column 수와 데이터type이 일치해야 union 할 수 있습니다.

In [57]:
df = trainDf.select('PassengerId','Survived','Pclass','Name','Sex','Age',\
                   'SibSp','Parch','Ticket','Fare','Cabin','Embarked','testOrtrain')\
            .union(testDf.select('PassengerId','Survived','Pclass','Name','Sex','Age',\
                   'SibSp','Parch','Ticket','Fare','Cabin','Embarked','testOrtrain'))

In [58]:
df = df.drop('Cabin')

In [59]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- testOrtrain: string (nullable = false)



### 3.1.1 합친 데이터에서 train 확인하기

In [60]:
df.select('testOrtrain','Survived','Name')\
    .filter(df['testOrtrain']=='train').show(10)

+-----------+--------+--------------------+
|testOrtrain|Survived|                Name|
+-----------+--------+--------------------+
|      train|       0|Braund, Mr. Owen ...|
|      train|       1|Cumings, Mrs. Joh...|
|      train|       1|Heikkinen, Miss. ...|
|      train|       1|Futrelle, Mrs. Ja...|
|      train|       0|Allen, Mr. Willia...|
|      train|       0|    Moran, Mr. James|
|      train|       0|McCarthy, Mr. Tim...|
|      train|       0|Palsson, Master. ...|
|      train|       1|Johnson, Mrs. Osc...|
|      train|       1|Nasser, Mrs. Nich...|
+-----------+--------+--------------------+
only showing top 10 rows



### 3.1.2 합친 데이터에서 test 확인하기

In [61]:
df.select('testOrtrain','Survived','Name')\
    .filter(df['testOrtrain']=='test').show(10)

+-----------+--------+--------------------+
|testOrtrain|Survived|                Name|
+-----------+--------+--------------------+
|       test|      99|    Kelly, Mr. James|
|       test|      99|Wilkes, Mrs. Jame...|
|       test|      99|Myles, Mr. Thomas...|
|       test|      99|    Wirz, Mr. Albert|
|       test|      99|Hirvonen, Mrs. Al...|
|       test|      99|Svensson, Mr. Joh...|
|       test|      99|Connolly, Miss. Kate|
|       test|      99|Caldwell, Mr. Alb...|
|       test|      99|Abrahim, Mrs. Jos...|
|       test|      99|Davies, Mr. John ...|
+-----------+--------+--------------------+
only showing top 10 rows



## 3.2 결측치 처리하기
- 결측값들은 보통 결측으로 제외하거나 평균으로 대체 합니다.

### 3.2.1 결측치 처리

In [62]:
from pyspark.sql.functions import count
df.agg(*[count(c).alias(c) for c in df.columns]).show()

+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+
|PassengerId|Survived|Pclass|Name| Sex| Age|SibSp|Parch|Ticket|Fare|Embarked|testOrtrain|
+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+
|       1309|    1309|  1309|1309|1309|1046| 1309| 1309|  1309|1308|    1307|       1309|
+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+



In [63]:
def countNull(df,var):
    return df.where(df[var].isNull()).count()

missing = {c: countNull(df,c) for c in ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Embarked','testOrtrain']}
print (missing)

{'PassengerId': 0, 'Survived': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 263, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 1, 'Embarked': 2, 'testOrtrain': 0}


In [64]:
avgAge = df.agg(F.avg(df['Age']).alias('meanAge')).collect()
avgFare = df.agg(F.avg(df['Fare']).alias('meanFare')).collect()
#avgEmbarked = df.agg(F.avg(df['Embarked']).alias('meanEmbarked')).collect()

In [65]:
from pyspark.sql.functions import when,isnull
df=df.withColumn("Age", when(isnull(df['Age']), avgAge[0]['meanAge']).otherwise(df.Age))
df=df.withColumn("Fare", when(isnull(df['Fare']), avgFare[0]['meanFare']).otherwise(df.Fare))
#df=df.withColumn("Embarked", when(isnull(df['Embarked']), avgEmbarked[0]['meanEmbarked']).otherwise(df.Embarked))

In [66]:
from pyspark.sql.functions import count
df.agg(*[count(c).alias(c) for c in df.columns]).show()

+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+
|PassengerId|Survived|Pclass|Name| Sex| Age|SibSp|Parch|Ticket|Fare|Embarked|testOrtrain|
+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+
|       1309|    1309|  1309|1309|1309|1309| 1309| 1309|  1309|1309|    1307|       1309|
+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+



In [67]:
def countNull(df,var):
    return df.where(df[var].isNull()).count()

missing = {c: countNull(df,c) for c in ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Embarked','testOrtrain']}
print (missing)

{'PassengerId': 0, 'Survived': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 0, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 0, 'Embarked': 2, 'testOrtrain': 0}


In [70]:
# Embarked 결측치 있는 행 제거
df = df.na.drop('any')

In [71]:
def countNull(df,var):
    return df.where(df[var].isNull()).count()

missing = {c: countNull(df,c) for c in ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Embarked','testOrtrain']}
print (missing)

{'PassengerId': 0, 'Survived': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 0, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 0, 'Embarked': 0, 'testOrtrain': 0}


In [72]:
from pyspark.sql.functions import count
df.agg(*[count(c).alias(c) for c in df.columns]).show()

+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+
|PassengerId|Survived|Pclass|Name| Sex| Age|SibSp|Parch|Ticket|Fare|Embarked|testOrtrain|
+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+
|       1307|    1307|  1307|1307|1307|1307| 1307| 1307|  1307|1307|    1307|       1307|
+-----------+--------+------+----+----+----+-----+-----+------+----+--------+-----------+



## 3.3 label, features 구성
- 현재 Survived column이 integer로 정의되어 있으니 double 형으로 우선 바꿔 주도록 하겠습니다.

In [73]:
df = (df.withColumn("Survive",trainDf['Survived']
     .cast("double"))
      .drop("Survived")
     )

In [74]:
df.groupBy('Survive').count().show()

+-------+-----+
|Survive|count|
+-------+-----+
|    0.0|  549|
|    1.0|  340|
|   99.0|  418|
+-------+-----+



string인 `Sex`와 `Embarked`를 StringIndexer로 변환해 주도록 하겠습니다.

In [75]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
#from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

SexIndexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
EmbarkedIndexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkedIndex")

이후 `feature` column을 생성해 주도록 하겠습니다. 

In [76]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- testOrtrain: string (nullable = false)
 |-- Survive: double (nullable = true)



In [77]:
va = VectorAssembler(inputCols=["Pclass","SexIndex","Age","SibSp","Parch",\
                                "Fare","EmbarkedIndex"],\
                     outputCol="features")

`pipeline`으로 구성해서 `fit()`을 실행하도록 하겠습니다.

In [78]:
pipeline = Pipeline(stages=[SexIndexer,EmbarkedIndexer,va])
model = pipeline.fit(df)

## 3.4 train & test

In [79]:
myDf = model.transform(df)
myDf.count()

1307

In [80]:
myDf.select('Survive','features').show(10)

+-------+--------------------+
|Survive|            features|
+-------+--------------------+
|    0.0|[3.0,0.0,22.0,1.0...|
|    1.0|[1.0,1.0,38.0,1.0...|
|    1.0|[3.0,1.0,26.0,0.0...|
|    1.0|[1.0,1.0,35.0,1.0...|
|    0.0|(7,[0,2,5],[3.0,3...|
|    0.0|[3.0,0.0,29.88113...|
|    0.0|(7,[0,2,5],[1.0,5...|
|    0.0|[3.0,0.0,2.0,3.0,...|
|    1.0|[3.0,1.0,27.0,0.0...|
|    1.0|[2.0,1.0,14.0,1.0...|
+-------+--------------------+
only showing top 10 rows



In [81]:
trainingDf = myDf.filter(myDf['testOrtrain']=='train')
testDf = myDf.filter(myDf['testOrtrain']=='test')
print(trainDf.count())
print(testDf.count())

891
418


In [82]:
trainDf,validateDf = trainingDf.randomSplit([0.7,0.3],seed=11)

In [84]:
print(trainDf.count())
print(validateDf.count())

628
261


In [85]:
trainDf.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- testOrtrain: string (nullable = false)
 |-- Survive: double (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- features: vector (nullable = true)



In [86]:
validateDf.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- testOrtrain: string (nullable = false)
 |-- Survive: double (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- features: vector (nullable = true)



## 3.5 LogisticRegression 모델링

In [87]:
trainDf.groupBy('Survive').count().show()

+-------+-----+
|Survive|count|
+-------+-----+
|    0.0|  385|
|    1.0|  243|
+-------+-----+



In [88]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression().\
    setLabelCol('Survive').\
    setFeaturesCol('features').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)

In [89]:
lrModel=lr.fit(trainDf)

## 3.6 예측

In [90]:
lrDf = lrModel.transform(validateDf)

In [91]:
lrDf.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- testOrtrain: string (nullable = false)
 |-- Survive: double (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [93]:
lrDf.select('Survive','rawPrediction','probability','prediction').show()

+-------+--------------------+--------------------+----------+
|Survive|       rawPrediction|         probability|prediction|
+-------+--------------------+--------------------+----------+
|    0.0|[1.75048837387783...|[0.85201438959993...|       0.0|
|    0.0|[0.68880110544961...|[0.66570017364378...|       0.0|
|    1.0|[-1.6072526323220...|[0.16697039891915...|       1.0|
|    0.0|[1.85316538795041...|[0.86449832760428...|       0.0|
|    0.0|[3.06289988462698...|[0.95533619549127...|       0.0|
|    0.0|[-0.9069728065691...|[0.28761969386244...|       1.0|
|    0.0|[2.32973077312161...|[0.91130957904571...|       0.0|
|    1.0|[1.13927934115816...|[0.75754730060353...|       0.0|
|    1.0|[1.27143083992238...|[0.78098758571895...|       0.0|
|    0.0|[-0.3564542334155...|[0.41181816621951...|       1.0|
|    0.0|[2.17107581221118...|[0.89762187260460...|       0.0|
|    1.0|[-0.8142202518985...|[0.30699191545836...|       1.0|
|    0.0|[-0.1466323274074...|[0.46340745946973...|    

## 3.7 평가

In [96]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'prediction',labelCol='Survive')
print(evaluator.evaluate(lrDf)*100 ,"%")

79.89062107115916 %
