In [2]:
from pyspark.sql import  SQLContext 
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression # 로지스틱 회귀분석
from pyspark.ml.feature import VectorAssembler # 특성 데이터를 하나로 묶어줌
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
# Plot styling: use a font that contains Hangul glyphs and keep the minus sign
# rendering as a plain ASCII hyphen so it does not show up as a missing glyph.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [None]:
# 로지스틱(분류) : 0, 1

In [3]:
# Start a Spark context on the 'local' master and wrap it in an SQLContext,
# which is the entry point used below for reading DataFrames.
sc = SparkContext(master='local')
sqlctx = SQLContext(sparkContext=sc)

In [5]:
# Load the Titanic CSV: the first row is treated as the header and column
# types are inferred from the data.
titanic_df = (
    sqlctx.read
    .options(header=True, inferSchema=True)
    .csv('../data1/titanic1.csv')
)
titanic_df.show()

+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|
+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
|       0|     3|22.0|    1|    0|   7.25|          1|    0|      0.0|           0.0|          0.0|
|       1|     1|38.0|    1|    0|71.2833|          1|    0|      1.0|           1.0|          2.0|
|       1|     3|26.0|    0|    0|  7.925|          0|    1|      1.0|           0.0|          1.0|
|       1|     1|35.0|    1|    0|   53.1|          1|    0|      1.0|           0.0|          2.0|
|       0|     3|35.0|    0|    0|   8.05|          0|    1|      0.0|           0.0|          0.0|
|       0|     3|33.0|    0|    0| 8.4583|          0|    1|      0.0|           2.0|          0.0|
|       0|     1|54.0|    0|    0|51.8625|          0|    1|      0.0|           0.0|          0.0|


In [6]:
# Pack the columns the model learns from into a single vector column
# named 'features' (appended to the existing columns).
f = VectorAssembler(
    outputCol='features',
    inputCols=['Pclass', 'Age', 'Sex_index', 'Family_Size'],
)
v_df = f.transform(titanic_df)
v_df.show()

+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+------------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|          features|
+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+------------------+
|       0|     3|22.0|    1|    0|   7.25|          1|    0|      0.0|           0.0|          0.0|[3.0,22.0,0.0,1.0]|
|       1|     1|38.0|    1|    0|71.2833|          1|    0|      1.0|           1.0|          2.0|[1.0,38.0,1.0,1.0]|
|       1|     3|26.0|    0|    0|  7.925|          0|    1|      1.0|           0.0|          1.0|[3.0,26.0,1.0,0.0]|
|       1|     1|35.0|    1|    0|   53.1|          1|    0|      1.0|           0.0|          2.0|[1.0,35.0,1.0,1.0]|
|       0|     3|35.0|    0|    0|   8.05|          0|    1|      0.0|           0.0|          0.0|[3.0,35.0,0.0,0.0]|
|       0|     3|33.0|    0|    0| 8.4583|      

In [7]:
# Total number of rows in the assembled DataFrame (before the train/test split).
v_df.count()

891

In [8]:
# Split the rows into roughly 80% training / 20% test data.
# randomSplit assigns rows randomly, so the resulting sizes are only approximately 80/20.
# A fixed seed keeps the split (and every metric computed from it) repeatable
# when the notebook is re-run from a fresh kernel.
train_df, test_df = v_df.randomSplit([0.8, 0.2], seed=42)

In [9]:
# Number of rows that landed in the training split.
train_df.count()

706

In [10]:
# Number of rows that landed in the test split.
test_df.count()

185

In [11]:
# Binary classification: the label column Survived takes the values 0 and 1.
# The estimator reads its inputs from the assembled 'features' vector column.
lr = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [15]:
# Fit the logistic-regression estimator on the training split; the fitted
# model is bound to lrModel and used by the cells below.
lrModel = lr.fit(train_df)

In [16]:
# Show the fitted parameters: one coefficient (slope, w) per assembled feature
# and the intercept (b). The fitted model is bound to `lrModel` by lr.fit(),
# so that is the name that must be read here.
print("기울기(w) :", lrModel.coefficients)
print("절편(b) :", lrModel.intercept)

기울기(w) : [-1.213693404703422,-0.04309020254859634,2.7539639434718857,-0.2343400946295363]
절편(b) : 2.6228201827472293


In [17]:
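# z = w1 * x1 + w2 * x2 + w3 * x3 + w4 * x4 + b   (linear score, i.e. the log-odds)
# y = 1 / (1 + exp(-z))                           (logistic function: probability that Survived = 1)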

In [18]:
# transform() evaluates w1 * x1 + w2 * x2 + w3 * x3 + w4 * x4 + b for each row, derives the
# class probabilities (probability) from it, and predicts (prediction) the class with the higher probability.

lr_predict = lrModel.transform(test_df)
lr_predict.show()

+--------+------+----+-----+-----+--------+-----------+-----+---------+--------------+-------------+------------------+--------------------+--------------------+----------+
|Survived|Pclass| Age|SibSp|Parch|    Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|          features|       rawPrediction|         probability|prediction|
+--------+------+----+-----+-----+--------+-----------+-----+---------+--------------+-------------+------------------+--------------------+--------------------+----------+
|       0|     1|19.0|    3|    2|   263.0|          5|    0|      0.0|           0.0|          0.0|[1.0,19.0,0.0,5.0]|[0.58128754352720...|[0.64136361624186...|       0.0|
|       0|     1|22.0|    0|    0|135.6333|          0|    1|      0.0|           1.0|          0.0|[1.0,22.0,0.0,0.0]|[-0.4611423219746...|[0.38671486834573...|       1.0|
|       0|     1|24.0|    0|    0|    79.2|          0|    1|      0.0|           1.0|          0.0|[1.0,24.0,0.0,0.0]|[-0.374961916877

In [20]:
# Sanity check.
# probability holds two values per row: the probability of class 0 and the probability of class 1, in that order.
# NOTE(review): toPandas() collects every selected row to the driver; acceptable for this small test split.

lr_predict.select('probability').toPandas()

Unnamed: 0,probability
0,"[0.6413636162418692, 0.35863638375813084]"
1,"[0.3867148683457321, 0.6132851316542679]"
2,"[0.40734259383546695, 0.592657406164533]"
3,"[0.08449253166745258, 0.9155074683325475]"
4,"[0.5032124323119981, 0.4967875676880019]"
...,...
180,"[0.916609451883987, 0.08339054811601292]"
181,"[0.9198445427257468, 0.08015545727425323]"
182,"[0.454016151511508, 0.545983848488492]"
183,"[0.454016151511508, 0.545983848488492]"


In [19]:
# Show only the model inputs next to the predicted probabilities and class,
# so each prediction can be read against the features that produced it.
(
    lr_predict
    .select(
        'Pclass',
        'Age',
        'Sex_index',
        'Family_Size',
        'probability',
        'prediction',
    )
    .show()
)

+------+----+---------+-----------+--------------------+----------+
|Pclass| Age|Sex_index|Family_Size|         probability|prediction|
+------+----+---------+-----------+--------------------+----------+
|     1|19.0|      0.0|          5|[0.64136361624186...|       0.0|
|     1|22.0|      0.0|          0|[0.38671486834573...|       1.0|
|     1|24.0|      0.0|          0|[0.40734259383546...|       1.0|
|     1|25.0|      1.0|          3|[0.08449253166745...|       1.0|
|     1|33.0|      0.0|          0|[0.50321243231199...|       0.0|
|     1|33.0|      0.0|          0|[0.50321243231199...|       0.0|
|     1|33.0|      0.0|          0|[0.50321243231199...|       0.0|
|     1|33.0|      0.0|          0|[0.50321243231199...|       0.0|
|     1|33.0|      0.0|          0|[0.50321243231199...|       0.0|
|     1|36.0|      0.0|          0|[0.53547044513360...|       0.0|
|     1|37.0|      0.0|          1|[0.60337575785964...|       0.0|
|     1|38.0|      0.0|          0|[0.5568286120