# ▒ 로지스틱분석 ▒

## 0. 환경설정

In [29]:
import os
import numpy as np
import pandas as pd
import requests
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

## 1. 데이터 준비

### 데이터 설명 (Titanic dataset)

- 타이타닉호에 탔던 891명 승객들의 정보
- 승객들의 정보로 사망 여부를 예측하는 모형을 구축하자.
- Survived: 생존 여부 (0: 사망, 1: 생존)

### 데이터 호출

In [9]:
# Load the Titanic passenger table from the local data directory, print its
# (rows, columns) shape, and preview the first five rows.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, which
# suggests the file carries a saved index -- confirm whether index_col=0
# should be passed to read_csv.
data_file = "./data/titanic.txt"
titanic = pd.read_csv(filepath_or_buffer=data_file)
print(titanic.shape)
titanic.head(n=5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 데이터 전처리

- 성별, 나이, 좌석 클래스로 사망 여부를 예측하자. 
- 성별에서 남성은 0, 여성은 1로 변환
- 나이에서 결측치는 평균값으로 대체
- 좌석 클래스에 대한 가변수 생성 (1,2,3등급이므로 2개의 가변수면 충분)

In [10]:
# Encode the three predictors used by the model: Sex, Age and passenger class.

# Sex -> binary indicator (female = 1, male = 0).
# NOTE: Series.map turns every value that is not a key of the dict into NaN,
# so re-running this cell on the already-encoded column would blank it out.
titanic['Sex'] = titanic['Sex'].map({'female': 1, 'male': 0})

# Replace missing ages with the mean age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on a column selection: that
# chained in-place form raises a warning in pandas 2.x and does not modify
# the DataFrame once Copy-on-Write is enabled.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

# Two dummy variables are enough for three classes; third class is the
# baseline (both dummies equal to 0). Vectorized comparison replaces the
# row-by-row apply(lambda ...).
titanic['FirstClass'] = (titanic['Pclass'] == 1).astype('int64')
titanic['SecondClass'] = (titanic['Pclass'] == 2).astype('int64')

titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0


- 종속변수와 독립변수 구분하기
- 학습 데이터와 평가 데이터 나누기

In [11]:
# Predictors (X): encoded sex, age, and the two passenger-class dummies.
x_titanic = titanic.loc[:, ['Sex', 'Age', 'FirstClass', 'SecondClass']]
# Response (y): the Survived column.
y_titanic = titanic.loc[:, 'Survived']

In [14]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
(
    train_x_titanic,
    test_x_titanic,
    train_y_titanic,
    test_y_titanic,
) = train_test_split(x_titanic, y_titanic, test_size=0.3, random_state=123)

# Preview the first rows of each predictor subset.
print(train_x_titanic.head())
print(test_x_titanic.head())

     Sex   Age  FirstClass  SecondClass
416    1  34.0           0            1
801    1  31.0           0            1
512    0  36.0           1            0
455    0  29.0           0            0
757    0  18.0           0            1
     Sex        Age  FirstClass  SecondClass
172    1   1.000000           0            0
524    0  29.699118           0            0
452    0  30.000000           1            0
170    0  61.000000           1            0
620    0  27.000000           0            0


## 1. 로지스틱 모형 적합하기
### 학습 데이터를 이용해서 로지스틱 모형을 적합하자.

In [17]:
# Fit a logistic regression without regularization on the training data.
# penalty=None replaces the string 'none', which was deprecated in
# scikit-learn 1.2 and is rejected from 1.4 onwards (this spelling therefore
# requires scikit-learn >= 1.2).
logistic = LogisticRegression(penalty=None)
logistic.fit(train_x_titanic, train_y_titanic)

LogisticRegression(penalty='none')

In [21]:
# Show the fitted parameters: first the intercept, then the coefficient row.
for fitted_values in (logistic.intercept_, logistic.coef_):
    print(fitted_values)

[-1.08777172]
[[ 2.55838407 -0.04004487  2.38223532  1.05157353]]


## 2. 적합된 로지스틱 모형 성능 확인하기
### 예측 정확도 살펴보기

In [32]:
# Score the fitted model on the training rows and on the held-out rows,
# printing the training score first.
train_accuracy = logistic.score(train_x_titanic, train_y_titanic)
test_accuracy = logistic.score(test_x_titanic, test_y_titanic)
print(train_accuracy)
print(test_accuracy)

# Predicted class label for every held-out passenger (reused by later cells).
test_y_prediction = logistic.predict(test_x_titanic)
print(test_y_prediction)

0.7865168539325843
0.7835820895522388
[1 0 1 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 1 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 1 0 1 0
 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0
 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1 1 0 1 0 1 0 0 1 0 0 0
 0 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0
 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 1
 1 0 1 0 0 1 1 0 1]


In [26]:
# Put the first five held-out rows next to their predicted labels so the
# predictions can be checked by eye.
first_test_rows = test_x_titanic.head()
print(first_test_rows)
first_predictions = logistic.predict(test_x_titanic)[:5]
print(first_predictions)

     Sex        Age  FirstClass  SecondClass
172    1   1.000000           0            0
524    0  29.699118           0            0
452    0  30.000000           1            0
170    0  61.000000           1            0
620    0  27.000000           0            0
[1 0 1 0 0]


### 오차행렬 구하기 (Confusion Matrix)

In [35]:
# Sum of the actual labels vs. sum of the predicted labels in the test set
# (with 0/1 labels these are the counts of ones).
actual_positive_total = test_y_titanic.sum()
predicted_positive_total = test_y_prediction.sum()
print(actual_positive_total)
print(predicted_positive_total)

# Confusion matrix of true labels against predicted labels.
confusion_matrix(y_true=test_y_titanic, y_pred=test_y_prediction)

98
108


array([[136,  34],
       [ 24,  74]])