In [2]:
from sklearn.linear_model import LinearRegression  # 선형회귀
from sklearn.preprocessing import PolynomialFeatures # 다항특성을 만들어주는 라이브러리
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib
# Global matplotlib settings applied to every figure in this notebook.
# macOS users: set 'font.family' to 'AppleGothic' instead of 'Malgun Gothic'.
matplotlib.rcParams.update({
    'axes.unicode_minus': False,     # work around broken minus-sign rendering
    'font.family': 'Malgun Gothic',  # Korean font setting (Windows users)
    'font.size': '10',               # base font size
})

요구사항 분석

##### 데이터 피처 설명
- pclass : Passenger Class, 승객 등급
- survived : 생존 여부 : target 값이 됨.
- name : 승객 이름
- sex : 승객 성별
- age : 승객 나이
- sibsp : 함께 탑승한 형제자매/배우자 수
- parch : 함께 탑승한 부모/자녀 수
- ticket : 티켓 번호
- fare : 승객 지불 요금
- cabin : 선실 이름
- embarked : 승선항 (C = 셰르부르(Cherbourg), Q = 퀸스타운(Queenstown), S = 사우샘프턴(Southampton))
- body : 사망자 시신 확인 번호 - 분석 과정에서 제외해야 함. 사망자에게만 값이 기록되므로 target(survived)을 그대로 드러내는 데이터 누수(leakage)가 되어 머신러닝 결과를 사실상 100% 결정해 버림.
- home.dest : 고향/목적지

In [64]:
# Task: build a classification model that predicts Titanic survivors.
# target : survived
# data   : every other column
#
# Plan:
#   1. Inspect the data: NaN, zeros, dtypes, ...
#   2. Preprocess: handle NaN / zeros.
#   3. Prepare the train / test sets.
#   4. Drop unnecessary features and keep a record of which ones were dropped.
#      Candidates: 'name', 'ticket', 'body', 'cabin', 'home.dest'
#      (but try making use of name and cabin).
#   5. Fit each model.
#   6. Check the accuracy.
#
# Classifiers to try: logistic regression, decision tree, random forest.

df_train = pd.read_csv('titanic_train.csv')
df_test = pd.read_csv('titanic_test.csv')

# Only the last expression of a cell is rendered automatically, so the train
# preview has to be displayed explicitly; otherwise its result is discarded.
display(df_train.head(2))
df_test.head(2)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,3,0,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S,,
1,2,1,"Phillips, Miss. Alice Frances Louisa",female,21.0,0,1,S.O./P.P. 2,21.0,,S,,"Ilfracombe, Devon"


In [5]:
# Print each column's dtype and non-null count to see which columns have missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     916 non-null    int64  
 1   survived   916 non-null    int64  
 2   name       916 non-null    object 
 3   sex        916 non-null    object 
 4   age        741 non-null    float64
 5   sibsp      916 non-null    int64  
 6   parch      916 non-null    int64  
 7   ticket     916 non-null    object 
 8   fare       916 non-null    float64
 9   cabin      214 non-null    object 
 10  embarked   914 non-null    object 
 11  body       85 non-null     float64
 12  home.dest  527 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 93.2+ KB


In [8]:
# Peek at the first ten raw passenger names to see how they are formatted.
df_train['name'].head(10)

0    Mellinger, Miss. Madeleine Violet
1                    Wells, Miss. Joan
2       Duran y More, Miss. Florentina
3                   Scanlan, Mr. James
4         Bradley, Miss. Bridget Delia
5                 Linehan, Mr. Michael
6       Francatelli, Miss. Laura Mabel
7             Quick, Miss. Phyllis May
8             Thayer, Mr. John Borland
9     Silverthorne, Mr. Spencer Victor
Name: name, dtype: object

In [65]:
# Names look like "Surname, Title. Given names". Keep the part after the comma
# (title + given names). Splitting on ',' leaves a leading space on that part,
# so strip it to avoid values such as " Miss. Joan".
df_sp = df_train['name'].str.split(',').str[1].str.strip()
df_sp.head(2)

0     Miss. Madeleine Violet
1                 Miss. Joan
Name: name, dtype: object

In [67]:
# Append the split-off name part (df_sp) as an extra column, side by side.
# NOTE(review): df_train2 is not defined in any cell shown above this one and the
# execution counts are out of order; confirm where it is created so the notebook
# survives Restart & Run All.
# NOTE(review): df_sp is still named 'name', so the rendered result shows a second
# name column (exported as 'name.1'); confirm whether it should be renamed.
test = pd.concat(objs=[df_train2, df_sp], axis='columns')
test

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest,name.1
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.00,0,1,250644,19.5000,,S,,"England / Bennington, VT",Miss. Madeleine Violet
1,2,1,"Wells, Miss. Joan",female,4.00,1,1,29103,23.0000,,S,,"Cornwall / Akron, OH",Miss. Joan
2,2,1,"Duran y More, Miss. Florentina",female,30.00,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba",Miss. Florentina
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.7250,,Q,,,Mr. James
4,3,1,"Bradley, Miss. Bridget Delia",female,22.00,0,0,334914,7.7250,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls...",Miss. Bridget Delia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,3,1,"Dean, Miss. Elizabeth Gladys ""Millvina""",female,0.17,1,2,C.A. 2315,20.5750,,S,,"Devon, England Wichita, KS","Miss. Elizabeth Gladys ""Millvina"""
912,3,0,"Guest, Mr. Robert",male,,0,0,376563,8.0500,,S,,,Mr. Robert
913,3,1,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,,,Miss. Julia
914,2,1,"Sincock, Miss. Maude",female,20.00,0,0,C.A. 33112,36.7500,,S,,"Cornwall / Hancock, MI",Miss. Maude
