# 실습하기

## 데이터 준비

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")

In [43]:
print(df.describe(include='all'))
print(df.info())

        PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      NaN   NaN   

               Age       SibSp       Parch  Ticket        Fare 

## 데이터 전처리

### 결측치 처리

In [44]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [45]:
df['Age'].fillna(df['Age'].mean(),inplace=True)
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)


In [46]:
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)


### 레이블 인코딩

In [47]:
print(df['Sex'].unique())

['male' 'female']


In [49]:
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

### 파생변수 생성

In [50]:
df['FamilySize'] = df['SibSp'] + df['Parch']

In [51]:
print(df.describe(include='all'))

        PassengerId    Survived      Pclass                     Name  \
count    891.000000  891.000000  891.000000                      891   
unique          NaN         NaN         NaN                      891   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris   
freq            NaN         NaN         NaN                        1   
mean     446.000000    0.383838    2.308642                      NaN   
std      257.353842    0.486592    0.836071                      NaN   
min        1.000000    0.000000    1.000000                      NaN   
25%      223.500000    0.000000    2.000000                      NaN   
50%      446.000000    0.000000    3.000000                      NaN   
75%      668.500000    1.000000    3.000000                      NaN   
max      891.000000    1.000000    3.000000                      NaN   

               Sex         Age       SibSp       Parch  Ticket        Fare  \
count   891.000000  891.000000  891.000000  891.000000   

## 분석 데이터셋 준비

In [52]:
x = df[['Pclass','Sex','Age','Fare','Embarked','FamilySize']]
y = df['Survived']

In [53]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=11)

## 분석 진행

In [54]:
dt = DecisionTreeClassifier(random_state=11)
dt.fit(x_train,y_train)
pred = dt.predict(x_test)

## 모델 평가

In [55]:
acc = accuracy_score(y_test,pred)
print(acc)
cm = confusion_matrix(y_test,pred)
print(cm)
cr = classification_report(y_test,pred)
print(cr)

0.7877094972067039
[[98 20]
 [18 43]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       118
           1       0.68      0.70      0.69        61

    accuracy                           0.79       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



# 연습하기

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 

# 데이터 준비
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")

# 데이터 전처리
## 결측치 처리
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)

## 라벨인코딩
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

## 파생변수 생성
df['FamilySize'] = df['SibSp'] + df['Parch']

## 데이터 확인
print(df.describe(include='all'))

# 학습, 테스트 데이터 셋 분류
x = df[['Pclass','Sex','Age','Fare','Embarked','FamilySize']]
y = df['Survived']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

# 분석 진행
dt = DecisionTreeClassifier(random_state=11)
dt.fit(x_train,y_train)
pred = dt.predict(x_test)

# 모델 평가
acc = accuracy_score(pred,y_test)
print(acc)
cm = confusion_matrix(pred,y_test)
print(cm)
cr = classification_report(pred,y_test)
print(cr)

        PassengerId    Survived      Pclass                     Name  \
count    891.000000  891.000000  891.000000                      891   
unique          NaN         NaN         NaN                      891   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris   
freq            NaN         NaN         NaN                        1   
mean     446.000000    0.383838    2.308642                      NaN   
std      257.353842    0.486592    0.836071                      NaN   
min        1.000000    0.000000    1.000000                      NaN   
25%      223.500000    0.000000    2.000000                      NaN   
50%      446.000000    0.000000    3.000000                      NaN   
75%      668.500000    1.000000    3.000000                      NaN   
max      891.000000    1.000000    3.000000                      NaN   

               Sex         Age       SibSp       Parch  Ticket        Fare  \
count   891.000000  891.000000  891.000000  891.000000   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
