# 실습하기

## 데이터 준비

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")
print(df.describe(include='all'))
print(df.info())

        PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      NaN   NaN   

               Age       SibSp       Parch  Ticket        Fare 

## 데이터 전처리

### 결측치 처리

In [2]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [3]:
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)


### 원핫인코딩

In [4]:
onehotSex = pd.get_dummies(df['Sex'])
onehotEmbarked = pd.get_dummies(df['Embarked'])
df = pd.concat([df,onehotSex,onehotEmbarked],axis=1)

### 파생변수 생성

In [5]:
df['FamilySize'] = df['SibSp']+df['Parch']

### 데이터 확인

In [6]:
print(df.describe(include='all'))

        PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      NaN   NaN   

               Age       SibSp       Parch  Ticket        Fare 

## 분석 데이터 셋 준비

In [7]:
x = df[['Pclass','Age','Fare','FamilySize','female','male','C','Q','S']]
y = df['Survived']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

## 분석 진행

In [9]:
rfc = RandomForestClassifier(n_estimators = 50, max_depth = 3, random_state = 20)
rfc.fit(x_train,y_train)
pred = rfc.predict(x_test)

## 모델 평가

In [10]:
acc = accuracy_score(y_test, pred)
print(acc)
cm = confusion_matrix(y_test, pred)
print(cm)
cr = classification_report(y_test, pred)
print(cr)

0.8659217877094972
[[110   8]
 [ 16  45]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       118
           1       0.85      0.74      0.79        61

    accuracy                           0.87       179
   macro avg       0.86      0.83      0.85       179
weighted avg       0.86      0.87      0.86       179



# 연습하기

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")

df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)

onehotSex = pd.get_dummies(df['Sex'])
onehotEmbarked = pd.get_dummies(df['Embarked'])

df['FamilySize']  = df['SibSp'] + df['Parch']
df = pd.concat([df,onehotSex,onehotEmbarked],axis=1)

x = df[['Pclass','Age','Fare','FamilySize','female','male','C','Q','S']]
y = df['Survived']

x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 11)

rfc = RandomForestClassifier(n_estimators = 50, max_depth = 3, random_state = 20)
rfc.fit(x_train,y_train)
pred = rfc.predict(x_test)

acc = accuracy_score(y_test,pred)
print(acc)
cm = confusion_matrix(y_test,pred)
print(cm)
cr = classification_report(y_test,pred)
print(cr)

0.8659217877094972
[[110   8]
 [ 16  45]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       118
           1       0.85      0.74      0.79        61

    accuracy                           0.87       179
   macro avg       0.86      0.83      0.85       179
weighted avg       0.86      0.87      0.86       179



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
