In [1]:
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Titanic training set from the sibling chapter directory.
titanic_df = pd.read_csv(filepath_or_buffer="../Ch01/titanic_train.csv")

In [3]:
# Keep only the columns used for modelling, then impute missing values.
selected_columns = ['Pclass', 'Sex', 'Age', 'SibSp',
                    'Parch', 'Fare', 'Embarked', 'Survived']
titanic_df = titanic_df[selected_columns]

# Age: replace missing entries with the column mean.
mean_age = titanic_df['Age'].mean()
titanic_df['Age'] = titanic_df['Age'].fillna(mean_age)

# Embarked: replace missing entries with the most frequent value.
# mode() may return several values on a tie, so take the first one.
most_common_port = titanic_df['Embarked'].mode()[0]
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(most_common_port)

titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
 7   Survived  891 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [4]:
# Encode the categorical columns as integers, keeping one fitted encoder per
# column so the same mapping can be reused later (e.g. on user input).
categorical_columns = ['Sex', 'Embarked']
label_encoders = {
    name: LabelEncoder().fit(titanic_df[name])
    for name in categorical_columns
}
for name, encoder in label_encoders.items():
    titanic_df[name] = encoder.transform(titanic_df[name])

In [5]:
# Split the frame into the target column and the feature matrix.
y = titanic_df['Survived']
X = titanic_df.drop(columns=['Survived'])

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Hyperparameter grid explored by the grid search (4 * 3 * 3 combinations).
param_grid = dict(
    max_depth=[10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [8]:
# Search the grid with 5-fold cross-validation, ranking candidates by accuracy.
base_tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_tree,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [9]:
# Report the winning hyperparameters and their mean cross-validation accuracy,
# then keep the refitted best estimator for later prediction.
print(f"Best Parameters:{grid_search.best_params_}")
print(f"Best Cross Validation Accuracy:{grid_search.best_score_:.3f}")
best_model = grid_search.best_estimator_

Best Parameters:{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Cross Validation Accuracy:0.815


In [10]:
# Evaluate the selected model on the held-out test set.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy:{test_accuracy:.3f}")

Test Accuracy:0.838


In [17]:
def predict_survival(Pclass, Sex, Age, SibSp, Parch, Fare, Embarked):
    """Predict survival for a single passenger with the tuned decision tree.

    Args:
        Pclass: Ticket class.
        Sex: Raw sex label as seen by the fitted 'Sex' encoder (e.g. 'male').
        Age: Passenger age.
        SibSp: Number of siblings/spouses aboard.
        Parch: Number of parents/children aboard.
        Fare: Ticket fare.
        Embarked: Raw port label as seen by the fitted 'Embarked' encoder.

    Returns:
        "Survived" when the model predicts class 1, otherwise "Did not survive".

    Raises:
        gr.Error: If any input is None (e.g. a UI field was left unselected),
            so the interface shows a readable message instead of an encoder
            or model failure.
    """
    raw_inputs = {
        'Pclass': Pclass,
        'Sex': Sex,
        'Age': Age,
        'SibSp': SibSp,
        'Parch': Parch,
        'Fare': Fare,
        'Embarked': Embarked,
    }
    missing = [name for name, value in raw_inputs.items() if value is None]
    if missing:
        raise gr.Error(f"Please provide a value for: {', '.join(missing)}")

    # LabelEncoder.transform only accepts 1-D sequences, so wrap each scalar
    # in a list and unwrap the single encoded value.
    encoded_sex = label_encoders['Sex'].transform([Sex])[0]
    encoded_port = label_encoders['Embarked'].transform([Embarked])[0]

    # Column names and order mirror the training feature matrix.
    input_data = pd.DataFrame({
        'Pclass': [Pclass],
        'Sex': [encoded_sex],
        'Age': [Age],
        'SibSp': [SibSp],
        'Parch': [Parch],
        'Fare': [Fare],
        'Embarked': [encoded_port],
    })

    prediction = best_model.predict(input_data)[0]
    return "Survived" if prediction == 1 else "Did not survive"

In [12]:
def get_category(age):
    """Map a numeric age to a coarse age-group label.

    Args:
        age: Age in years. Missing values (None/NaN) and values <= -1 are
            treated as unknown.

    Returns:
        One of 'Unknown', 'Baby', 'Child', 'Teenager', 'Student',
        'Young Adult', 'Adult', 'Elderly'.
    """
    # NaN fails every `<=` comparison, so without this guard a missing age
    # would fall through to 'Elderly'.
    if pd.isna(age):
        return 'Unknown'

    # (inclusive upper bound, label), checked in ascending order.
    age_bands = (
        (-1, 'Unknown'),
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for upper_bound, label in age_bands:
        if age <= upper_bound:
            return label
    return 'Elderly'

In [13]:
# Age-group labels in display order; used as the bar order in the Age plot.
group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student',
               'Young Adult', 'Adult', 'Elderly']

# Plot-friendly copy of the data: Sex decoded back to its original labels and
# Age bucketed into the groups above.
temp_df = titanic_df.assign(
    Sex=label_encoders['Sex'].inverse_transform(titanic_df['Sex']),
)
temp_df['Age_cat'] = temp_df['Age'].apply(get_category)

In [14]:
def visualize_data(feature):
    """Draw a survival-rate bar chart for the selected feature.

    Args:
        feature: 'Sex', 'Pclass' or 'Age'. Any other value yields an empty
            set of axes.

    Returns:
        The matplotlib Figure holding the chart. The figure is returned
        explicitly (rather than the global pyplot module) so the result does
        not depend on which figure happens to be current.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    if feature == 'Sex':
        sns.barplot(x='Sex', y='Survived', data=titanic_df, ax=ax)
        # titanic_df holds the encoded Sex column; LabelEncoder orders classes
        # alphabetically, so 0 is female and 1 is male.
        ax.set_xticks([0, 1])
        ax.set_xticklabels(['Female', 'Male'])
        ax.set_title("Survival Rate by Sex")
        ax.set_ylabel("Survival Rate")
    elif feature == 'Pclass':
        sns.barplot(x='Pclass', y='Survived', hue='Sex', data=temp_df, ax=ax)
        ax.set_title("Survival Rate by Ticket Class")
        ax.set_xlabel("Ticket Class")
        ax.set_ylabel("Survival Rate")
    elif feature == 'Age':
        sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=temp_df,
                    order=group_names, ax=ax)
        ax.set_title("Survival Rate by Age Group")
        ax.set_xlabel("Age Group")
        ax.set_ylabel("Survival Rate")

    # Detach the figure from pyplot's registry so repeated calls do not
    # accumulate open figures; the Figure object itself stays usable.
    plt.close(fig)
    return fig
    

In [15]:
# Survival-prediction tab. The input components are listed in the same order
# as the positional parameters of predict_survival.
predict_interface = gr.Interface(
    fn=predict_survival,
    inputs=[
        gr.Dropdown(choices=[1, 2, 3], label='Ticket Class'),
        gr.Radio(choices=['male', 'female'], label='Sex'),
        # `minimum` (value range), not `min_width` (layout width in pixels).
        gr.Slider(minimum=0, maximum=titanic_df['Age'].max(), step=1, label='Age'),
        gr.Slider(minimum=0, maximum=titanic_df['SibSp'].max(), step=1, label='Siblings/Spouse Aboard'),
        gr.Slider(minimum=0, maximum=titanic_df['Parch'].max(), step=1, label='Parents/Children Aboard'),
        gr.Number(label='Fare'),
        gr.Radio(choices=['C', 'Q', 'S'], label='Embarked Port'),
    ],
    outputs="text",
    title="Titanic Survival Prediction",
    description="Enter passenger details to predict survival on the Titanic."
)

In [16]:
# EDA (Exploratory Data Analysis) tab: a single radio selector picks which
# feature visualize_data should chart.
feature_selector = gr.Radio(
    choices=['Sex', 'Pclass', 'Age'],
    label="Select Feature to Visualize",
)
eda_interface = gr.Interface(
    fn=visualize_data,
    inputs=feature_selector,
    outputs="plot",
    title="Titanic EDA Visualization",
    description="Select a feature to visualize survival statistics on the Titanic dataset.",
)

In [19]:
# Combine both interfaces into one app, one tab per interface.
demo = gr.TabbedInterface(
    interface_list=[predict_interface, eda_interface],
    tab_names=["Survival Prediction", "EDA Visualization"],
)

# share=True also requests a temporary public URL in addition to the local one.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://4bfdd7001eb3704404.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


