In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df = pd.read_csv('./titanic_train.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# 결측치가 없는 행만 사용하여 훈련
train_data = df.dropna(subset=['Cabin'])

# 'Sex'와 'Embarked'를 LabelEncoder로 변환
encoder_sex = LabelEncoder()
train_data['Sex'] = encoder_sex.fit_transform(train_data['Sex'])

encoder_embarked = LabelEncoder()
train_data['Embarked'] = encoder_embarked.fit_transform(train_data['Embarked'])

# 'Cabin'에서 첫 번째 문자만 사용 (선실 구역 정보)
train_data['Cabin'] = train_data['Cabin'].apply(lambda x: str(x)[0] if pd.notnull(x) else 'Unknown')

# 특성 선택: 'Name'과 'Ticket'은 모델에 필요 없으므로 제거
X_train = train_data.drop(columns=['Cabin', 'Name', 'Ticket'])
y_train = train_data['Cabin']

# RandomForest 모델 학습
model = RandomForestClassifier()
model.fit(X_train, y_train)

# 결측치가 있는 행에 대해 예측
missing_data = df[df['Cabin'].isnull()]
missing_data['Sex'] = encoder_sex.transform(missing_data['Sex'])
missing_data['Embarked'] = encoder_embarked.transform(missing_data['Embarked'])
missing_data['Cabin'] = model.predict(missing_data.drop(columns=['Cabin', 'Name', 'Ticket']))

# 결측치 채운 결과를 원본 DataFrame에 반영
df.loc[df['Cabin'].isnull(), 'Cabin'] = missing_data['Cabin']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Sex'] = encoder_sex.fit_transform(train_data['Sex'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Embarked'] = encoder_embarked.fit_transform(train_data['Embarked'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Cabin'] = train_data['Cabin'].apply(lambda x: st

In [34]:
df['Cabin'].unique()

array(['F', 'C85', 'C123', 'E46', 'G', 'G6', 'C103', 'D', 'D56', 'A6',
       'C23 C25 C27', 'C', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83',
       'F33', 'F G73', 'E', 'A', 'E31', 'A5', 'D10 D12', 'D26', 'C110',
       'B58 B60', 'E101', 'F E69', 'D47', 'B86', 'F2', 'C2', 'B', 'E33',
       'B19', 'A7', 'C49', 'F4', 'A32', 'B4', 'B80', 'A31', 'D36', 'D15',
       'C93', 'C78', 'D35', 'C87', 'B77', 'E67', 'B94', 'C125', 'C99',
       'C118', 'D7', 'A19', 'B49', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', '

In [46]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 데이터 불러오기
df = pd.read_csv('titanic_train.csv')

# 'Cabin'에서 첫 번째 문자를 추출하여 수치형 코드로 변환
df['Cabin'] = df['Cabin'].apply(lambda x: str(x)[0] if pd.notnull(x) else 'Unknown')

# 'Cabin'을 수치형 코드로 변환 (LabelEncoder 사용)
encoder = LabelEncoder()
df['Cabin'] = encoder.fit_transform(df['Cabin'])

# IterativeImputer 사용 (회귀 기반 예측)
imputer = IterativeImputer(max_iter=10, random_state=42)
df['Cabin'] = imputer.fit_transform(df[['Cabin']])

# 예측된 값이 실수형일 수 있기 때문에, 소수점 처리와 범주 확인 후 변환
df['Cabin'] = df['Cabin'].round().astype(int)  # 반올림 후 정수형으로 변환

# 다시 문자형으로 변환
df['Cabin'] = encoder.inverse_transform(df['Cabin'])

# 결과 확인
print(df['Cabin'].head())


0    Unknown
1          C
2    Unknown
3          C
4    Unknown
Name: Cabin, dtype: object


In [40]:
df['Cabin'].isna().sum()

0