In [132]:
import os
import warnings

import pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load the Titanic dataset.
# The CSV location can be overridden with the TITANIC_CSV environment variable so the
# notebook is not tied to a single machine; the original local path stays the default.
file_path = os.environ.get("TITANIC_CSV", "C:/Users/solmi/Downloads/titanic.csv")
df = pd.read_csv(file_path)
# Inspect the loaded frame
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [133]:
# Count the missing values in each column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [134]:
# Handle missing values by imputation (rows are kept, not dropped)

# Average of the Age column
mean_age = df['Age'].mean()

# Replace every missing Age with that average
# NOTE(review): the mean is taken over the whole frame, i.e. before the train/test split
age_imputed = df['Age'].fillna(value=mean_age)
df['Age'] = age_imputed

# Re-count the missing values per column to confirm Age is now complete
df.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [135]:
# 4. Check the labels (distribution of survivors vs. non-survivors)
survived_counts = df['Survived'].value_counts()
print("\nSurvived 데이터 분포:", survived_counts, sep="\n")


Survived 데이터 분포:
0    549
1    342
Name: Survived, dtype: int64


In [136]:
# Drop the columns that are not used as model features
unused_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Embarked']
df = df.drop(columns=unused_columns)
print(df)

     Survived  Pclass     Sex        Age  SibSp  Parch     Fare
0           0       3    male  22.000000      1      0   7.2500
1           1       1  female  38.000000      1      0  71.2833
2           1       3  female  26.000000      0      0   7.9250
3           1       1  female  35.000000      1      0  53.1000
4           0       3    male  35.000000      0      0   8.0500
..        ...     ...     ...        ...    ...    ...      ...
886         0       2    male  27.000000      0      0  13.0000
887         1       1  female  19.000000      0      0  30.0000
888         0       3  female  29.699118      1      2  23.4500
889         1       1    male  26.000000      0      0  30.0000
890         0       3    male  32.000000      0      0   7.7500

[891 rows x 7 columns]


In [137]:
# Encoding step: show the Sex column before it is converted to numbers
sex_before = df['Sex']
print("\n변환 전 Sex 컬럼:")
print(sex_before)


변환 전 Sex 컬럼:
0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object


In [138]:


# Encode Sex as integers: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}

# Series.map turns every value that is missing from the mapping into NaN, so running this
# cell a second time (when the column already holds 0/1) would wipe out the whole column.
# Only encode when the non-missing values are not already the target codes.
already_encoded = df['Sex'].dropna().isin(list(sex_codes.values())).all()
if not already_encoded:
    encoded_sex = df['Sex'].map(sex_codes)
    # Fail loudly on labels outside the mapping instead of silently producing NaN;
    # values that were already missing before the mapping are left as they are.
    unknown_labels = df.loc[encoded_sex.isna() & df['Sex'].notna(), 'Sex'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unexpected values in 'Sex': {list(unknown_labels)}")
    df['Sex'] = encoded_sex


# Show the class counts after the conversion
print("\n변환 후 Sex 컬럼:")
print(df['Sex'].value_counts())


변환 후 Sex 컬럼:
0    577
1    314
Name: Sex, dtype: int64


In [139]:
# Separate the target (y) from the features (X)
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows as the test set, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features: fit the scaler on the training rows only,
# then apply the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [140]:
# 1. Decision tree

# fit() returns the estimator itself, so construction and training can be chained
clf_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
pred_dt = clf_dt.predict(X_test)

# Report accuracy and the confusion matrix on the test split
print("\n--- DT ---")
print(accuracy_score(y_test, pred_dt))
print(confusion_matrix(y_test, pred_dt))



--- DT ---
0.7541899441340782
[[83 22]
 [22 52]]


In [141]:
# 2. Random forest

print("\n--- RF ---")
rf_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
pred = rf_clf.predict(X_test)

# Report accuracy and the confusion matrix on the test split
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))



--- RF ---
0.7877094972067039
[[89 16]
 [22 52]]


In [142]:
# 3. Support vector machine

clf_svm = SVC(random_state=0).fit(X_train, y_train)
pred_svm = clf_svm.predict(X_test)

# Report accuracy and the confusion matrix on the test split
print("\n--- SVM ---")
print(accuracy_score(y_test, pred_svm))
print(confusion_matrix(y_test, pred_svm))



--- SVM ---
0.8100558659217877
[[92 13]
 [21 53]]


In [143]:
# 4. Logistic regression

clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
pred_lr = clf_lr.predict(X_test)

# Report accuracy and the confusion matrix on the test split
print("\n--- LR ---")
print(accuracy_score(y_test, pred_lr))
print(confusion_matrix(y_test, pred_lr))


--- LR ---
0.7988826815642458
[[90 15]
 [21 53]]


In [144]:
# 5. k-nearest neighbours (k = 5)

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
pred_knn = knn.predict(X_test)

# Report accuracy and the confusion matrix on the test split
print("\n--- KNN ---")
print(accuracy_score(y_test, pred_knn))
print(confusion_matrix(y_test, pred_knn))


--- KNN ---
0.7821229050279329
[[86 19]
 [20 54]]
