In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# 1. 데이터 로딩
file_path = "C:/Users/82104/Downloads/titanic (1).csv"  # 파일 경로 수정 필요
df = pd.read_csv(file_path)
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [6]:
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [7]:
df = df.dropna()
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [8]:
print("\nLabel Distribution (Before Encoding):")
print(df['Survived'].value_counts())



Label Distribution (Before Encoding):
1    123
0     60
Name: Survived, dtype: int64


In [18]:
file_path = "C:/Users/82104/Downloads/titanic (1).csv" 
df = pd.read_csv(file_path)
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Parch'], inplace=True)
print(df.columns)

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')


In [20]:
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

print("\nLabel Distribution (After Encoding):")
print(df['Survived'].value_counts())

# 변환 전/후 레이블 확인
print("\nSurvived Value Counts Before Encoding:")
print(df['Survived'].value_counts())

df['Survived'] = df['Survived'].astype(int)

print("\nSurvived Value Counts After Encoding:")
print(df['Survived'].value_counts())


Label Distribution (After Encoding):
0    549
1    342
Name: Survived, dtype: int64

Survived Value Counts Before Encoding:
0    549
1    342
Name: Survived, dtype: int64

Survived Value Counts After Encoding:
0    549
1    342
Name: Survived, dtype: int64


In [4]:
# 데이터 로드
file_path = "C:/Users/82104/Downloads/titanic (1).csv"  
df = pd.read_csv(file_path)

# 데이터 전처리
df['Sex_encoded'] = df['Sex'].map({'male': 0, 'female': 1})  # 성별 숫자 변환
features = ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'Fare']
df = df.dropna(subset=features)  # 결측값 제거

X = df[features]
y = df['Survived']

# 데이터 분할 (80% 훈련, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 리스트
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

# 결과 저장
results = {}

# 각 모델 훈련 및 평가
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    results[name] = {"Accuracy": acc, "Confusion Matrix": conf_matrix}

# 정확도 출력
accuracy_df = pd.DataFrame({name: [res["Accuracy"]] for name, res in results.items()}, index=["Accuracy"])
print("=== Accuracy Results ===")
print(accuracy_df)

# Confusion Matrix 출력
print("\n=== Confusion Matrices ===")
for name, res in results.items():
    print(f"\n{name}:\n", res["Confusion Matrix"])

=== Accuracy Results ===
          Random Forest  Decision Tree  Logistic Regression  \
Accuracy       0.783217       0.713287             0.748252   

          K-Nearest Neighbors  Support Vector Machine  
Accuracy              0.65035                0.622378  

=== Confusion Matrices ===

Random Forest:
 [[71 16]
 [15 41]]

Decision Tree:
 [[65 22]
 [19 37]]

Logistic Regression:
 [[71 16]
 [20 36]]

K-Nearest Neighbors:
 [[67 20]
 [30 26]]

Support Vector Machine:
 [[70 17]
 [37 19]]
