## Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer

In [9]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Survived'] = False
df = pd.concat([df_train, df_test])


In [10]:
# Extract the first letter of the Cabin (deck), or set to 'U' (Unknown) for NaN
df['Deck'] = df['Cabin'].fillna('U').apply(lambda x: x[0])

# One-hot encode the deck values
deck_dummies = pd.get_dummies(df['Deck'], prefix='Deck')
df = pd.concat([df, deck_dummies], axis=1)

# Drop the original Cabin column and the Deck column
df.drop(columns=['Cabin', 'Deck','Name','Ticket'], inplace=True)
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,1,0,3,1,22.0,1,0,7.25,S,False,False,False,False,False,False,False,False,True
1,2,1,1,0,38.0,1,0,71.2833,C,False,False,True,False,False,False,False,False,False
2,3,1,3,0,26.0,0,0,7.925,S,False,False,False,False,False,False,False,False,True
3,4,1,1,0,35.0,1,0,53.1,S,False,False,True,False,False,False,False,False,False
4,5,0,3,1,35.0,0,0,8.05,S,False,False,False,False,False,False,False,False,True


In [11]:
df.loc[df['PassengerId'] == 1044, 'Fare'] = 15

# Verify if it's filled
print(df.loc[df['PassengerId'] == 1044])

     PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch  Fare Embarked  \
152         1044         0       3    1  60.5      0      0  15.0        S   

     Deck_A  Deck_B  Deck_C  Deck_D  Deck_E  Deck_F  Deck_G  Deck_T  Deck_U  
152   False   False   False   False   False   False   False   False    True  


In [12]:
mode_embarked = df['Embarked'].mode()[0]

# Fill missing values with the mode

df['Embarked'].fillna(mode_embarked, inplace=True)
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(mode_embarked, inplace=True)


In [None]:
features = df
knn_imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(knn_imputer.fit_transform(features), columns=features.columns)
df['Age'] = df_imputed['Age']

In [15]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,1,0,3,1,0.271174,1,0,0.014151,0,False,False,False,False,False,False,False,False,True
1,2,1,1,0,0.472229,1,0,0.139136,1,False,False,True,False,False,False,False,False,False
2,3,1,3,0,0.321438,0,0,0.015469,0,False,False,False,False,False,False,False,False,True
3,4,1,1,0,0.434531,1,0,0.103644,0,False,False,True,False,False,False,False,False,False
4,5,0,3,1,0.434531,0,0,0.015713,0,False,False,False,False,False,False,False,False,True


## Training Data

In [16]:
df_train = df.iloc[:len(df_train)]
df_test = df.iloc[len(df_train):]

In [17]:
X_train = df_train.drop(columns=['Survived'])
y_train = df_train['Survived']

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### 1. Random Forest 

In [19]:
from sklearn.ensemble import RandomForestClassifier

model1 = RandomForestClassifier(random_state=42)
model1.fit(X_train, y_train)

In [20]:
from sklearn.metrics import accuracy_score

y_pred = model1.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

Validation Accuracy: 0.8044692737430168


### 2. Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train the model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_logistic = logistic_model.predict(X_val)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_logistic))

Logistic Regression Validation Accuracy: 0.8156424581005587


### 3. XG Boost

In [22]:
from xgboost import XGBClassifier

# Initialize and train the model
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_pred_xgb))

XGBoost Validation Accuracy: 0.8044692737430168


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


### 4. KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_knn = knn_model.predict(X_val)
print("KNN Validation Accuracy:", accuracy_score(y_val, y_pred_knn))

KNN Validation Accuracy: 0.5642458100558659


### 5.Gradient Boost

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_gb = gb_model.predict(X_val)
print("Gradient Boosting Validation Accuracy:", accuracy_score(y_val, y_pred_gb))

Gradient Boosting Validation Accuracy: 0.8100558659217877


### 6.SVM

In [26]:
from sklearn.svm import SVC

# Initialize and train the model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_val)
print("SVM Validation Accuracy:", accuracy_score(y_val, y_pred_svm))

SVM Validation Accuracy: 0.7821229050279329


## Submission

In [32]:
#df_test.drop(columns=['Survived'], inplace=True)

pred = model1.predict(df_test)

final = pd.DataFrame()
final['PassengerId'] = df_test['PassengerId']
final['Survived'] = pred

# Write DataFrame to a CSV file without index
final.to_csv('output random.csv', index=False)

In [34]:
from sklearn.linear_model import LogisticRegression

# Combine training and validation sets
X_full_train = pd.concat([X_train, X_val])
y_full_train = pd.concat([y_train, y_val])

# Initialize and train the Logistic Regression model
logistic_model_full = LogisticRegression(random_state=42, max_iter=1000)
logistic_model_full.fit(X_full_train, y_full_train)

# Predict on the test set
X_test = df_test  # Ensure 'Survived' is dropped
test_predictions = logistic_model_full.predict(X_test)

# Save predictions to a CSV file for submission
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': test_predictions
})
submission.to_csv('submission_logistic.csv', index=False)

print("Logistic Regression model trained on the full dataset and predictions saved.")

Logistic Regression model trained on the full dataset and predictions saved.
