In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import joblib

# Load the dataset
data = pd.read_csv('tested.csv')

# Handle missing values
data.dropna(subset=['Age'], inplace=True)
data['Cabin'].fillna('Unknown', inplace=True)

# Encode categorical variables
data = pd.get_dummies(data, columns=['Sex', 'Embarked', 'Cabin'], drop_first=True)

# Exclude non-numeric columns
non_numeric_columns = ['Name', 'Ticket']
features = [col for col in data.columns if col not in non_numeric_columns]

# Split features and target
X = data[features]
y = data['Survived']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)

# Define parameter grid for tuning
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model with the best parameters on the entire training data
best_model = XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

# Save the model
joblib.dump(best_model, 'titanic_survival_model.pkl')

# Prepare input features for prediction
input_data = {
    'Pclass': 3,
    'Age': 25,
    'SibSp': 1,
    'Parch': 0,
    'Fare': 10.0,
    'Sex_male': 1,
    'Embarked_Q': 0,
    'Embarked_S': 1,
    'Cabin_Unknown': 1
}

# Convert input data to a DataFrame
input_df = pd.DataFrame([input_data])

# Ensure that the input DataFrame has the same columns as the training data
input_df = input_df.reindex(columns=X.columns, fill_value=0)

# Use the trained model to make a prediction
prediction = best_model.predict(input_df)

# Interpret the prediction
if prediction[0] == 1:
    print("The passenger is predicted to survive.")
else:
    print("The passenger is predicted not to survive.")







              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        25

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
The passenger is predicted not to survive.
