In [None]:
import pandas as pd

# Load data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
# merge test and train to apply all changes in both
train_df['TrainFlag'] = 1
test_df['TrainFlag'] = 0
full_df = pd.concat([train_df, test_df], sort=False)

# 1. Title
full_df['Title'] = full_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
full_df['Title'] = full_df['Title'].replace(['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
full_df['Title'] = full_df['Title'].replace(['Mlle','Ms'], 'Miss')
full_df['Title'] = full_df['Title'].replace('Mme', 'Mrs')

# 2. FamilySize
full_df['FamilySize'] = full_df['SibSp'] + full_df['Parch'] + 1

# 3. IsAlone
full_df['IsAlone'] = (full_df['FamilySize'] == 1).astype(int)

# 4. Deck
full_df['Deck'] = full_df['Cabin'].astype(str).str[0]
full_df['Deck'] = full_df['Deck'].replace('n', 'U')  # 'nan' تبدیل به 'U'

# 5. AgeBand
full_df['Age'] = full_df['Age'].fillna(full_df['Age'].median())
full_df['AgeBand'] = pd.cut(full_df['Age'], 5, labels=False)

# 6. FareBand
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())
full_df['FareBand'] = pd.qcut(full_df['Fare'], 4, labels=False)

# 7. Embarked (classified)
full_df['Embarked'] = full_df['Embarked'].fillna(full_df['Embarked'].mode()[0])

# 8. Sex, Embarked, Title, Deck → تبدیل به عدد
for col in ['Sex', 'Embarked', 'Title', 'Deck']:
    full_df[col] = pd.factorize(full_df[col])[0]

# Choose final features:
features = ['Pclass', 'Sex', 'AgeBand', 'FareBand', 'Embarked',
            'Title', 'FamilySize', 'IsAlone', 'Deck']

# Retrieve train and test
train_df = full_df[full_df['TrainFlag'] == 1].copy()
test_df = full_df[full_df['TrainFlag'] == 0].copy()
X_train = train_df[features]
y_train = train_df['Survived']
X_test = test_df[features]


In [2]:
X_train.head()

Unnamed: 0,Pclass,Sex,AgeBand,FareBand,Embarked,Title,FamilySize,IsAlone,Deck
0,3,0,1,0,0,0,2,0,0
1,1,1,2,3,1,1,2,0,1
2,3,1,1,1,0,2,1,1,0
3,1,1,2,3,0,1,2,0,1
4,3,0,2,1,0,0,1,1,0


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# XGBoost with primitive parameters
model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=1.0,
    reg_alpha=0.01,
    reg_lambda=1.5,
    gamma=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_train)

# Evaluate
acc = accuracy_score(y_train, y_pred)
print(f"Accuracy: {acc:.5f}")
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred))
print("Classification Report:\n", classification_report(y_train, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.84736
Confusion Matrix:
 [[519  30]
 [106 236]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.95      0.88       549
         1.0       0.89      0.69      0.78       342

    accuracy                           0.85       891
   macro avg       0.86      0.82      0.83       891
weighted avg       0.85      0.85      0.84       891

