In [64]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

train_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\train.csv')
test_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\test.csv')

# load train data

In [65]:
print(train_df.head())

# make a research on the data and determine the best way to handle missing values
print("\ninfo: ", train_df.info())

#Дивлюсь які дані відсутні
missing_data = train_df.isnull().sum()
print(missing_data)

# implement the best way to handle missing values
train_df.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)
test_df.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)

train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)

test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)
# implement the best way to handle categorical values (encoding)

train_df['Embarked'] = train_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
test_df['Embarked'] = test_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [66]:
# define validation method
n_splits = 5

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, val_index in skf.split(train_df, train_df['Survived']):
    X_train_fold, X_val_fold = train_df.iloc[train_index], train_df.iloc[val_index]
    y_train_fold, y_val_fold = train_df['Survived'].iloc[train_index], train_df['Survived'].iloc[val_index]

    
# use StratifiedKFold to split the data into folds or KFold if you don't need stratification

In [67]:
# define the regression model

model = LogisticRegression(max_iter=1000, random_state=42)

# define the hyperparameters grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear']
}

# define the grid search with cross validation using previously defined validation method

regression_grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)
X_train = train_df.drop(columns=['Survived']) 
y_train = train_df['Survived']                 

# train the model
regression_grid_search.fit(X_train, y_train)

# print the best hyperparameters
print("Best Hyperparameters:", regression_grid_search.best_params_)
print("Best Training Score (accuracy):", regression_grid_search.best_score_)

best_model = regression_grid_search.best_estimator_
regression_train_accuracy = accuracy_score(y_train, best_model.predict(X_train))



# print the best score on train and validation data, estimate the generalization error
print("Best Model Accuracy on Training Data:", regression_train_accuracy)
cv_results = regression_grid_search.cv_results_['mean_test_score']
regression_generalization_error = np.mean(1 - cv_results)
print("Estimated Generalization Error:", regression_generalization_error)


Best Hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Training Score (accuracy): 0.7957567007720796
Best Model Accuracy on Training Data: 0.8013468013468014
Estimated Generalization Error: 0.22669323959575668


In [68]:
# define the decision tree model
decision_tree = DecisionTreeClassifier(random_state=42)

# define the hyperparameters grid
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'sqrt', 'log2']
}

# define the grid search with cross validation using previously defined validation method
decision_tree_grid_search = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)

# train the model
X_train = train_df.drop(columns=['Survived'])
y_train = train_df['Survived']  
decision_tree_grid_search.fit(X_train, y_train)

# print the best hyperparameters
print("Best Hyperparameters:", decision_tree_grid_search.best_params_)
print("Best Training Score (accuracy):", decision_tree_grid_search.best_score_)

# print the best score on train and validation data, estimate the generalization error
best_model = decision_tree_grid_search.best_estimator_
decision_tree_train_accuracy = accuracy_score(y_train, best_model.predict(X_train))

print("Best Model Accuracy on Training Data:", decision_tree_train_accuracy)

cv_results = decision_tree_grid_search.cv_results_['mean_test_score']
decision_tree_generalization_error = np.mean(1 - cv_results)

print("Estimated Generalization Error:", decision_tree_generalization_error)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best Training Score (accuracy): 0.8271483271608812
Best Model Accuracy on Training Data: 0.8271604938271605
Estimated Generalization Error: 0.22791082841048318


In [69]:
# compare the results of the two models
# make a conclusion on which model is better and why

print("Logistic Regression Results:")
print("Best Hyperparameters (Logistic Regression):", regression_grid_search.best_params_)
print("Training Accuracy (Logistic Regression):", regression_train_accuracy)
print("Cross-Validation Accuracy (Logistic Regression):", regression_grid_search.best_score_)
print("Generalization Error (Logistic Regression):", regression_generalization_error)

# Decision Tree Results
print("\nDecision Tree Results:")
print("Best Hyperparameters (Decision Tree):", decision_tree_grid_search.best_params_)
print("Training Accuracy (Decision Tree):", decision_tree_train_accuracy)
print("Cross-Validation Accuracy (Decision Tree):", decision_tree_grid_search.best_score_)
print("Generalization Error (Decision Tree):", decision_tree_generalization_error)

print("\nCross-Validation Accuracy (Logistic Regression):", regression_grid_search.best_score_, 'vs ' "Cross-Validation Accuracy (Decision Tree):", decision_tree_grid_search.best_score_)


Logistic Regression Results:
Best Hyperparameters (Logistic Regression): {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Training Accuracy (Logistic Regression): 0.8013468013468014
Cross-Validation Accuracy (Logistic Regression): 0.7957567007720796
Generalization Error (Logistic Regression): 0.22669323959575668

Decision Tree Results:
Best Hyperparameters (Decision Tree): {'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Training Accuracy (Decision Tree): 0.8271604938271605
Cross-Validation Accuracy (Decision Tree): 0.8271483271608812
Generalization Error (Decision Tree): 0.22791082841048318

Cross-Validation Accuracy (Logistic Regression): 0.7957567007720796 vs Cross-Validation Accuracy (Decision Tree): 0.8271483271608812


In [70]:
# retrain the best models (both regression and DT) on the whole train data
logreg_best_model = regression_grid_search.best_estimator_
logreg_best_model.fit(X_train, y_train)

logreg_train_accuracy = accuracy_score(y_train, logreg_best_model.predict(X_train))
print("Logistic Regression Training Accuracy:", logreg_train_accuracy)

#DT
decision_tree_best_model = decision_tree_grid_search.best_estimator_
decision_tree_best_model.fit(X_train, y_train)

decision_tree_train_accuracy = accuracy_score(y_train, decision_tree_best_model.predict(X_train))
print("Decision Tree Training Accuracy:", decision_tree_train_accuracy)

Logistic Regression Training Accuracy: 0.8013468013468014
Decision Tree Training Accuracy: 0.8271604938271605


In [83]:
# load test data
# do the same preprocessing as for train data

# using retrained models make predictions on the test data for both regression and DT models
logreg_predictions = logreg_best_model.predict(test_df)

decision_tree_predictions = decision_tree_best_model.predict(test_df)

logreg_submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': logreg_predictions
})

decision_tree_submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': decision_tree_predictions
})



# save the predictions to a file
logreg_submission.to_csv('logreg_submission.csv', index=False)
decision_tree_submission.to_csv('decision_tree_submission.csv', index=False)
# upload the predictions to Kaggle and make a submission
# report the score you got and compare it with the score you got on the validation data
print("Data uploaded to Kaggle")
print('logreg_submission.csv: Score: 0.76555')
print('decision_tree_submission.csv Score: 0.77990')

print("\nCross validation data")
print("Cross-Validation Accuracy (Logistic Regression):", regression_grid_search.best_score_)
print("Cross-Validation Accuracy (Decision Tree):", decision_tree_grid_search.best_score_)

# make a conclusion on how well the models generalizes

print("Виходячи із результату моїх моделей на Кагглі схоже що моделі перенавчені. Думаю тут можна було б застосувати регуляризацію чи якийсь інший метод.\nНа логістичній регресії може треба було погратись з фічами та/або гіперпараметрамию. \nВідповідно до десіжен трі то можна було б застосувати бустінг або рендом форест")

Data uploaded to Kaggle
logreg_submission.csv: Score: 0.76555
decision_tree_submission.csv Score: 0.77990

Cross validation data
Cross-Validation Accuracy (Logistic Regression): 0.7957567007720796
Cross-Validation Accuracy (Decision Tree): 0.8271483271608812
Виходячи із результату моїх моделей на Кагглі схоже що моделі перенавчені. Думаю тут можна було б застосувати регуляризацію чи якийсь інший метод.
На логістичній регресії може треба було погратись з фічами та/або гіперпараметрамию. 
Відповідно до десіжен трі то можна було б застосувати бустінг або рендом форест
