In [3]:
# Load the Titanic train and test datasets to proceed with the Random Forest model process.
import pandas as pd

# Load train and test datasets
train_data = pd.read_csv('titanic_train.csv')
test_data = pd.read_csv('titanic_test.csv')

# Display first few rows of the train and test data to understand the structure
train_data.head(), test_data.head()


(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Data Preprocessing
# Fill missing values
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)

test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

# Encode categorical variables
train_data['Sex'] = LabelEncoder().fit_transform(train_data['Sex'])
test_data['Sex'] = LabelEncoder().fit_transform(test_data['Sex'])
train_data['Embarked'] = LabelEncoder().fit_transform(train_data['Embarked'])
test_data['Embarked'] = LabelEncoder().fit_transform(test_data['Embarked'])

# Define features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_data[features]
y = train_data['Survived']

# Split the data for training and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_val)

# Output the details of the model fitting process
print("Data Prep and Model Fit Summary:")
print("Features used in the model:", features)
print("Target column:", 'Survived')
print("Model type:", type(rf_model).__name__)
print("Hyperparameters:", rf_model.get_params())
print("Software used: scikit-learn version", rf_model.__module__.split('.')[0])

Data Prep and Model Fit Summary:
Features used in the model: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
Target column: Survived
Model type: RandomForestClassifier
Hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Software used: scikit-learn version sklearn


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the

In [7]:
### Assess model performance

In [15]:
# Assess model performance
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

y_train_pred = rf_model.predict(X_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])
y_val_pred = rf_model.predict(X_val[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])

# Calculate and print AUC scores
from sklearn.metrics import roc_auc_score

train_auc = roc_auc_score(y_train, y_train_pred)
valid_auc = roc_auc_score(y_val, y_val_pred)
print("Train AUC:", train_auc)
print("Validation AUC:", valid_auc)

# test from Kaggle: 0.74641

Train AUC: 0.9742335619201291
Validation AUC: 0.8097168597168597


In [21]:
# Fill missing values
test_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)

# Encode categorical variables for test data (matching training encoding)
test_data['Sex'] = LabelEncoder().fit_transform(test_data['Sex'])
test_data['Embarked'] = LabelEncoder().fit_transform(test_data['Embarked'])

# Creating the submission DataFrame
submit_rf = pd.DataFrame(columns=['PassengerId', 'Survived'])
submit_rf['PassengerId'] = test_data['PassengerId']
submit_rf['Survived'] = rf_model.predict(test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])

# Save the submission to a CSV file
submit_rf.to_csv('C:\\Users\\nmemr\\OneDrive\\Desktop\\submit_rf.csv', index=False)
submit_rf

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [2]:
# Import necessary libraries to check their version numbers
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

# Collect version information
versions = {
    'pandas': pd.__version__,
    'scikit-learn': sklearn.__version__,
    'seaborn': sns.__version__,
    'matplotlib': plt.matplotlib.__version__
}

versions

{'pandas': '2.2.2',
 'scikit-learn': '1.4.2',
 'seaborn': '0.13.2',
 'matplotlib': '3.8.4'}