In [1]:
# Random Forest Model details
# Let's load the Titanic dataset, implement the Random Forest model, and extract the requested information.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the dataset
train_data = pd.read_csv('titanic_train.csv')

# Data preprocessing
train_data = train_data.dropna(subset=['Age', 'Sex', 'Pclass', 'Fare'])
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})

# Define input features (X) and target variable (y)
X = train_data[['Pclass', 'Sex', 'Age', 'Fare']]
y = train_data['Survived']

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and fit the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Collect relevant information about the model
columns_used_as_inputs = X.columns.tolist()
target_column = 'Survived'
model_type = "Random Forest Classifier"
software_used = "scikit-learn"
software_version = "0.24.1"  # This is an example, modify if different
hyperparameters = rf_model.get_params()

{
    "Columns used as inputs": columns_used_as_inputs,
    "Target column": target_column,
    "Type of model": model_type,
    "Software used": software_used,
    "Software version": software_version,
    "Hyperparameters": hyperparameters
}

{'Columns used as inputs': ['Pclass', 'Sex', 'Age', 'Fare'],
 'Target column': 'Survived',
 'Type of model': 'Random Forest Classifier',
 'Software used': 'scikit-learn',
 'Software version': '0.24.1',
 'Hyperparameters': {'bootstrap': True,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': 'sqrt',
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'monotonic_cst': None,
  'n_estimators': 100,
  'n_jobs': None,
  'oob_score': False,
  'random_state': 42,
  'verbose': 0,
  'warm_start': False}}

In [None]:
# Columns used as inputs': ['Pclass', 'Sex', 'Age', 'Fare'], 
# Target column: 'Survived',
# Type of model: 'Random Forest Classifier',
# Software used: 'scikit-learn',
# Software version: '0.24.1'

In [4]:
# AUC and AIR for Training Dataset

import pandas as pd

# Load the training dataset
file_path = 'titanic_train.csv'
titanic_train = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
titanic_train.head()

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

# Prepare the data for training (dropping unnecessary columns)
titanic_train = titanic_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
titanic_train = pd.get_dummies(titanic_train, columns=['Sex', 'Embarked'], drop_first=True)

# Handle missing values by filling them with median values
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train['Age'].median())
titanic_train['Fare'] = titanic_train['Fare'].fillna(titanic_train['Fare'].median())

# Define features (X) and target (y)
X = titanic_train.drop('Survived', axis=1)
y = titanic_train['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Get predicted probabilities for the test set
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]

# Calculate AUC (Area Under the Curve) and AIR (Average Information Retrieval)
auc = roc_auc_score(y_test, y_pred_prob)
air = average_precision_score(y_test, y_pred_prob)

# Display the results in a table format
results_table = pd.DataFrame({
    'Metric': ['AUC (Area Under Curve)', 'AIR (Average Information Retrieval)'],
    'Value': [auc, air]
})

# Displaying the table neatly
results_table


Unnamed: 0,Metric,Value
0,AUC (Area Under Curve),0.860131
1,AIR (Average Information Retrieval),0.810724


In [8]:

# AUC and AIR of Validation dataset

# Load the new dataset uploaded by the user
file_path_new = 'titanic_train.csv'
titanic_train_new = pd.read_csv(file_path_new)

# Display the first few rows of the new dataset to understand its structure
titanic_train_new.head()

# Preparing the data as before for the new dataset

# Dropping unnecessary columns and handling missing values
titanic_train_new = titanic_train_new.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
titanic_train_new = pd.get_dummies(titanic_train_new, columns=['Sex', 'Embarked'], drop_first=True)

# Filling missing values with the median
titanic_train_new['Age'] = titanic_train_new['Age'].fillna(titanic_train_new['Age'].median())
titanic_train_new['Fare'] = titanic_train_new['Fare'].fillna(titanic_train_new['Fare'].median())

# Define features (X) and target (y)
X_new = titanic_train_new.drop('Survived', axis=1)
y_new = titanic_train_new['Survived']

# Splitting the new dataset into training and validation sets
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.3, random_state=42)

# Train the Random Forest model on the new data
rf_model_new = RandomForestClassifier(random_state=42)
rf_model_new.fit(X_train_new, y_train_new)

# Get predicted probabilities for the validation set (new data)
y_val_pred_prob_new = rf_model_new.predict_proba(X_test_new)[:, 1]

# Calculate AUC and AIR for the validation set
auc_val_new = roc_auc_score(y_test_new, y_val_pred_prob_new)
air_val_new = average_precision_score(y_test_new, y_val_pred_prob_new)

# Display the results in a table format
validation_results_table_new = pd.DataFrame({
    'Metric': ['AUC (Area Under Curve)', 'AIR (Average Information Retrieval)'],
    'Value': [auc_val_new, air_val_new]
})

# Showing the final results
validation_results_table_new


Unnamed: 0,Metric,Value
0,AUC (Area Under Curve),0.860131
1,AIR (Average Information Retrieval),0.810724


In [18]:
# AUC and AIR using Test dataset (here gender_submission.csv)

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

# Load the datasets
train_file_path = 'titanic_train.csv'
test_file_path = 'gender_submission.csv'

titanic_train_new = pd.read_csv(train_file_path)
gender_submission = pd.read_csv(test_file_path)

# Data preprocessing for the training dataset
titanic_train_new = titanic_train_new.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
titanic_train_new = pd.get_dummies(titanic_train_new, columns=['Sex', 'Embarked'], drop_first=True)
titanic_train_new['Age'] = titanic_train_new['Age'].fillna(titanic_train_new['Age'].median())
titanic_train_new['Fare'] = titanic_train_new['Fare'].fillna(titanic_train_new['Fare'].median())

# Defining features and target for training
X_train = titanic_train_new.drop('Survived', axis=1)
y_train = titanic_train_new['Survived']

# Train the Random Forest model
rf_model_final = RandomForestClassifier(random_state=42)
rf_model_final.fit(X_train, y_train)

# Aligning and preprocessing the test dataset
gender_submission_full = gender_submission.merge(
    titanic_train_new[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']], 
    left_on='PassengerId', right_index=True, how='left'
)

# Fill missing values in the test dataset
gender_submission_full['Pclass'] = gender_submission_full['Pclass'].fillna(titanic_train_new['Pclass'].mode()[0])
gender_submission_full['SibSp'] = gender_submission_full['SibSp'].fillna(titanic_train_new['SibSp'].mode()[0])
gender_submission_full['Parch'] = gender_submission_full['Parch'].fillna(titanic_train_new['Parch'].mode()[0])
gender_submission_full['Sex_male'] = gender_submission_full['Sex_male'].fillna(titanic_train_new['Sex_male'].mode()[0])
gender_submission_full['Embarked_Q'] = gender_submission_full['Embarked_Q'].fillna(titanic_train_new['Embarked_Q'].mode()[0])
gender_submission_full['Embarked_S'] = gender_submission_full['Embarked_S'].fillna(titanic_train_new['Embarked_S'].mode()[0])

# Drop 'PassengerId' for prediction
gender_submission_full = gender_submission_full.drop('PassengerId', axis=1)

# Predict probabilities for the test dataset
y_test_prob = rf_model_final.predict_proba(gender_submission_full.drop('Survived', axis=1))[:, 1]

# Calculate AUC and AIR
auc_test = roc_auc_score(gender_submission_full['Survived'], y_test_prob)
air_test = average_precision_score(gender_submission_full['Survived'], y_test_prob)

# Create a table to display the results
test_results_table_final = pd.DataFrame({
    'Metric': ['AUC (Area Under Curve)', 'AIR (Average Information Retrieval)'],
    'Value': [auc_test, air_test]
})

# Display the final results
print(test_results_table_final)




                                Metric     Value
0               AUC (Area Under Curve)  0.500000
1  AIR (Average Information Retrieval)  0.363636


  gender_submission_full['Sex_male'] = gender_submission_full['Sex_male'].fillna(titanic_train_new['Sex_male'].mode()[0])
  gender_submission_full['Embarked_Q'] = gender_submission_full['Embarked_Q'].fillna(titanic_train_new['Embarked_Q'].mode()[0])
  gender_submission_full['Embarked_S'] = gender_submission_full['Embarked_S'].fillna(titanic_train_new['Embarked_S'].mode()[0])


In [None]:
# The Random Forest model achieved an AUC (Area Under the Curve) of 0.500 on the test dataset, indicating that the model's performance is equivalent to random guessing. An AUC of 0.5 typically means the model cannot distinguish between the two classes (Survived vs. Not Survived).
# The Average Information Retrieval (AIR) score is 0.3636, which also indicates relatively low effectiveness in retrieving positive instances