In [114]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

# For reproducibility, set a base random state (e.g., use your student id number; here we use 12345 as a placeholder)
RANDOM_STATE = 20002350

In [115]:
pd.options.mode.chained_assignment = None
titanic = pd.read_csv('/Users/omiee/Documents/GitHub/AAI-595/Assignments/HW3/Titanic-1.csv')

titanic.info()
# Display the first few rows of the Dataframe
titanic.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1309 non-null   int64  
 1   pclass      1309 non-null   object 
 2   survived    1309 non-null   int64  
 3   name        1309 non-null   object 
 4   sex         1309 non-null   object 
 5   age         1046 non-null   float64
 6   sibsp       1309 non-null   int64  
 7   parch       1309 non-null   int64  
 8   ticket      1309 non-null   object 
 9   fare        1308 non-null   float64
 10  cabin       295 non-null    object 
 11  embarked    1307 non-null   object 
 12  boat        486 non-null    object 
 13  body        121 non-null    float64
 14  home.dest   745 non-null    object 
dtypes: float64(3), int64(4), object(8)
memory usage: 153.5+ KB


Unnamed: 0.1,Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1st,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,Southampton,2.0,,"St Louis, MO"
1,2,1st,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1st,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"
3,4,1st,0,"Allison, Mr. Hudson Joshua Crei",male,30.0,1,2,113781,151.550003,C22 C26,Southampton,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1st,0,"Allison, Mrs. Hudson J C (Bessi",female,25.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


# Step 1 - Data Preparation

In [116]:
def QuantileBinning(feature, bin_number):
    """
    This function takes a numerical feature and the number of bins, and
    returns the feature binned into quantile-based bins.

    Parameters:
    - feature (pandas.Series): The numerical feature to be binned.
    - bin_number (int): The number of quantile bins.

    Returns:
    - pandas.Series: A series of discrete features binned by quantile.
    """
    # Use qcut to create quantile-based bins for the feature
    # If there are fewer unique values than bins, qcut could throw an error.
    # 'duplicates' parameter handles this by dropping redundant bins.
    return pd.qcut(feature, q=bin_number, labels=False, duplicates='drop')

# One example
feature_test = pd.DataFrame(np.random.rand(100),  columns=['Column_A'])
feature_test_discrete = QuantileBinning(feature_test['Column_A'], 10)

def label_encoder(feature):
    unique_labels = pd.unique(feature)
    label_to_int = {label: idx for idx, label in enumerate(unique_labels)}
    transformed_feature = np.array([label_to_int[label] for label in feature])
    return transformed_feature

print("Original unique values:", titanic['pclass'].unique())
print("Original unique values:", titanic['sex'].unique())

Original unique values: ['1st' '2nd' '3rd']
Original unique values: ['female' 'male']


In [117]:
# Fill missing values in 'age' with the average age
titanic.age.fillna(titanic.age.mean(), inplace=True)


# Convert categorical features to numeric
titanic['pclass'] = label_encoder(titanic['pclass'])
titanic['sex'] = label_encoder(titanic['sex'])

# Discretization
titanic = titanic[['pclass', 'sex', 'age', 'sibsp', 'survived']]
titanic['survived'] = titanic['survived'].apply(lambda x: 1 if int(x) == 1 else 0)


# Split into training and testing sets (80/20 split)
X = titanic.drop('survived', axis=1)
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

print("Encoded unique values:", titanic['pclass'].unique())
print("Encoded unique values:", titanic['sex'].unique())



Encoded unique values: [0 1 2]
Encoded unique values: [0 1]


In [None]:
# Split into training and testing sets (80/20 split)
X = titanic.drop('survived', axis=1)
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [118]:
# Combine training features and target for display
training = X_train.copy()
training['survived'] = y_train

# Display the first few rows of the training dataset
training.head()

Unnamed: 0,pclass,sex,age,sibsp,survived
999,2,0,29.881135,0,1
392,1,0,24.0,1,1
628,2,0,11.0,4,0
1165,2,1,25.0,0,0
604,2,0,16.0,0,1


# Step 2 - Data Processing and Initial Analysis

In [None]:
# Apply quantile binning to the 'age' column (choosing 5 bins for discretization)
X_train['age'] = QuantileBinning(X_train['age'], 10)
X_test['age'] = QuantileBinning(X_test['age'], 10)

# Compute Information Gain for each feature using Decision Tree
feature_importances = {}

for feature in X_train.columns:
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=42)
    clf.fit(X_train[[feature]], y_train)
    feature_importances[feature] = clf.tree_.impurity[0]  # Root node impurity


clf.fit(X_train, y_train)

print("Feature Importances:", clf.feature_importances_)


Feature Importances: [0. 1. 0. 0.]


In [120]:
training.head()

Unnamed: 0,pclass,sex,age,sibsp,survived
999,2,0,29.881135,0,1
392,1,0,24.0,1,1
628,2,0,11.0,4,0
1165,2,1,25.0,0,0
604,2,0,16.0,0,1


So the first split should be "sex"

# Step 3

In [121]:
from sklearn.tree import DecisionTreeClassifier, plot_tree #sklearn.metrics is not imported !!!

# Instantiate the DecisionTreeClassifier


# Prepare the features and target variables for training

# Fit the decision tree model


# Plot the full decision tree



In [122]:
# Define your performance metrics including accuracy, precision, recall and F1-scorre


# Step 4

In [123]:
from sklearn.model_selection import GridSearchCV

# Define a range of max_leaf_nodes values to search
param_grid = {'max_leaf_nodes': list(range(5, 20))}

# Run grid search with cross-validation
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train) # Note testing set cannot be used

# Get the best estimator with the optimal max_leaf_nodes parameter


# Plot the pruned decision tree


# Step 5


In [124]:
## it's an example (max_depth)

In [125]:
def ensemble_vote(models, X):

    # Perform majority vote
    # For each sample, count the votes for each class (0 or 1)

    # Determine the winning class for each sample: it's the one with the most votes
    # If there's a tie, we choose the class with the highest index (1 in this case)


    return ensemble_predictions

# Use the defined function to get predictions from the ensemble of the three trees
models = [best_tree, dt_max_depth, dt_opt_critia]
ensemble_predictions = ensemble_vote(models, X_test)

# Calculate performance metrics for the ensemble using the previously defined function


NameError: name 'best_tree' is not defined

In [None]:
# Train a random forest model using the optimal tree you found in step 4 and compare the ensembled model with the random forest 