In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

import string
import warnings
warnings.filterwarnings('ignore')

SEED = 42

In [2]:
def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)
def divide_df(all_data):
    # Returns divided dfs of training and test set
    return all_data.loc[:890], all_data.loc[891:].drop(['Survived'], axis=1)
df_train = pd.read_csv(r"C:\Users\Nusrat Fatima Khan\OneDrive\Desktop\Titanic Feature Engineering\train.csv")
df_test = pd.read_csv(r"C:\Users\Nusrat Fatima Khan\OneDrive\Desktop\Titanic Feature Engineering\test.csv")
df_all = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

dfs = [df_train, df_test]

print('Number of Training Examples = {}'.format(df_train.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test.shape[0]))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['Survived'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(df_test.shape[0]))
print(df_train.columns)
print(df_test.columns)



Number of Training Examples = 891
Number of Test Examples = 418

Training X Shape = (891, 12)
Training y Shape = 891

Test X Shape = (418, 11)
Test y Shape = 418

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [3]:
#**Exploratory Data Analysis**

In [4]:
print(df_train.info())
df_train.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
625,626,0,1,"Sutton, Mr. Frederick",male,61.0,0,0,36963,32.3208,D50,S
299,300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5208,B58 B60,C
828,829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q


In [5]:
print(df_test.info())
df_test.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
306,1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
107,999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [6]:
#**MISSING VALUES**

In [7]:
def display_missing(df):    
    for col in df.columns.tolist():          
        print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')
    
for df in dfs:
    print('{}'.format(df.name))
    display_missing(df)

Training Set
PassengerId column missing values: 0
Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


Test Set
PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




In [8]:
#**AGE**

In [9]:
# Updated DataFrame based on your provided data
data = {
    'PassengerId': [1091, 1002, 905],
    'Pclass': [3, 2, 2],
    'Name': ['Rasmussen, Mrs. (Lena Jacobsen Solvang)', 'Stanton, Mr. Samuel Ward', 'Howard, Mr. Benjamin'],
    'Sex': ['female', 'male', 'male'],
    'Age': [np.nan, 41.0, 63.0],
    'SibSp': [0, 0, 1],
    'Parch': [0, 0, 0],
    'Ticket': ['65305', '237734', '24065'],
    'Fare': [8.1125, 15.0458, 26.0000],
    'Cabin': [np.nan, np.nan, np.nan],
    'Embarked': ['S', 'C', 'S']
}

df_all = pd.DataFrame(data)

# Select only numeric columns
df_numeric = df_all.select_dtypes(include=[np.number])

# Handle missing values (optional): fill with a specific value, or drop rows/columns
# For simplicity, we'll use forward fill to fill missing values
df_numeric = df_numeric.fillna(method='ffill')  # You could also use other methods like mean imputation

# Calculate correlations
df_all_corr = df_numeric.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
df_all_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)

# Filter correlations involving 'Age'
age_correlations = df_all_corr[df_all_corr['Feature 1'] == 'Age']

print(age_correlations)

   Feature 1    Feature 2  Correlation Coefficient
1        Age  PassengerId                      1.0
5        Age         Fare                      1.0
6        Age          Age                      1.0
7        Age        SibSp                      1.0
28       Age       Pclass                      NaN
29       Age        Parch                      NaN
30       Age        Cabin                      NaN


In [10]:
data = {
    'PassengerId': [1091, 1002, 905],
    'Pclass': [3, 2, 2],
    'Name': ['Rasmussen, Mrs. (Lena Jacobsen Solvang)', 'Stanton, Mr. Samuel Ward', 'Howard, Mr. Benjamin'],
    'Sex': ['female', 'male', 'male'],
    'Age': [np.nan, 41.0, 63.0],
    'SibSp': [0, 0, 1],
    'Parch': [0, 0, 0],
    'Ticket': ['65305', '237734', '24065'],
    'Fare': [8.1125, 15.0458, 26.0000],
    'Cabin': [np.nan, np.nan, np.nan],
    'Embarked': ['S', 'C', 'S']
}

df_all = pd.DataFrame(data)

# Step 1: Calculate median age by 'Sex' and 'Pclass'
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass'])['Age'].median()

# Print median age for each combination of 'Sex' and 'Pclass'
for pclass in range(1, 4):
    for sex in ['female', 'male']:
        # Check if the group exists in the median DataFrame
        if (sex, pclass) in age_by_pclass_sex.index:
            print(f'Median age of Pclass {pclass} {sex}s: {age_by_pclass_sex[sex][pclass]}')
        else:
            print(f'Median age of Pclass {pclass} {sex}s: Not available')

# Print median age of all passengers
print(f'Median age of all passengers: {df_all["Age"].median()}')

# Step 2: Fill missing values in 'Age' with median values from 'Sex' and 'Pclass' groups
def fill_age_with_group_median(df):
    return df.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

# Apply the filling function
df_all['Age'] = fill_age_with_group_median(df_all)

# Verify results
print(df_all)

Median age of Pclass 1 females: Not available
Median age of Pclass 1 males: Not available
Median age of Pclass 2 females: Not available
Median age of Pclass 2 males: 52.0
Median age of Pclass 3 females: nan
Median age of Pclass 3 males: Not available
Median age of all passengers: 52.0


TypeError: incompatible index of inserted column with frame index

In [None]:
print(df_all.columns)

df_all.columns = df_all.columns.str.strip()

print(df_all.head())


In [None]:
import pandas as pd

# Example DataFrame (adjust according to your actual data)
df_all = pd.DataFrame({
    'Sex': ['female', 'male', 'female', 'male'],
    'Pclass': [1, 2, 1, 3],
    'Age': [22, 25, 30, 29]
})

# Check column names
print("Columns in DataFrame:", df_all.columns)

# Strip whitespace from column names (if needed)
df_all.columns = df_all.columns.str.strip()

# Verify the DataFrame
print(df_all.head())

# Perform the groupby operation
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass']).median()['Age']

print(age_by_pclass_sex)


In [None]:
# Example DataFrame (adjust according to your actual data)
df_all = pd.DataFrame({
    'Sex': ['female', 'male', 'female', 'male'],
    'Pclass': [1, 2, 1, 3],
    'Age': [22, 25, 30, 29]
})

# Calculate median age by Sex and Pclass
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass']).median()['Age']

# Print median ages
for pclass in range(1, 4):
    for sex in ['female', 'male']:
        if (sex, pclass) in age_by_pclass_sex.index:
            median_age = age_by_pclass_sex.loc[sex, pclass]
            print('Median age of Pclass {} {}s: {}'.format(pclass, sex, median_age))
        else:
            print('Median age of Pclass {} {}s: Not available'.format(pclass, sex))

# Print the median age of all passengers
print('Median age of all passengers: {}'.format(df_all['Age'].median()))


In [None]:
#**EMBARKED**

In [None]:
# Example DataFrame creation (replace with your actual data loading method)
df_all = pd.DataFrame({
    'PassengerId': [1, 2, 3],
    'Pclass': [1, 2, 3],
    'Name': ['A', 'B', 'C'],
    'Age': [22, 38, 26]
    # 'Embarked' column is intentionally missing here for illustration
})

# Print columns to verify existence of 'Embarked'
print("Columns in DataFrame:", df_all.columns)

# Strip any extra spaces from column names
df_all.columns = df_all.columns.str.strip()



In [None]:
# Check for the presence of 'Embarked'
if 'Embarked' in df_all.columns:
    # Perform operations if 'Embarked' exists
    print(df_all[df_all['Embarked'].isnull()])
else:
    print("'Embarked' column is not present in the DataFrame")


# Filling the missing values in Embarked with S
df_all[df_all['Embarked'].isnull()]

df_all['Embarked'] = 'S'

print(df_all)

In [None]:
#**FARE**

In [None]:
df_all = pd.DataFrame({
    'PassengerId': [1, 2, 3],
    'Pclass': [1, 2, 3],
    'Name': ['A', 'B', 'C'],
    'Age': [22, 38, 26]
    # 'fare' column is intentionally missing here for illustration
})

# Print columns to verify existence of 'fare'
print("Columns in DataFrame:", df_all.columns)

# Strip any extra spaces from column names
df_all.columns = df_all.columns.str.strip()

# Check for the presence of 'Embarked'
if 'fare' in df_all.columns:
    # Perform operations if 'Embarked' exists
    print(df_all[df_all['fare'].isnull()])
else:
    print("'fare' column is not present in the DataFrame")


In [None]:

df_all = pd.DataFrame({
    'Pclass': [1, 2, 3, 3, 1, 3],
    'Parch': [0, 1, 0, 2, 0, 1],
    'SibSp': [1, 0, 1, 1, 0, 0],
    'Fare': [50, 30, 20, None, 70, None]
})

# Group by 'Pclass', 'Parch', 'SibSp' and calculate median of 'Fare'
med_fare_series = df_all.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()

# Print available index combinations
print("Available groups and their median fares:")
print(med_fare_series)

# Choose a valid group, or use a fallback
try:
    # Attempt to retrieve the median Fare for a specific group
    med_fare = med_fare_series.loc[(3, 0, 0)]
except KeyError:
    # Use the overall median fare if the specific group is not present
    print("Group (3, 0, 0) not found, using overall median fare instead.")
    med_fare = df_all['Fare'].median()

# Fill missing values in 'Fare' with the computed median
df_all['Fare'] = df_all['Fare'].fillna(med_fare)

print(df_all)

In [None]:
#**Cabin**

In [None]:
print(df_all.columns)
# Check columns available after groupby operation
grouped_df = df_all.groupby(['Pclass']).count()
print(grouped_df.columns)


In [None]:
# Adjust the list of columns to drop based on the existing columns
columns_to_drop = ['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'PassengerId', 'Ticket']
existing_columns_to_drop = [col for col in columns_to_drop if col in grouped_df.columns]

# Drop columns
df_all_decks = grouped_df.drop(columns=existing_columns_to_drop).rename(columns={'Name': 'Count'}).transpose()
print(df_all_decks)

# Inspect initial columns of df_all
print(df_all.columns)



In [None]:
# Group by 'Pclass' and count
grouped_df = df_all.groupby(['Pclass']).count()

# Define columns to drop
columns_to_drop = ['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'PassengerId', 'Ticket']
existing_columns_to_drop = [col for col in columns_to_drop if col in grouped_df.columns]

# Drop existing columns and rename
df_all_decks = grouped_df.drop(columns=existing_columns_to_drop, errors='ignore').rename(columns={'Name': 'Count'}).transpose()

# Print or analyze the result
print(df_all_decks)


In [None]:
# Creating Deck column from the first letter of the Cabin column (M stands for Missing)
df_all['Deck'] = df_all['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

df_all_decks = df_all.groupby(['Deck', 'Pclass']).count().drop(columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 
                                                                        'Fare', 'Embarked', 'Cabin', 'PassengerId', 'Ticket']).rename(columns={'Name': 'Count'}).transpose()

def get_pclass_dist(df):
    
    # Creating a dictionary for every passenger class count in every deck
    deck_counts = {'A': {}, 'B': {}, 'C': {}, 'D': {}, 'E': {}, 'F': {}, 'G': {}, 'M': {}, 'T': {}}
    decks = df.columns.levels[0]    
    
    for deck in decks:
        for pclass in range(1, 4):
            try:
                count = df[deck][pclass][0]
                deck_counts[deck][pclass] = count 
            except KeyError:
                deck_counts[deck][pclass] = 0
                
    df_decks = pd.DataFrame(deck_counts)    
    deck_percentages = {}

    # Creating a dictionary for every passenger class percentage in every deck
    for col in df_decks.columns:
        deck_percentages[col] = [(count / df_decks[col].sum()) * 100 for count in df_decks[col]]
        
    return deck_counts, deck_percentages

def display_pclass_dist(percentages):
    
    df_percentages = pd.DataFrame(percentages).transpose()
    deck_names = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'M', 'T')
    bar_count = np.arange(len(deck_names))  
    bar_width = 0.85
    
    pclass1 = df_percentages[0]
    pclass2 = df_percentages[1]
    pclass3 = df_percentages[2]
    
    plt.figure(figsize=(20, 10))
    plt.bar(bar_count, pclass1, color='#b5ffb9', edgecolor='white', width=bar_width, label='Passenger Class 1')
    plt.bar(bar_count, pclass2, bottom=pclass1, color='#f9bc86', edgecolor='white', width=bar_width, label='Passenger Class 2')
    plt.bar(bar_count, pclass3, bottom=pclass1 + pclass2, color='#a3acff', edgecolor='white', width=bar_width, label='Passenger Class 3')

    plt.xlabel('Deck', size=15, labelpad=20)
    plt.ylabel('Passenger Class Percentage', size=15, labelpad=20)
    plt.xticks(bar_count, deck_names)    
    plt.tick_params(axis='x', labelsize=15)
    plt.tick_params(axis='y', labelsize=15)
    
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), prop={'size': 15})
    plt.title('Passenger Class Distribution in Decks', size=18, y=1.05)   
    
    plt.show()    

all_deck_count, all_deck_per = get_pclass_dist(df_all_decks)
display_pclass_dist(all_deck_per)