In [None]:
# import packages
!pip install mlxtend

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from mlxtend.frequent_patterns import apriori, fpgrowth
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()

In [None]:
# Set the location of the Titanic CSV file.
# NOTE(review): this is a hardcoded absolute path on one user's machine; anyone
# else running the notebook has to edit it to point at their own copy.
# Alternative: point at a data directory instead (pair it with the is_dir() check below).
#file_loc = Path(r'C:\Users\hruss\OneDrive\Documents\GMU\Repositories\Data_files')
file_loc = Path(r"C:/Users/pgarc_1jof181/Desktop/titanic.csv")

# Report whether the path exists before trying to read it:
# is_file() for the file path above, is_dir() for the directory alternative.
#print("file location exists:", file_loc.is_dir())
print("file location exists:", file_loc.is_file())

In [None]:
# Load the dataset from `file_loc` into a DataFrame.

# Alternative: read the file from the current working directory instead.
#df = pd.read_csv("titanic.csv")
df = pd.read_csv(file_loc)
# Display the first rows as a quick check that the file was parsed.
df.head()

In [None]:
# Find out the number of rows and columns; df.shape is a (rows, columns) tuple.

df.shape

In [None]:
# Count the missing values in every column, then report only the columns
# that actually contain at least one.

nulls = df.isna().sum()
print("These are the features with null values, and the count of null values in each.")
print(nulls.loc[nulls > 0])

In [None]:
# Impute missing 'age' values with the mean age of the row's 'who' group,
# then inspect the 'child' rows to check the result.

# Work on an explicit copy: `df1 = df` would only create a second name for the
# same object, so imputing through df1 would also overwrite the raw frame and
# make this cell non-repeatable.
df1 = df.copy()

# Per-row mean age of the group the row belongs to (same index as df1).
category_means = df1.groupby('who')['age'].transform('mean')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and has no effect once Copy-on-Write is enabled.
df1['age'] = df1['age'].fillna(category_means)

df1[df1['who'] == 'child'].describe()


In [None]:
# Try a quick association analysis to see whether deck = NaN follows a pattern.
# Step one: build the categories dataframe by dropping the columns that are
# not wanted for it.

columns_to_drop = [
    'pclass', 'sibsp', 'parch', 'Unnamed: 0', 'embarked',
    'sex', 'age', 'fare', 'alive', 'adult_male',
]
df_categories = df1.drop(columns=columns_to_drop)
df_categories

In [None]:
# Then we one-hot encode the category columns.

codes = cat_encoder.fit_transform(df_categories[['class', 'who', 'deck', 'embark_town']])
names = cat_encoder.get_feature_names_out()

# toarray() yields a regular ndarray (todense() returns the discouraged
# np.matrix type). Reusing df_categories' index keeps the encoded rows aligned
# with their source rows, which the later index-based merge depends on.
encoded_df = pd.DataFrame(codes.toarray(), columns=names, index=df_categories.index)
encoded_df

In [None]:
# Then we clean things up a bit: attach 'survived' and 'alone' to the encoded
# columns (matched on the row index) and make sure 'alone' is True/False.

df2 = pd.merge(encoded_df, df_categories[['survived', 'alone']], left_index=True, right_index=True)

# Assign the replaced column back instead of calling replace(inplace=True) on
# the column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and has no effect once Copy-on-Write is enabled, which would
# silently skip the conversion.
df2['alone'] = df2['alone'].replace([0, 1], [False, True])
df2

In [None]:
# Mine frequent itemsets with apriori (minimum support 0.2), then keep only
# the itemsets that have more than two items and include 'deck_nan'.

results = apriori(df2, use_colnames=True, min_support=.2)
has_more_than_two_items = results['itemsets'].map(len) > 2
includes_missing_deck = results['itemsets'].map(lambda items: 'deck_nan' in items)
results = results[has_more_than_two_items & includes_missing_deck]
results

In [None]:
# Let's look at the subset of the data we suspect is most closely related to deck = NaN.
# Filter to: who = man, embark town = Southampton, alone = True and class = Third,
# then show the per-column counts (column sums) for the rows that remain.

# Previously only the who_man condition was applied; all four conditions from
# the description are combined here.
# NOTE(review): 'embark_town_Southampton' and 'class_Third' assume the raw
# category values are spelled 'Southampton' and 'Third' — confirm against the
# columns of encoded_df.
subset_mask = (
    (df2['who_man'] == 1)
    & (df2['embark_town_Southampton'] == 1)
    & (df2['class_Third'] == 1)
    & df2['alone'].astype(bool)
)
df2[subset_mask].sum()

In [None]:
# Join df1 and df2 side by side for analysis like clustering, correlation, and PCA.

# axis=1 places the encoded columns next to the original ones, row by row.
# (axis=0 would stack the two frames on top of each other, doubling the row
# count and filling every non-shared column with NaN.)
# Columns present in both frames are dropped from df2 first so the result has
# no duplicated column names.
shared_columns = df2.columns.intersection(df1.columns)
df3 = pd.concat([df1, df2.drop(columns=shared_columns)], axis=1)
df3

In [None]:
# Rescale age and fare with min-max scaling so their larger raw magnitudes
# don't bias our clustering and correlation.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Each column is fitted and transformed on its own, one after the other.
for column_name in ('age', 'fare'):
    df3[column_name] = scaler.fit_transform(df3[[column_name]])
df3

In [None]:
# Discover correlations between the numeric / boolean columns.

# Restrict to numeric and boolean columns before correlating: df3 still holds
# string columns, and DataFrame.corr() raises on those in pandas >= 2.0
# (older versions silently dropped them).
numeric_columns = df3.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()