# Reading in intial dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (
    BaggingClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
    VotingClassifier,
)
import pickle

train = pd.read_csv("titanic_train.csv")
test = pd.read_csv("titanic_test.csv")
gender_submission = pd.read_csv("gender_submission.csv")

df_data = pd.concat([train, test])
df_data[885:895]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
885,886,0.0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0.0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1.0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1.0,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0.0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q
0,892,,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S


## Title Variable Extraction (Only for replacing the age with title median)

In [2]:
df_data["Title"] = df_data.Name.str.extract(" ([A-Za-z]+)\.", expand=False)

df_data["Title"] = df_data["Title"].replace("Master", "Master")
df_data["Title"] = df_data["Title"].replace("Mlle", "Miss")
df_data["Title"] = df_data["Title"].replace(["Mme", "Dona", "Ms"], "Mrs")
df_data["Title"] = df_data["Title"].replace(["Don", "Jonkheer"], "Mr")
df_data["Title"] = df_data["Title"].replace(
    ["Capt", "Rev", "Major", "Col", "Dr"], "Military"
)
df_data["Title"] = df_data["Title"].replace(["Lady", "Countess", "Sir"], "Honor")

train["Title"] = df_data["Title"][:891]
test["Title"] = df_data["Title"][891:]

# convert Title categories to Columns
titledummies = pd.get_dummies(train[["Title"]], prefix_sep="_")  # Title
train = pd.concat([train, titledummies], axis=1)
ttitledummies = pd.get_dummies(test[["Title"]], prefix_sep="_")  # Title
test = pd.concat([test, ttitledummies], axis=1)

print(train.columns, test.columns)

# Imputes age by median based on specific title
titles = ["Master", "Miss", "Mr", "Mrs", "Military", "Honor"]
for title in titles:
    age_to_impute = df_data.groupby("Title")["Age"].median()[title]
    df_data.loc[(df_data["Age"].isnull()) & (df_data["Title"] == title), "Age"] = (
        age_to_impute
    )
train["Age"] = df_data["Age"][:891]
test["Age"] = df_data["Age"][891:]

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Title_Honor',
       'Title_Master', 'Title_Military', 'Title_Miss', 'Title_Mr',
       'Title_Mrs'],
      dtype='object') Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Title_Master',
       'Title_Military', 'Title_Miss', 'Title_Mr', 'Title_Mrs'],
      dtype='object')


## Dummies and Drops

In [3]:
train = pd.get_dummies(train, columns=["Pclass"], prefix=["class"])
test = pd.get_dummies(test, columns=["Pclass"], prefix=["class"])
train[["class_1", "class_2", "class_3"]] = train[
    ["class_1", "class_2", "class_3"]
].astype(int)
test[["class_1", "class_2", "class_3"]] = test[
    ["class_1", "class_2", "class_3"]
].astype(int)

train["Sex"] = pd.get_dummies(train[["Sex"]], drop_first=True)
train.rename(columns={"Sex": "Male"}, inplace=True)
test["Sex"] = pd.get_dummies(test[["Sex"]], drop_first=True)
test.rename(columns={"Sex": "Male"}, inplace=True)


train = train.drop(["Name", "Ticket", "Embarked", "Cabin"], axis=1)
train = train.drop(["Title"], axis=1)

title_list = [
    "Title_Honor",
    "Title_Master",
    "Title_Military",
    "Title_Miss",
    "Title_Mr",
    "Title_Mrs",
]

train = train.drop(title_list, axis=1)

test = test.drop(["Name", "Ticket", "Embarked", "Cabin"], axis=1)
test = test.drop(["Title"], axis=1)

test_title_list = [
    "Title_Master",
    "Title_Military",
    "Title_Miss",
    "Title_Mr",
    "Title_Mrs",
]

test = test.drop(test_title_list, axis=1)

test["Fare"].fillna(value=round(test["Fare"].mean()), inplace=True)

print(train.columns, test.columns)

Index(['PassengerId', 'Survived', 'Male', 'Age', 'SibSp', 'Parch', 'Fare',
       'class_1', 'class_2', 'class_3'],
      dtype='object') Index(['PassengerId', 'Male', 'Age', 'SibSp', 'Parch', 'Fare', 'class_1',
       'class_2', 'class_3'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Fare'].fillna(value = round(test['Fare'].mean()), inplace = True)


In [6]:
with open("train.pkl", "wb") as file:
    pickle.dump(train, file)

with open("test.pkl", "wb") as file:
    pickle.dump(test, file)

print(test.iloc[1,])

PassengerId      893
Male           False
Age             47.0
SibSp              1
Parch              0
Fare             7.0
class_1            0
class_2            0
class_3            1
Name: 1, dtype: object


# x_train, y_train, x_test, y_test

In [13]:
x_train = train[
    ["Male", "Age", "SibSp", "Parch", "Fare", "class_1", "class_2", "class_3"]
]
y_train = train[["Survived"]]

x_test = test[
    ["Male", "Age", "SibSp", "Parch", "Fare", "class_1", "class_2", "class_3"]
]
y_test = gender_submission[["Survived"]]

In [14]:
# define the scaler
scaler = MinMaxScaler()

# define models
lr_mod = LogisticRegression()
svc_model = SVC()
rf_model = RandomForestClassifier()
knn_mod = KNeighborsClassifier(n_neighbors=5)

# pipelines
pipeline_lr = Pipeline([("scaler", scaler), ("lr", lr_mod)])

pipeline_svc = Pipeline([("scaler", scaler), ("svc", svc_model)])

pipeline_rf = Pipeline([("scaler", scaler), ("rfc", rf_model)])

pipeline_knn = Pipeline([("scaler", scaler), ("knn", knn_mod)])

In [15]:
# Fit pipelines
pipeline_lr.fit(x_train, y_train)
y_pred_lr = pipeline_lr.predict(x_test)
pipeline_lr_acc = accuracy_score(y_pred_lr, y_test)
print(f"lr accuracy = {pipeline_lr_acc}")

pipeline_svc.fit(x_train, y_train)
y_pred_svc = pipeline_svc.predict(x_test)
pipeline_svc_acc = accuracy_score(y_pred_svc, y_test)
print(f"svc accuracy = {pipeline_svc_acc}")

pipeline_rf.fit(x_train, y_train)
y_pred_rf = pipeline_rf.predict(x_test)
pipeline_rf_acc = accuracy_score(y_pred_rf, y_test)
print(f"rf accuracy = {pipeline_rf_acc}")

pipeline_knn.fit(x_train, y_train)
y_pred_knn = pipeline_knn.predict(x_test)
pipeline_knn_acc = accuracy_score(y_pred_knn, y_test)
print(f"knn accuracy = {pipeline_knn_acc}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


lr accuracy = 0.9593301435406698
svc accuracy = 0.9832535885167464
rf accuracy = 0.8157894736842105
knn accuracy = 0.8373205741626795


  return self._fit(X, y)


In [16]:
with open("svc_pipeline.pkl", "wb") as file:
    pickle.dump(pipeline_svc, file)

with open("svc_pipeline.pkl", "rb") as file:
    svc_pipeline2 = pickle.load(file)

svc_pipeline2.fit(x_train, y_train)
y_pred_svc2 = svc_pipeline2.predict(x_test)
svc_pipeline2_acc = accuracy_score(y_pred_svc2, y_test)
print(f"svc2 acc = {svc_pipeline2_acc}")

svc2 acc = 0.9832535885167464


  y = column_or_1d(y, warn=True)


In [7]:
# Bagging
bagging_model = BaggingClassifier(
    estimator=pipeline_lr, n_estimators=10, random_state=42
)
bagging_model.fit(x_train, y_train)
y_pred = bagging_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"lr_model: {accuracy}")

# Boosting
boosting_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, random_state=42
)
boosting_model.fit(x_train, y_train)
y_pred = boosting_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Boosting Model Accuracy: {accuracy:.2f}")

# Stacking
level1_models = [("svc", svc_model), ("rf", rf_model), ("knn", knn_mod)]
# Define the final estimator
final_estimator = lr_mod

stacking_model = StackingClassifier(
    estimators=level1_models, final_estimator=final_estimator, cv=5
)
stacking_model.fit(x_train, y_train)
y_pred = stacking_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Stacking Model Accuracy: {accuracy:.2f}")

# Majority
voting_model = VotingClassifier(
    estimators=level1_models, voting="hard"
)  # Hard voting for classification - SOFT is regression
voting_model.fit(x_train, y_train)
y_pred = voting_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Majority Voting Model Accuracy: {accuracy:.2f}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


lr_model: 0.9665071770334929
Boosting Model Accuracy: 0.88


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Stacking Model Accuracy: 0.81


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Majority Voting Model Accuracy: 0.73


In [8]:
print(voting_model.__class__.__name__)

dict = {"a": 1, "b": 2, "c": 3}

VotingClassifier


3
