In [5]:
import numpy as np
import pandas as pd

In [6]:
# Define Function for removing column

def remove_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(['PassengerId','Cabin','Ticket','Name'],axis = 1)


In [7]:
# Testing Function For Name Features
def preprocessName(df: pd.DataFrame) -> pd.DataFrame:
    df['Last Name'] = df.apply(lambda row: row["Name"][:row["Name"].find(',')], axis = 1)
    df['Title'] = df.apply(lambda row: row["Name"][row["Name"].find(',')+2:row["Name"].find('.')], axis=1)
    return df 

In [8]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(type(df))
print(type(df_test))
output_Id = df_test['PassengerId']

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [12]:
print(df.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [20]:
print(df.select_dtypes("object").isna().sum() > 0)

Name        False
Sex         False
Ticket      False
Cabin        True
Embarked     True
dtype: bool


In [5]:
# Transformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# df = remove_columns(df)
# df_test = remove_columns(df_test)

# X_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Y_columns = ['Survived']

X_train, X_test, y_train, y_test = train_test_split(df.drop(["Survived"], axis = 1), df["Survived"], test_size=.33,random_state=10)

X_train = preprocessName(X_train)
X_test = preprocessName(X_test)

# Scale numeric values

num_features = ['Age','SibSp','Parch','Fare']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Transformer for Embarked
embarked_transformer = Pipeline(steps=[('imputing', SimpleImputer(strategy='most_frequent')),('encoding', OneHotEncoder())])

# Categorical encoding
categorical_features = ['Sex', 'Pclass', 'Title', 'Last Name', 'Ticket']

# Name


preprocessor = ColumnTransformer(transformers=[('embark', embarked_transformer, ['Embarked']), ('num_transform', num_transformer, num_features),('cat_transformer', OneHotEncoder(handle_unknown='ignore'), categorical_features)],remainder='drop')

pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [6]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# np.set_printoptions(precision=4, threshold = sys.maxsize)

pipe.fit(X_train, y_train)
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

In [7]:
# Predict 
print(type(X_train))

<class 'scipy.sparse.csr.csr_matrix'>


In [8]:
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,make_scorer
from sklearn.model_selection import GridSearchCV

# Trying multiple models

# Additional Imports
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

clf = MLPClassifier()
clf.fit(X_train, y_train)
print("Accuracy for MPC CLassifier", accuracy_score(y_test, clf.predict(X_test)))

clf = SVC()
clf.fit(X_train, y_train)
print("Accuracy for SVC", accuracy_score(y_test, clf.predict(X_test)))

clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
print("Accuracy for AdaBoostClassifier", accuracy_score(y_test, clf.predict(X_test)))

clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
print("Accuracy for KNeighborsClassifier", accuracy_score(y_test, clf.predict(X_test)))

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print("Accuracy for DecisionTreeClassifier", accuracy_score(y_test, clf.predict(X_test)))

# clf = GaussianProcessClassifier()
# clf.fit(X_train, y_train)
# print("Accuracy for GaussianProcessClassifier", accuracy_score(y_test, clf.predict(X_test)))


Accuracy for MPC CLassifier 0.8372881355932204
Accuracy for SVC 0.8338983050847457
Accuracy for AdaBoostClassifier 0.8338983050847457
Accuracy for KNeighborsClassifier 0.8372881355932204
Accuracy for DecisionTreeClassifier 0.8067796610169492


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)

print(accuracy_score(y_test, clf.predict(X_test)))

0.8440677966101695


In [10]:
# scorer = make_scorer(accuracy_score)

import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

n_estimators = [100, 300, 500, 800, 1200]
learning_rate = [0.01, 0.1, 0.2, 0.5]
max_depth = [1, 5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 
# kernel = ["linear", "poly", "rbf", "sigmoid"]
# degree = [1,2,3,4,5,6,7,8,9]

model = GradientBoostingClassifier(random_state=0)

hyperF = dict(n_estimators = n_estimators, learning_rate = learning_rate, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf=min_samples_leaf)

# clf = GridSearchCV(model, hyperF, verbose = 1, n_jobs = -1, refit='Accuracy', scoring = {'Accuracy': make_scorer(accuracy_score)})

# clf = RandomForestClassifier(max_depth= 25, min_samples_leaf = 1, min_samples_split = 10, n_estimators= 300,  n_jobs = -1)
# clf.fit(X_train, y_train)
# print(accuracy_score(y_test, clf.predict(X_test)))

# print(clf.best_params_)
# print(clf.cv_results_)

# bestF = gridF.fit(x_train, y_train)

# values = np.arange(10,30,1)

# arr = []

# for x in values:
#     clf = RandomForestClassifier(max_depth=5, random_state=0, n_estimators = 11, n_jobs = -1, verbose = 1)
#     clf.fit(X_train,y_train)
#     accuracy = accuracy_score(y_test, clf.predict(X_test))
    
#     arr.append(accuracy)

# plt.plot(values, np.array(arr))
# plt.title('Graph')
# plt.xlabel('Max Depth')
# plt.ylabel('Accuracy')
# plt.show()

In [11]:
# Training With Complete Train Data
# 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300
# df_Y = df["Survived"]
# df_X = df.drop(["Survived"], axis = 1)
# df_X = pipe.fit_transform(df_X, df_Y)
# clf.fit(df_X, df_Y)

In [12]:
# print(clf.best_params_)

In [13]:
# Predicting
# clf = RandomForestClassifier(max_depth= 25, min_samples_leaf = 1, min_samples_split = 10, n_estimators= 300,  n_jobs = -1)
# clf = SVC(degree= 1, kernel= 'rbf')
clf = GradientBoostingClassifier(learning_rate= 0.1, max_depth= 15, min_samples_leaf= 1, min_samples_split= 5, n_estimators= 500)
df_Y = df["Survived"]
df_X = df.drop(["Survived"], axis = 1)
df_X = preprocessName(df_X)
pipe.fit(df_X, df_Y)
df_X = pipe.transform(df_X)
clf.fit(df_X, df_Y)
# print(df_test.info())
df_test = preprocessName(df_test)
df_test = pipe.transform(df_test)
print(df_test)
prediction = clf.predict(df_test)
pred_df = pd.DataFrame(prediction, columns=['Survived'])
output = pd.concat([output_Id, pred_df], axis=1)

  (0, 1)	1.0
  (0, 3)	0.3948865804412651
  (0, 4)	-0.47454519624983954
  (0, 5)	-0.4736736092984604
  (0, 6)	-0.49078316061772326
  (0, 8)	1.0
  (0, 11)	1.0
  (0, 23)	1.0
  (0, 335)	1.0
  (1, 2)	1.0
  (1, 3)	1.3555096219574048
  (1, 4)	0.4327933656785018
  (1, 5)	-0.4736736092984604
  (1, 6)	-0.5074788432328381
  (1, 7)	1.0
  (1, 11)	1.0
  (1, 24)	1.0
  (2, 1)	1.0
  (2, 3)	2.5082572717767726
  (2, 4)	-0.47454519624983954
  (2, 5)	-0.4736736092984604
  (2, 6)	-0.4533668714188957
  (2, 8)	1.0
  (2, 10)	1.0
  (2, 23)	1.0
  :	:
  (415, 3)	0.7022859537264299
  (415, 4)	-0.47454519624983954
  (415, 5)	-0.4736736092984604
  (415, 6)	-0.5024451714361923
  (415, 8)	1.0
  (415, 11)	1.0
  (415, 23)	1.0
  (416, 2)	1.0
  (416, 3)	-0.10463740114712752
  (416, 4)	-0.47454519624983954
  (416, 5)	-0.4736736092984604
  (416, 6)	-0.4863374216869257
  (416, 8)	1.0
  (416, 11)	1.0
  (416, 23)	1.0
  (417, 0)	1.0
  (417, 3)	-0.10463740114712752
  (417, 4)	0.4327933656785018
  (417, 5)	0.7676298785983874
  (4

In [14]:
# Writing to output
output.to_csv (r'./export_dataframe.csv', index = False, header=True)