In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('train.csv')

In [6]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [28]:
from sklearn.impute import SimpleImputer
class ImputedData(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        imputer = SimpleImputer(strategy = 'mean')
        X["ImputerAge"] = imputer.fit_transform(X[['Age']])
        return X


In [46]:
from sklearn.preprocessing import OneHotEncoder
class EncodedData(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X['gender']=np.where(X['Sex'] == 'female',1,0)
        X = X.drop(columns=['Sex'])
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Embarked']]).toarray()
        column_name = ['C','S','Q','N']
        for i in range(len(matrix.T)):
            X[column_name[i]]= matrix.T[i]
        maskPclass1 = X['Pclass']==1
        maskPclass3 = X['Pclass']==3
        X.loc[maskPclass1,'Pclass']=3
        X.loc[maskPclass3,'Pclass']=1
        return X    

   

In [45]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X=X.drop(['Sex','Embarked','Name','Age','Cabin','N'],axis=1,errors='ignore')
        return X

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([('imputedData',ImputedData()),
                    ("encodedData",EncodedData()),
                    ('featureDropper',FeatureDropper()),
                    ('Scaler',StandardScaler())])

In [35]:
from sklearn.model_selection import train_test_split
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'gender'],
      dtype='object')

In [74]:
X= df[['Sex','Embarked','Age','Pclass']]
y= df['Survived'].values

In [75]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.2,stratify=y)

In [76]:
X_train_scaled = pipeline.fit_transform(X_train)

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [78]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
random_forest = RandomForestClassifier()

In [82]:
grid_search = GridSearchCV(random_forest, param_grid, cv=5, scoring='accuracy')

In [83]:
grid_search.fit(X_train_scaled, y_train)

In [84]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Best Parameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy:  0.8244755244755245


In [85]:
X_test_scaled = pipeline.transform(X_test)

In [86]:
grid_search.score(X_test_scaled,y_test)

0.7653631284916201

In [89]:
X_scaled = pipeline.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["ImputerAge"] = imputer.fit_transform(X[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['gender']=np.where(X['Sex'] == 'female',1,0)


In [91]:
grid_search.fit(X_scaled,y)

In [92]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Best Parameters:  {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Accuracy:  0.810369719414977


In [93]:
estimator = grid_search.best_estimator_

In [94]:
estimator

In [95]:
test_df = pd.read_csv("test.csv")

In [98]:
test_df_final = test_df[['Sex','Embarked','Age','Pclass']]

In [99]:
test_df_final = pipeline.transform(test_df_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["ImputerAge"] = imputer.fit_transform(X[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['gender']=np.where(X['Sex'] == 'female',1,0)


In [100]:
test_df_final

array([[-0.82737724,  0.36944878, -0.73769513, -0.48204268,  3.25137334,
        -1.61470971],
       [-0.82737724,  1.33137817,  1.35557354, -0.48204268, -0.30756234,
         0.61930636],
       [ 0.36936484,  2.48569343, -0.73769513, -0.48204268,  3.25137334,
        -1.61470971],
       ...,
       [-0.82737724,  0.67726619, -0.73769513, -0.48204268, -0.30756234,
         0.61930636],
       [-0.82737724,  0.04413122, -0.73769513, -0.48204268, -0.30756234,
         0.61930636],
       [-0.82737724,  0.04413122, -0.73769513,  2.0745051 , -0.30756234,
        -1.61470971]])

In [101]:
predictions = estimator.predict(test_df_final)

In [103]:
final_df = pd.DataFrame(test_df['PassengerId'])

In [105]:
final_df['Survived'] = predictions

In [108]:
final_df.to_csv("predictions.csv",index=False)