In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support,accuracy_score,make_scorer
from sklearn.preprocessing import StandardScaler

In [2]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0] 
                               if X[c].dtype == np.dtype('O') 
                               else X[c].mean() for c in X],index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [3]:
df = pd.read_csv("train.csv")

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [6]:
df = DataFrameImputer().fit_transform(df)

In [7]:
df.drop(columns=["Name","Ticket","Cabin"],inplace = True,axis = 1)

In [8]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [9]:
str_dtyp = []
for dt in df.columns:
    if(df[dt].dtype==np.dtype('O')):
        str_dtyp.append(dt)
df3 = pd.get_dummies(df[str_dtyp],drop_first=True)

In [10]:
data = pd.concat([df3,df],axis=1)
data.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,1,1,0,3,male,22.0,1,0,7.25,S
1,0,0,0,2,1,1,female,38.0,1,0,71.2833,C
2,0,0,1,3,1,3,female,26.0,0,0,7.925,S
3,0,0,1,4,1,1,female,35.0,1,0,53.1,S
4,1,0,1,5,0,3,male,35.0,0,0,8.05,S


In [11]:
data.drop(columns=["Sex","Embarked"],inplace=True)

In [12]:
X = data.drop(columns=["Pclass"])
y = data.iloc[:,4]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
GB = GradientBoostingClassifier()
RF = RandomForestClassifier()

In [17]:
GB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [18]:
RF.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
y_pred_gb = GB.predict(X_test)
y_pred_rf = RF.predict(X_test)

In [20]:
precision_gb,recall_gb,fscore_gb,support_gb = precision_recall_fscore_support(y_test,y_pred_gb)

In [21]:
precision_rf,recall_rf,fscore_rf,support_rf = precision_recall_fscore_support(y_test,y_pred_rf)

In [22]:
accuracy_gb = accuracy_score(y_test,y_pred_gb)
accuracy_rf = accuracy_score(y_test,y_pred_rf)
conf_matrix_gb = confusion_matrix(y_test,y_pred_gb)
conf_matrix_rf = confusion_matrix(y_test,y_pred_rf)

In [23]:
print("GB:{}".format(accuracy_gb))
print("RF:{}".format(accuracy_rf))

GB:1.0
RF:1.0


In [24]:
from sklearn.metrics import roc_curve, auc

In [25]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.ravel(), y_pred_gb)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

1.0

## learning_rate Optimisation

In [39]:
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
train_results = []
test_results = []
for eta in learning_rates:
    model = GradientBoostingClassifier(learning_rate=eta)
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train.reshape(-1,1), y_pred_gb)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = model.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.ravel(), y_pred_gb)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(learning_rates, train_results, 'b', label='Train AUC')
line2, = plt.plot(learning_rates, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('learning rate')
plt.show()

AttributeError: 'Series' object has no attribute 'reshape'

In [41]:
from sklearn.model_selection import GridSearchCV

In [55]:
parameters = {
    'learning_rate' : [1, 0.5, 0.25, 0.1, 0.05, 0.01],
    'n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_depth' : np.linspace(1, 32, 32, endpoint=True),
    'min_samples_split' : np.linspace(0.1, 1.0, 10, endpoint=True),
    'min_samples_leaf' : np.linspace(0.1, 0.5, 5, endpoint=True),
    "max_features":["log2","sqrt"]
}

In [57]:
import time

In [58]:
# Type of scoring used to compare parameter combinations
strt = time.time()
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(GB, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
end = time.time()
print("Total Time: {}".format(end-strt))



Total Time: 19988.30820083618


In [59]:
grid_obj.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1, loss='deviance', max_depth=1.0,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=0.1, min_samples_split=0.1,
              min_weight_fraction_leaf=0.0, n_estimators=8,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [61]:
final_clf = grid_obj.best_estimator_

In [63]:
final_clf.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1, loss='deviance', max_depth=1.0,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=0.1, min_samples_split=0.1,
              min_weight_fraction_leaf=0.0, n_estimators=8,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [74]:
final_pred = final_clf.predict(X_test)

In [75]:
accuracy_score(y_test.ravel(),final_pred)

1.0

In [76]:
accuracy_score(y_test,final_pred)

1.0

## Final Calculate

In [3]:
# Our Model is

In [4]:
model = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1, loss='deviance', max_depth=1.0,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=0.1, min_samples_split=0.1,
              min_weight_fraction_leaf=0.0, n_estimators=8,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [5]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [6]:
df_train_imp = DataFrameImputer().fit_transform(df_train)
df_test_imp = DataFrameImputer().fit_transform(df_test)

In [7]:
df_test_imp.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,B57 B59 B63 B66,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,B57 B59 B63 B66,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,B57 B59 B63 B66,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,B57 B59 B63 B66,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,B57 B59 B63 B66,S


In [8]:
df_train_imp.drop(columns=["Name","Ticket","Cabin","PassengerId"],inplace = True,axis = 1)
df_test_imp.drop(columns=["Name","Ticket","Cabin","PassengerId"],inplace = True,axis = 1)

In [9]:
print(df_train_imp.head())
print(df_test_imp.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  34.5      0      0   7.8292        Q
1       3  female  47.0      1      0   7.0000        S
2       2    male  62.0      0      0   9.6875        Q
3       3    male  27.0      0      0   8.6625        S
4       3  female  22.0      1      1  12.2875        S


In [10]:
str_dtyp = []
for dt in df_train_imp.columns:
    if(df_train_imp[dt].dtype==np.dtype('O')):
        str_dtyp.append(dt)
df_train_dum = pd.get_dummies(df_train_imp[str_dtyp],drop_first=True)
df_test_dum = pd.get_dummies(df_test_imp[str_dtyp],drop_first=True)

In [11]:
df_train_imp.drop(columns=["Embarked","Sex"],inplace=True)
df_test_imp.drop(columns=["Embarked","Sex"],inplace=True)

In [12]:
train_data = pd.concat([df_train_dum,df_train_imp],axis=1)
test_data = pd.concat([df_test_dum,df_test_imp],axis=1)

In [13]:
train_data.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,1,0,3,22.0,1,0,7.25
1,0,0,0,1,1,38.0,1,0,71.2833
2,0,0,1,1,3,26.0,0,0,7.925
3,0,0,1,1,1,35.0,1,0,53.1
4,1,0,1,0,3,35.0,0,0,8.05


In [14]:
test_data.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Pclass,Age,SibSp,Parch,Fare
0,1,1,0,3,34.5,0,0,7.8292
1,0,0,1,3,47.0,1,0,7.0
2,1,1,0,2,62.0,0,0,9.6875
3,1,0,1,3,27.0,0,0,8.6625
4,0,0,1,3,22.0,1,1,12.2875


In [15]:
X_train = train_data.drop(columns=["Survived"]).values
y_train = train_data.Survived.values

In [16]:
X_test = test_data.values

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
model.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1, loss='deviance', max_depth=1.0,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=0.1, min_samples_split=0.1,
              min_weight_fraction_leaf=0.0, n_estimators=8,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [21]:
y_predict = model.predict(X_test)

In [33]:
data = {
    'Survived': y_predict
}

In [34]:
kaggle_upload = pd.DataFrame(data = data,index=df_test.PassengerId)

In [36]:
kaggle_upload.to_csv("My_Submission.csv")