In [1]:
# Import supported libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Read Train and Test data-sets
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
titanic_train = train_data.copy()
titanic_test = test_data.copy()

## Data Exploration

In [4]:
# Gather info from the Train data-set
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Gather info from the Train data-set
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [6]:
# View top Train data
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# View top Test data
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# View Train bottom data
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [9]:
# View Test bottom data
test_data.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [10]:
# Train attributes data types
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [11]:
# Test attributes data types
test_data.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [12]:
# Train Summary
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
# Test Summary
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [14]:
# Train dimension
train_data.shape

(891, 12)

In [15]:
# Test dimension
test_data.shape

(418, 11)

In [16]:
# Train attributes name
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [17]:
# Test attributes name
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Data Pre-processing

In [18]:
### Train and Test data-sets : Drop insignificant attributes ###

# Finding unique values in Train data
train_data.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [19]:
# Finding unique values in Test data
test_data.nunique()

PassengerId    418
Pclass           3
Name           418
Sex              2
Age             79
SibSp            7
Parch            8
Ticket         363
Fare           169
Cabin           76
Embarked         3
dtype: int64

In [20]:
# 'PassengerId', 'Name' are having more unique values, so dropping them

# Train data-set
train_data.drop(['PassengerId', 'Name'], axis=1, inplace=True)

# Test data-set
test_data.drop(['PassengerId', 'Name'], axis=1, inplace=True)

In [21]:
### Train data-set : Covert Numerical attributes to Categorical attributes ###

# Need to convert these: Survived, Pclass, SibSp, Parch
print("Data-type Before: \n", train_data.dtypes)

print("-------------------------------------------")
train_data.loc[:,['Survived', 'Pclass', 'SibSp', 'Parch']] = train_data.loc[:,['Survived', 'Pclass', 'SibSp', 'Parch']].apply(lambda x: x.astype('category'))

print("Data-type After: \n", train_data.dtypes)

# or

## Convert numeric attributes to categorical at once in train
#num_columns = titanic_orig_train.select_dtypes(['int64']).columns
#for col in num_columns:
#    titanic_orig_train[col] = titanic_orig_train[col].astype('category', copy = False)

Data-type Before: 
 Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object
-------------------------------------------
Data-type After: 
 Survived    category
Pclass      category
Sex           object
Age          float64
SibSp       category
Parch       category
Ticket        object
Fare         float64
Cabin         object
Embarked      object
dtype: object


In [22]:
### Test data-set : Covert Numerical attributes to Categorical attributes ###

# Need to convert these: Survived, Pclass, SibSp, Parch
print("Data-type Before: \n", test_data.dtypes)

print("-------------------------------------------")
test_data.loc[:,['Pclass', 'SibSp', 'Parch']] = test_data.loc[:,['Pclass', 'SibSp', 'Parch']].apply(lambda x: x.astype('category'))

print("Data-type After: \n", test_data.dtypes)

Data-type Before: 
 Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object
-------------------------------------------
Data-type After: 
 Pclass      category
Sex           object
Age          float64
SibSp       category
Parch       category
Ticket        object
Fare         float64
Cabin         object
Embarked      object
dtype: object


In [23]:
### Train data-set : Convert all "object" data-type to "category" ###
train_data[['Sex', 'Ticket', 'Cabin', 'Embarked']] = train_data[['Sex', 'Ticket', 'Cabin', 'Embarked']].apply(lambda x: x.astype('category'))

# View data-types
train_data.dtypes

Survived    category
Pclass      category
Sex         category
Age          float64
SibSp       category
Parch       category
Ticket      category
Fare         float64
Cabin       category
Embarked    category
dtype: object

In [24]:
### Test data-set : Convert all "object" data-type to "category" ###
test_data[['Sex', 'Ticket', 'Cabin', 'Embarked']] = test_data[['Sex', 'Ticket', 'Cabin', 'Embarked']].apply(lambda x: x.astype('category'))

# View data-types
test_data.dtypes

Pclass      category
Sex         category
Age          float64
SibSp       category
Parch       category
Ticket      category
Fare         float64
Cabin       category
Embarked    category
dtype: object

In [25]:
### Train data-set : Drop or replace with Mode vales for attributes with more number of NaN values ###

# Find the missing values for each attributes
print(train_data.isnull().sum(axis=0))

print('---------------------------')

# Drop the Cabin attribute due to many missing values
train_data.drop(['Cabin'], axis=1, inplace=True)

# Replace with Mode value
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode().iloc[0])

# Impute the Age missing values with Mean values
#mean_imputer = Imputer()
#train_data = pd.DataFrame(mean_imputer.fit_transform(train_data), columns=train_data.columns)

# Find the missing values, if any
print(train_data.isnull().sum(axis=0))

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64
---------------------------
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      0
dtype: int64


In [26]:
### Test data-set : Drop or replace with Mode vales for attributes with more number of NaN values ###

# Find the missing values for each attributes
print(test_data.isnull().sum(axis=0))

print('---------------------------')

# Drop the Cabin attribute due to many missing values
test_data.drop(['Cabin'], axis=1, inplace=True)

# Replace with Mode value
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mode().iloc[0])

# Find the missing values, if any
print(test_data.isnull().sum(axis=0))

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64
---------------------------
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Ticket       0
Fare         0
Embarked     0
dtype: int64


In [27]:
### Train data-set : Separte the Categorical and Numerical data ###

# Categorical data Indexes
cat_train = train_data.dtypes[train_data.dtypes=='category'].index
print('CATEGORICAL ATTRIBUTES: \n', cat_train)

# Numeric data
num_train = train_data.columns.difference(cat_train)
print('NUMERICAL ATTRIBUTES: \n', num_train)

# Categorical data
cat_train_data = pd.DataFrame(train_data[cat_train])
cat_train_data.dtypes

CATEGORICAL ATTRIBUTES: 
 Index(['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Embarked'], dtype='object')
NUMERICAL ATTRIBUTES: 
 Index(['Age', 'Fare'], dtype='object')


Survived    category
Pclass      category
Sex         category
SibSp       category
Parch       category
Ticket      category
Embarked    category
dtype: object

In [28]:
# Removing Target attribute from Categorical data
cat_train_data.drop(['Survived'], axis=1, inplace=True)
cat_train_data.dtypes

Pclass      category
Sex         category
SibSp       category
Parch       category
Ticket      category
Embarked    category
dtype: object

In [29]:
# Numeric data
num_train_data = train_data[num_train]
num_train_data.dtypes

Age     float64
Fare    float64
dtype: object

In [30]:
### Test data-set : Separte the Categorical and Numerical data ###

# Categorical data indexes
cat_test = test_data.dtypes[test_data.dtypes=='category'].index
print('CATEGORICAL ATTRIBUTES: \n', cat_test)

# Numeric data
num_test = test_data.columns.difference(cat_test)
print('NUMERICAL ATTRIBUTES: \n', num_test)

# Categorical data
cat_test_data = pd.DataFrame(test_data[cat_test])
cat_test_data.dtypes

CATEGORICAL ATTRIBUTES: 
 Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Embarked'], dtype='object')
NUMERICAL ATTRIBUTES: 
 Index(['Age', 'Fare'], dtype='object')


Pclass      category
Sex         category
SibSp       category
Parch       category
Ticket      category
Embarked    category
dtype: object

In [31]:
# Numeric data
num_test_data = test_data[num_test]
num_test_data.dtypes

Age     float64
Fare    float64
dtype: object

In [32]:
### Train data-set : Impute the missing data with MEAN value

# Find the missing values
print("Before: \n", num_train_data.isnull().sum(axis=0))

print('-------------')

# Impute the Age missing values with Mean values
mean_imputer = Imputer()
num_train_data = pd.DataFrame(mean_imputer.fit_transform(num_train_data), columns=num_train_data.columns)

# Find the missing values again
print("After: \n", num_train_data.isnull().sum(axis=0))

Before: 
 Age     177
Fare      0
dtype: int64
-------------
After: 
 Age     0
Fare    0
dtype: int64


In [33]:
### Test data-set : Impute the missing data with MEAN value

# Find the missing values
print("Before: \n", num_test_data.isnull().sum(axis=0))

print('-------------')

# Impute the Age missing values with Mean values
mean_imputer = Imputer()
num_test_data = pd.DataFrame(mean_imputer.fit_transform(num_test_data), columns=num_test_data.columns)

# Find the missing values again
print("After: \n", num_test_data.isnull().sum(axis=0))

Before: 
 Age     86
Fare     0
dtype: int64
-------------
After: 
 Age     0
Fare    0
dtype: int64


In [34]:
### Train data-set : Find the number of levels for each attributes

print('Before Levels: \n', cat_train_data.iloc[:,0:6].nunique())

#print(cat_train_data[['Ticket']].head())
#print('min frequency: ', cat_train_data.groupby('Ticket').size().min())
#print('max frequency: ', cat_train_data.groupby('Ticket').size().max())
# Group levels according to frequencies

print('-------------------')
# Drop the Ticket attribute
cat_train_data.drop(['Ticket'], axis=1, inplace=True)

print('After Levels: \n', cat_train_data.iloc[:,0:6].nunique())

Before Levels: 
 Pclass        3
Sex           2
SibSp         7
Parch         7
Ticket      681
Embarked      3
dtype: int64
-------------------
After Levels: 
 Pclass      3
Sex         2
SibSp       7
Parch       7
Embarked    3
dtype: int64


In [35]:
### Test data-set : Find the number of levels for each attributes

print('Before Levels: \n', cat_test_data.iloc[:,0:5].nunique())

print('----------------')
# Drop the Ticket attribute
cat_test_data.drop(['Ticket'], axis=1, inplace=True)

print('After Levels: \n', cat_test_data.iloc[:,0:5].nunique())

Before Levels: 
 Pclass      3
Sex         2
SibSp       7
Parch       8
Ticket    363
dtype: int64
----------------
After Levels: 
 Pclass      3
Sex         2
SibSp       7
Parch       8
Embarked    3
dtype: int64


In [36]:
### Train data-set - Standardize the Numeric data-set ###

print(num_train_data.head())

print("------------------------------")

scaler = StandardScaler()

num_train_std = pd.DataFrame(scaler.fit_transform(num_train_data), columns=num_train_data.columns)

print(num_train_std.head())

    Age     Fare
0  22.0   7.2500
1  38.0  71.2833
2  26.0   7.9250
3  35.0  53.1000
4  35.0   8.0500
------------------------------
        Age      Fare
0 -0.592481 -0.502445
1  0.638789  0.786845
2 -0.284663 -0.488854
3  0.407926  0.420730
4  0.407926 -0.486337


In [37]:
### Test data-set - Standardize the Numeric data-set ###

print(num_test_data.head())

print("------------------------------")

scaler = StandardScaler()

num_test_std = pd.DataFrame(scaler.fit_transform(num_test_data), columns=num_test_data.columns)

print(num_test_std.head())

    Age     Fare
0  34.5   7.8292
1  47.0   7.0000
2  62.0   9.6875
3  27.0   8.6625
4  22.0  12.2875
------------------------------
        Age      Fare
0  0.334993 -0.497063
1  1.325530 -0.511926
2  2.514175 -0.463754
3 -0.259330 -0.482127
4 -0.655545 -0.417151


In [38]:
### Train data-set - Dummify the Categorical attributes ###

# Get the Categorical attribute columns
cat_train_col = cat_train_data.columns
print('Categorical columns: ', cat_train_col.shape)

# Convert all Categorical into dummies variables
train_cat_dummy = pd.concat([cat_train_data, pd.get_dummies(cat_train_data[:])], axis = 1)
print('Categorical with dummies: ', train_cat_dummy.shape)

# Get only dummies attributes
cat_train_col_dummy = train_cat_dummy.columns.difference(cat_train_col)
print('Categorical with only dummies columns: ', cat_train_col_dummy.shape)

# Remove orignial Categorical attributes
train_cat_dummy = pd.DataFrame(train_cat_dummy[cat_train_col_dummy])
print('Final Categorical with dummies: ', train_cat_dummy.shape)

Categorical columns:  (5,)
Categorical with dummies:  (891, 27)
Categorical with only dummies columns:  (22,)
Final Categorical with dummies:  (891, 22)


In [39]:
### Test data-set - Dummify the Categorical attributes ###

# Get the Categorical attribute columns
cat_test_col = cat_test_data.columns
print('Categorical columns: ', cat_test_col.shape)

# Convert all Categorical into dummies variables
test_cat_dummy = pd.concat([cat_test_data, pd.get_dummies(cat_test_data[:])], axis = 1)
print('Categorical with dummies: ', test_cat_dummy.shape)

# Get only dummies attributes
cat_test_col_dummy = test_cat_dummy.columns.difference(cat_test_col)
print('Categorical with only dummies columns: ', cat_test_col_dummy.shape)

# Remove orignial Categorical attributes
test_cat_dummy = pd.DataFrame(test_cat_dummy[cat_test_col_dummy])
print('Final Categorical with dummies: ', test_cat_dummy.shape)

Categorical columns:  (5,)
Categorical with dummies:  (418, 28)
Categorical with only dummies columns:  (23,)
Final Categorical with dummies:  (418, 23)


In [40]:
# Train data-set - extra columns in Train data than Test data, if any
train_cat_dummy.columns.difference(test_cat_dummy.columns)

Index([], dtype='object')

In [41]:
# Test data-set - extra columns in Train data than Test data, if any
test_cat_dummy.columns.difference(train_cat_dummy.columns)

Index(['Parch_9'], dtype='object')

In [42]:
test_cat_dummy.drop(['Parch_9'], axis=1, inplace=True)
print('Final Categorical with dummies: ', test_cat_dummy.shape)

Final Categorical with dummies:  (418, 22)


In [43]:
### Merging Numerical and Categorical data-sets ###

# Verify the dimension of Train and Test Categorical data
print('Train dim: ', train_cat_dummy.shape)
print('Train dim: ', num_train_data.shape)

# Verify the dimension of Train and Test Numerical data
print('Test dim: ', test_cat_dummy.shape)
print('Test dim: ', num_test_data.shape)

Train dim:  (891, 22)
Train dim:  (891, 2)
Test dim:  (418, 22)
Test dim:  (418, 2)


In [44]:
# Train data-set - Concatnate Numerical and Categorical data - Without Standardized data
train_final = pd.concat([num_train_data, train_cat_dummy], axis = 1)
print("Train dimension: ", train_final.shape)

train_final.head()

Train dimension:  (891, 24)


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,...,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8
0,22.0,7.25,0,0,1,1,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
1,38.0,71.2833,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,26.0,7.925,0,0,1,1,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
3,35.0,53.1,0,0,1,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,35.0,8.05,0,0,1,1,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0


In [45]:
# Test data-set - Concatnate Numerical and Categorical data - Without Standardized data
test_final = pd.concat([num_test_data, test_cat_dummy], axis = 1)
print("Test dimension: ", test_final.shape)

test_final.head()

Test dimension:  (418, 24)


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,...,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8
0,34.5,7.8292,0,1,0,1,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
1,47.0,7.0,0,0,1,1,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
2,62.0,9.6875,0,1,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,27.0,8.6625,0,0,1,1,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
4,22.0,12.2875,0,0,1,0,1,0,0,0,...,1,1,0,0,1,0,0,0,0,0


In [46]:
# Train data-set - Concatnate Numerical and Categorical data - Without Standardized data
train_final_std = pd.concat([num_train_std, train_cat_dummy], axis = 1)
print("Standardized Train dimension: ", train_final_std.shape)

train_final_std.head()

Standardized Train dimension:  (891, 24)


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,...,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8
0,-0.592481,-0.502445,0,0,1,1,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
1,0.638789,0.786845,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,-0.284663,-0.488854,0,0,1,1,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
3,0.407926,0.42073,0,0,1,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,0.407926,-0.486337,0,0,1,1,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0


In [47]:
# Test data-set - Concatnate Numerical and Categorical data - Without Standardized data
test_final_std = pd.concat([num_test_std, test_cat_dummy], axis = 1)
print("Standardized Test dimension: ", test_final_std.shape)

test_final_std.head()

Standardized Test dimension:  (418, 24)


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,...,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8
0,0.334993,-0.497063,0,1,0,1,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
1,1.32553,-0.511926,0,0,1,1,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
2,2.514175,-0.463754,0,1,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,-0.25933,-0.482127,0,0,1,1,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
4,-0.655545,-0.417151,0,0,1,0,1,0,0,0,...,1,1,0,0,1,0,0,0,0,0


In [48]:
y = train_data[['Survived']]

In [49]:
### Split the data-set into Train and Validation sets  - Without Std ###
x_train, x_val, y_train, y_val = train_test_split(train_final, y, test_size=0.3, random_state=56789)

In [50]:
# Dimension of Train and Test final
print(x_train.shape)
print(y_train.shape)

print('------------------')
print(x_val.shape)
print(y_val.shape)

(623, 24)
(623, 1)
------------------
(268, 24)
(268, 1)


In [51]:
### Split the data-set into Train and Validation sets  - With Std ###
x_train_std, x_val_std, y_train_std, y_val_std = train_test_split(train_final_std, y, test_size=0.3, 
                                                                      random_state=56789)

In [52]:
# Dimension of Train and Test final
print(x_train_std.shape)
print(y_train_std.shape)

print('------------------')
print(x_val_std.shape)
print(y_val_std.shape)

(623, 24)
(623, 1)
------------------
(268, 24)
(268, 1)


## Model Building

### a) Logistic Regression Model

In [53]:
# Build the Logistic Regression Model

model_log = LogisticRegression(C=1.5)
model_log.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [54]:
# Predict Train data
y_train_prediction = model_log.predict(x_train)

In [55]:
# Predict Validation data
y_val_prediction = model_log.predict(x_val)

In [56]:
# Accuracy of the Train data
print("Train Accuracy      : ", accuracy_score(y_train, y_train_prediction))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", accuracy_score(y_val, y_val_prediction))

Train Accuracy      :  0.8202247191011236
Validation Accuracy :  0.7835820895522388


In [57]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, y_train_prediction))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, y_val_prediction))

Train - Confusion Matrix
[[330  49]
 [ 63 181]]
-------------------------------------
Validation - Confusion Matrix
[[146  24]
 [ 34  64]]


In [58]:
# Predict Test data
y_test_prediction = model_log.predict(test_final)

### b) Logistic Regression Model - With RandomizedSearchCV

In [70]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()

param_grid = {"C" : np.arange(1, 3, 0.1), 
              "penalty" : ["l1", "l2"],         # l1 lasso l2 ridge
              "max_iter" : [100],  
              "n_jobs" : [-1]}

# Randomized Search Cross Validation returns an Object with best model parameters

log_search = RandomizedSearchCV(estimator = log, param_distributions = param_grid, cv=10)

In [71]:
log_search.fit(x_train, y_train_std.iloc[:,0])

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
       2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9]), 'penalty': ['l1', 'l2'], 'max_iter': [100], 'n_jobs': [-1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [72]:
log_search.best_estimator_

LogisticRegression(C=2.5000000000000013, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [73]:
# Accuracy of the Train data
print('Train Accuracy      : ', log_search.best_score_)

# Accuracy of the Vaildation data
print("Validation Accuracy : ", log_search.score(x_val, y_val))

Train Accuracy      :  0.8105939004815409
Validation Accuracy :  0.7835820895522388


In [74]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, log_search.predict(x_train)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, log_search.predict(x_val)))

Train - Confusion Matrix
[[331  48]
 [ 64 180]]
-------------------------------------
Validation - Confusion Matrix
[[146  24]
 [ 34  64]]


### b) Decision Tree Model

In [75]:
dt = DecisionTreeClassifier(max_leaf_nodes=7)
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=7, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [76]:
# Accuracy of the Train data
print("Train Accuracy      : ", dt.score(x_train, y_train))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", dt.score(x_val, y_val))

Train Accuracy      :  0.8378812199036918
Validation Accuracy :  0.7910447761194029


In [77]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, dt.predict(x_train)))

print('--------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, dt.predict(x_val)))

Train - Confusion Matrix
[[342  37]
 [ 64 180]]
--------------------------------
Validation - Confusion Matrix
[[150  20]
 [ 36  62]]


In [78]:
# Predict Test data
y_test_prediction_dt = dt.predict(test_final)

In [79]:
# Plot Decision Tree

#from sklearn.externals.six import StringIO  
#from IPython.display import Image  
#from sklearn.tree import export_graphviz
#import pydotplus

#dot_data = StringIO()
#export_graphviz(dt, out_file=dot_data, filled=True, rounded=True, special_characters=True)
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#Image(graph.create_png())

### c) Decision Tree Model - with Randomized Search cross-validation

In [80]:
#DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
#            max_features=None, max_leaf_nodes=7, min_impurity_split=1e-07,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
#            splitter='best')

In [81]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import tree

dt1 = tree.DecisionTreeClassifier()

param_grid = {'criterion': ['gini', 'entropy'],
              'max_leaf_nodes': np.arange(3, 20, 1),
              'min_samples_split': np.arange(0.01, 0.1, 0.001),
              'max_depth': np.arange(5, 15, 1),
              'min_weight_fraction_leaf': np.arange(0.01, 0.05, 0.001)}

# Randomized Search Cross Validation returns an Object with best model parameters

rsearch = RandomizedSearchCV(estimator = dt1, param_distributions = param_grid)

In [82]:
rsearch.fit(x_train, y_train_std.iloc[:,0])
rsearch.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=5, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=0.09699999999999992,
            min_weight_fraction_leaf=0.015999999999999993, presort=False,
            random_state=None, splitter='best')

In [83]:
# Accuracy of the Train data
print('Train Accuracy      : ', rsearch.best_score_)

# Accuracy of the Vaildation data
print("Validation Accuracy : ", rsearch.score(x_val, y_val))

Train Accuracy      :  0.8089887640449438
Validation Accuracy :  0.7761194029850746


In [84]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, rsearch.predict(x_train)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, rsearch.predict(x_val)))

Train - Confusion Matrix
[[334  45]
 [ 60 184]]
-------------------------------------
Validation - Confusion Matrix
[[142  28]
 [ 32  66]]


### d) Random Forest

In [85]:
#Random Forest - With default values and n_estimators=20 (random)
rf = RandomForestClassifier(n_estimators = 80, max_depth=5)
rf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [86]:
# Accuracy of the Train data
print('Train Accuracy      : ', rf.score(x_train, y_train))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", rf.score(x_val, y_val))

Train Accuracy      :  0.8587479935794543
Validation Accuracy :  0.8134328358208955


In [87]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, rf.predict(x_train)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, rf.predict(x_val)))

Train - Confusion Matrix
[[358  21]
 [ 67 177]]
-------------------------------------
Validation - Confusion Matrix
[[156  14]
 [ 36  62]]


### e) Bagging

In [88]:
# Bagging
bg = BaggingClassifier(DecisionTreeClassifier(max_depth=5), max_samples=0.5, max_features=1.0, n_estimators=20)

bg.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=20, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [89]:
# Accuracy of the Train data
print('Train Accuracy      : ', bg.score(x_train, y_train))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", bg.score(x_val, y_val))

Train Accuracy      :  0.8667736757624398
Validation Accuracy :  0.8097014925373134


In [90]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, bg.predict(x_train)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, bg.predict(x_val)))

Train - Confusion Matrix
[[357  22]
 [ 61 183]]
-------------------------------------
Validation - Confusion Matrix
[[154  16]
 [ 35  63]]


### f) Boosting - Ada Boost

In [91]:
#Boosting - Ada Boost (Default parameters)
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=50, learning_rate=1, algorithm='SAMME', 
                         random_state=12345)

adb.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1, n_estimators=50, random_state=12345)

In [92]:
# Accuracy of the Train data
print('Train Accuracy      : ', adb.score(x_train, y_train))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", adb.score(x_val, y_val))

Train Accuracy      :  0.9261637239165329
Validation Accuracy :  0.8097014925373134


In [93]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, adb.predict(x_train)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, adb.predict(x_val)))

Train - Confusion Matrix
[[369  10]
 [ 36 208]]
-------------------------------------
Validation - Confusion Matrix
[[149  21]
 [ 30  68]]


### g) GradientBoostingClassifier 

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

# GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', 
# min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, 
# min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, 
# warm_start=False, presort='auto')

gbc = GradientBoostingClassifier(
        learning_rate=0.1,
        max_depth = 4,
        n_estimators = 100,
        warm_start=True,
        random_state=12345
)

gbc.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=12345, subsample=1.0, verbose=0,
              warm_start=True)

In [95]:
# Accuracy of the Train data
print('Train Accuracy      : ', gbc.score(x_train, y_train))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", gbc.score(x_val, y_val))

Train Accuracy      :  0.942215088282504
Validation Accuracy :  0.8059701492537313


In [96]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, gbc.predict(x_train)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, gbc.predict(x_val)))

Train - Confusion Matrix
[[370   9]
 [ 27 217]]
-------------------------------------
Validation - Confusion Matrix
[[146  24]
 [ 28  70]]


In [97]:
######################## Final Submission ########################
#Submit the prediction
submission = pd.DataFrame({
        "PassengerId": titanic_test['PassengerId'],
        "Survived": gbc.predict(test_final)
    })
submission.to_csv('test_Output.csv', index=False)

## <Center> Final Accuracy Table

In [None]:
# Final accuracy values from various models applied

accuracy_table = {'Logistic_Regression_Train       '   : accuracy_score(y_train, y_train_prediction),
                  'Logistic_Regression_Test        '   : accuracy_score(y_val, y_val_prediction),
                  
                  'Decision_Tree_Train             '   : dt.score(x_train, y_train),
                  'Decision_Tree_Test              '   : dt.score(x_val, y_val),
                  
                  #'Decision_Tree_RandomGrid_Train  '   : rsearch.best_score_,
                  #'Decision_Tree_RandomGrid_Test   '   : rsearch.score(x_val, y_val),
                  
                  'Random_Forest_Train             '   : rf.score(x_train, y_train),
                  'Random_Forest_Test              '   : rf.score(x_val, y_val),
                  
                  'Bagging_Train                   '   : bg.score(x_train, y_train),
                  'Bagging_Test                    '   : bg.score(x_val, y_val),
                  
                  'AdaBoosting_Train               '   : adb.score(x_train, y_train),
                  'AdaBoosting_Test                '   : adb.score(x_val, y_val),
                  
                  'GradientBoostingClassifier_Train'   : gbc.score(x_train, y_train),
                  'GradientBoostingClassifier_Test '   : gbc.score(x_val, y_val),
                 }

# Check the final accuracy table
accuracy_table

### h) KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

classifier = KNeighborsClassifier(n_neighbors=5)  
classifier.fit(x_train_std, y_train_std)  

In [None]:
# Accuracy of the Train data
print('Train Accuracy      : ', classifier.score(x_train_std, y_train_std))

# Accuracy of the Vaildation data
print("Validation Accuracy : ", classifier.score(x_val_std, y_val_std))

In [None]:
# Confusion Matrix
print('Train - Confusion Matrix')
print(confusion_matrix(y_train, classifier.predict(x_train_std)))

print('-------------------------------------')

print('Validation - Confusion Matrix')
print(confusion_matrix(y_val, classifier.predict(x_val_std)))

In [None]:
from sklearn.metrics import classification_report 

print(classification_report(y_val_std, classifier.predict(x_val_std))) 

### i) KNN Model - With RandomSearchCV

In [None]:
#KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
#           weights='uniform')

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier 

knn_neigh = KNeighborsClassifier()

param_grid = {'n_neighbors' : np.arange(1, 15, 1),
              'weights' : ['uniform', 'distance'],
              'n_jobs' : -1,
              'leaf_size' : np.arange(25, 75, 5)}

# Randomized Search Cross Validation returns an Object with best model parameters

knn = RandomizedSearchCV(estimator = knn_neigh, param_distributions = param_grid, n_iter = 10, 
                             cv=10, scoring='accuracy', random_state=357678)

In [None]:
#y_train_std.iloc[:,0]
knn.fit(x_train_std, y_train_std.iloc[:,0])

In [None]:
# Accuracy of the Train data
print('Train Accuracy      : ', knn.best_score_)

# Accuracy of the Vaildation data
print("Validation Accuracy : ", knn.score(x_val_std, y_val_std))

### XGBoost Model

In [None]:
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV

xg_model = XGBClassifier(booster='gbtree', silent=1, seed=0, base_score=0.5, subsample=0.75, n_jobs=-1)

parameters = {'n_estimators':np.arange(50, 100, 3), #83
              'max_depth':np.arange(1, 10, 1),#7
#             'gamma':np.arange(1, 10, 1), #4
#             'max_delta_step': np.arange(1, 10, 1),   #1
#             'min_child_weight':np.arange(1, 10, 1), #1 
#             'colsample_bytree':np.arange(0.50, 0.75, 0.1),    #0.6,
              'learning_rate': np.arange(0.01, 0.1, 0.01)  # 0.040000000000000001
            }

tune_model =  GridSearchCV(xg_model, parameters, cv=2, scoring='accuracy')

In [None]:
tune_model.fit(x_train, y_train.iloc[:,0])

In [None]:
print('Best parameters : \n', tune_model.best_params_)
print('Results : \n', format(tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100))

In [None]:
#Learn on the whole data
#tune_model.fit(x_val, y_val.iloc[:,0])

In [None]:
# Accuracy of the Train data
print('Train Accuracy      : ', tune_model.best_score_)

# Accuracy of the Vaildation data
print("Validation Accuracy : ", tune_model.score(x_val, y_val.iloc[:,0]))

### GBM Model - with RandomizedSearchCV

In [105]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import xgboost as xgb
        
gbc1 = xgb.XGBClassifier()

param_grid = {'loss' : ['deviance', 'exponential'],
             'learning_rate' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
             'n_estimators' : [200],
             'max_depth' : np.arange(1, 20, 3),
#             'subsample' : [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#             'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#             'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#             'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
             'gamma': [0, 0.25, 0.5, 1.0],
             'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
             }

fit_params = {'eval_metric': 'mlogloss', 
              'early_stopping_rounds': 50,
              'eval_set' : [(x_train, y_train)]
             }

rs_gbc = RandomizedSearchCV(gbc1, param_distributions=param_grid, n_iter=5, n_jobs=-1, verbose=2, 
                                cv=10, fit_params=fit_params, random_state = 12345)


In [108]:
rs_gbc.fit(x_train, y_train.iloc[:,0])



Fitting 10 folds for each of 5 candidates, totalling 50 fits


JoblibXGBoostError: JoblibXGBoostError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\ProgramData\Anaconda3\lib\runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
C:\ProgramData\Anaconda3\lib\runpy.py in _run_code(code=<code object <module> at 0x00000224D4736F60, fil...lib\site-packages\ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\ProgramData\Anaconda3\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\ProgramD...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x00000224D4736F60, fil...lib\site-packages\ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\ProgramData\Anaconda3\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\ProgramD...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    122         except (RuntimeError, AssertionError):
    123             old_loop = None
    124         try:
    125             self._setup_logging()
    126             asyncio.set_event_loop(self.asyncio_loop)
--> 127             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Win...EventLoop running=True closed=False debug=False>>
    128         finally:
    129             asyncio.set_event_loop(old_loop)
    130 
    131     def stop(self):

...........................................................................
C:\ProgramData\Anaconda3\lib\asyncio\base_events.py in run_forever(self=<_WindowsSelectorEventLoop running=True closed=False debug=False>)
    417             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    418                                    finalizer=self._asyncgen_finalizer_hook)
    419         try:
    420             events._set_running_loop(self)
    421             while True:
--> 422                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_Windo...EventLoop running=True closed=False debug=False>>
    423                 if self._stopping:
    424                     break
    425         finally:
    426             self._stopping = False

...........................................................................
C:\ProgramData\Anaconda3\lib\asyncio\base_events.py in _run_once(self=<_WindowsSelectorEventLoop running=True closed=False debug=False>)
   1427                         logger.warning('Executing %s took %.3f seconds',
   1428                                        _format_handle(handle), dt)
   1429                 finally:
   1430                     self._current_handle = None
   1431             else:
-> 1432                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(564, 1)>>
   1433         handle = None  # Needed to break cycles when an exception occurs.
   1434 
   1435     def _set_coroutine_wrapper(self, enabled):
   1436         try:

...........................................................................
C:\ProgramData\Anaconda3\lib\asyncio\events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(564, 1)>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (564, 1)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=564, events=1)
    112             self.writers.remove(fd)
    113         del self.handlers[fd]
    114 
    115     def _handle_events(self, fd, events):
    116         fileobj, handler_func = self.handlers[fd]
--> 117         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    118 
    119     def start(self):
    120         try:
    121             old_loop = asyncio.get_event_loop()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'rs_gbc.fit(x_train, y_train.iloc[:,0])', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 30, 10, 8, 17, 622362, tzinfo=tzutc()), 'msg_id': 'f56f48c5a89b418781e67206de85f269', 'msg_type': 'execute_request', 'session': '42f826f738734673960671e50f29783e', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'f56f48c5a89b418781e67206de85f269', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'42f826f738734673960671e50f29783e']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'rs_gbc.fit(x_train, y_train.iloc[:,0])', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 30, 10, 8, 17, 622362, tzinfo=tzutc()), 'msg_id': 'f56f48c5a89b418781e67206de85f269', 'msg_type': 'execute_request', 'session': '42f826f738734673960671e50f29783e', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'f56f48c5a89b418781e67206de85f269', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'42f826f738734673960671e50f29783e'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'rs_gbc.fit(x_train, y_train.iloc[:,0])', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 30, 10, 8, 17, 622362, tzinfo=tzutc()), 'msg_id': 'f56f48c5a89b418781e67206de85f269', 'msg_type': 'execute_request', 'session': '42f826f738734673960671e50f29783e', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'f56f48c5a89b418781e67206de85f269', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='rs_gbc.fit(x_train, y_train.iloc[:,0])', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'rs_gbc.fit(x_train, y_train.iloc[:,0])'
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('rs_gbc.fit(x_train, y_train.iloc[:,0])',), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('rs_gbc.fit(x_train, y_train.iloc[:,0])',)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='rs_gbc.fit(x_train, y_train.iloc[:,0])', store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = 'rs_gbc.fit(x_train, y_train.iloc[:,0])'
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='rs_gbc.fit(x_train, y_train.iloc[:,0])', store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>], cell_name='<ipython-input-108-890652c98351>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 224dbfb9668, executio...rue silent=False shell_futures=True> result=None>)
   2904                     return True
   2905 
   2906             for i, node in enumerate(to_run_interactive):
   2907                 mod = ast.Interactive([node])
   2908                 code = compiler(mod, cell_name, "single")
-> 2909                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x00000224DC015660, file "<ipython-input-108-890652c98351>", line 1>
        result = <ExecutionResult object at 224dbfb9668, executio...rue silent=False shell_futures=True> result=None>
   2910                     return True
   2911 
   2912             # Flush softspace
   2913             if softspace(sys.stdout, 0):

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x00000224DC015660, file "<ipython-input-108-890652c98351>", line 1>, result=<ExecutionResult object at 224dbfb9668, executio...rue silent=False shell_futures=True> result=None>)
   2958         outflag = True  # happens in more places, so it's easier as default
   2959         try:
   2960             try:
   2961                 self.hooks.pre_run_code_hook()
   2962                 #rprint('Running code', repr(code_obj)) # dbg
-> 2963                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x00000224DC015660, file "<ipython-input-108-890652c98351>", line 1>
        self.user_global_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'BaggingClassifier': <class 'sklearn.ensemble.bagging.BaggingClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'Imputer': <class 'sklearn.preprocessing.imputation.Imputer'>, 'In': ['', '# Import supported libraries\nimport numpy as np\n...from sklearn.metrics import classification_report', '# Read Train and Test data-sets\ntrain_data = pd....("train.csv")\ntest_data = pd.read_csv("test.csv")', 'titanic_train = train_data.copy()\ntitanic_test = test_data.copy()', '# Gather info from the Train data-set\ntrain_data.info()', '# Gather info from the Train data-set\ntest_data.info()', '# View top Train data\ntrain_data.head()', '# View top Test data\ntest_data.head()', '# View Train bottom data\ntrain_data.tail()', '# View Test bottom data\ntest_data.tail()', '# Train attributes data types\ntrain_data.dtypes', '# Test attributes data types\ntest_data.dtypes', '# Train Summary\ntrain_data.describe()', '# Test Summary\ntest_data.describe()', '# Train dimension\ntrain_data.shape', '# Test dimension\ntest_data.shape', '# Train attributes name\ntrain_data.columns', '# Test attributes name\ntest_data.columns', '### Train and Test data-sets : Drop insignifican... unique values in Train data\ntrain_data.nunique()', '# Finding unique values in Test data\ntest_data.nunique()', ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {6:    PassengerId  Survived  Pclass  \
0           ...    0            373450   8.0500   NaN        S  , 7:    PassengerId  Pclass                          ...      1      1  3101298  12.2875   NaN        S  , 8:      PassengerId  Survived  Pclass              ...     0      0      370376   7.75   NaN        Q  , 9:      PassengerId  Pclass                        ... 1                2668   22.3583   NaN        C  , 10: PassengerId      int64
Survived         int64
Pc...      object
Embarked        object
dtype: object, 11: PassengerId      int64
Pclass           int64
Na...      object
Embarked        object
dtype: object, 12:        PassengerId    Survived      Pclass      ...000   31.000000  
max      6.000000  512.329200  , 13:        PassengerId      Pclass         Age      ...0   76.000000    8.000000    9.000000  512.329200, 14: (891, 12), 15: (418, 11), ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'RandomizedSearchCV': <class 'sklearn.model_selection._search.RandomizedSearchCV'>, ...}
        self.user_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'BaggingClassifier': <class 'sklearn.ensemble.bagging.BaggingClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'Imputer': <class 'sklearn.preprocessing.imputation.Imputer'>, 'In': ['', '# Import supported libraries\nimport numpy as np\n...from sklearn.metrics import classification_report', '# Read Train and Test data-sets\ntrain_data = pd....("train.csv")\ntest_data = pd.read_csv("test.csv")', 'titanic_train = train_data.copy()\ntitanic_test = test_data.copy()', '# Gather info from the Train data-set\ntrain_data.info()', '# Gather info from the Train data-set\ntest_data.info()', '# View top Train data\ntrain_data.head()', '# View top Test data\ntest_data.head()', '# View Train bottom data\ntrain_data.tail()', '# View Test bottom data\ntest_data.tail()', '# Train attributes data types\ntrain_data.dtypes', '# Test attributes data types\ntest_data.dtypes', '# Train Summary\ntrain_data.describe()', '# Test Summary\ntest_data.describe()', '# Train dimension\ntrain_data.shape', '# Test dimension\ntest_data.shape', '# Train attributes name\ntrain_data.columns', '# Test attributes name\ntest_data.columns', '### Train and Test data-sets : Drop insignifican... unique values in Train data\ntrain_data.nunique()', '# Finding unique values in Test data\ntest_data.nunique()', ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {6:    PassengerId  Survived  Pclass  \
0           ...    0            373450   8.0500   NaN        S  , 7:    PassengerId  Pclass                          ...      1      1  3101298  12.2875   NaN        S  , 8:      PassengerId  Survived  Pclass              ...     0      0      370376   7.75   NaN        Q  , 9:      PassengerId  Pclass                        ... 1                2668   22.3583   NaN        C  , 10: PassengerId      int64
Survived         int64
Pc...      object
Embarked        object
dtype: object, 11: PassengerId      int64
Pclass           int64
Na...      object
Embarked        object
dtype: object, 12:        PassengerId    Survived      Pclass      ...000   31.000000  
max      6.000000  512.329200  , 13:        PassengerId      Pclass         Age      ...0   76.000000    8.000000    9.000000  512.329200, 14: (891, 12), 15: (418, 11), ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'RandomizedSearchCV': <class 'sklearn.model_selection._search.RandomizedSearchCV'>, ...}
   2964             finally:
   2965                 # Reset our crash handler in place
   2966                 sys.excepthook = old_excepthook
   2967         except SystemExit as e:

...........................................................................
C:\Studies\INSOFE\Self Practice\titanic data-set\<ipython-input-108-890652c98351> in <module>()
----> 1 rs_gbc.fit(x_train, y_train.iloc[:,0])

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self=RandomizedSearchCV(cv=10, error_score='raise',
 ...turn_train_score='warn', scoring=None, verbose=2), X=           Age      Fare  Embarked_C  Embarked_Q...   0        0        0  

[623 rows x 24 columns], y=787    0
99     0
279    1
315    1
852    0
675...23, dtype: category
Categories (2, int64): [0, 1], groups=None, **fit_params={'early_stopping_rounds': 50, 'eval_metric': 'mlogloss', 'eval_set': [(           Age      Fare  Embarked_C  Embarked_Q...   0        0        0  

[623 rows x 24 columns],     Survived
787        0
99         0
279      ...548        0
527        0

[623 rows x 1 columns])]})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...d(n_splits=10, random_state=None, shuffle=False)>
        X =            Age      Fare  Embarked_C  Embarked_Q...   0        0        0  

[623 rows x 24 columns]
        y = 787    0
99     0
279    1
315    1
852    0
675...23, dtype: category
Categories (2, int64): [0, 1]
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
XGBoostError                                       Fri Nov 30 15:38:33 2018
PID: 1432                 Python 3.6.5: C:\ProgramData\Anaconda3\python.exe
...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (XGBClassifier(base_score=0.5, booster='gbtree', ...ht=1, seed=None, silent=True,
       subsample=1),            Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns], 787    0
99     0
279    1
315    1
852    0
675...23, dtype: category
Categories (2, int64): [0, 1], {'score': <function _passthrough_scorer>}, array([ 62,  64,  65,  66,  67,  68,  69,  70,  ..., 615, 616, 617, 618, 619, 620, 621,
       622]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1... 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63]), 2, {'gamma': 0, 'learning_rate': 0.2, 'loss': 'exponential', 'max_depth': 10, 'n_estimators': 200, 'reg_lambda': 5.0}), {'error_score': 'raise', 'fit_params': {'early_stopping_rounds': 50, 'eval_metric': 'mlogloss', 'eval_set': [(           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns],     Survived
787        0
99         0
279      ...548        0
527        0

[623 rows x 1 columns])]}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (XGBClassifier(base_score=0.5, booster='gbtree', ...ht=1, seed=None, silent=True,
       subsample=1),            Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns], 787    0
99     0
279    1
315    1
852    0
675...23, dtype: category
Categories (2, int64): [0, 1], {'score': <function _passthrough_scorer>}, array([ 62,  64,  65,  66,  67,  68,  69,  70,  ..., 615, 616, 617, 618, 619, 620, 621,
       622]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1... 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63]), 2, {'gamma': 0, 'learning_rate': 0.2, 'loss': 'exponential', 'max_depth': 10, 'n_estimators': 200, 'reg_lambda': 5.0})
        kwargs = {'error_score': 'raise', 'fit_params': {'early_stopping_rounds': 50, 'eval_metric': 'mlogloss', 'eval_set': [(           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns],     Survived
787        0
99         0
279      ...548        0
527        0

[623 rows x 1 columns])]}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator=XGBClassifier(base_score=0.5, booster='gbtree', ...ht=1, seed=None, silent=True,
       subsample=1), X=           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns], y=787    0
99     0
279    1
315    1
852    0
675...23, dtype: category
Categories (2, int64): [0, 1], scorer={'score': <function _passthrough_scorer>}, train=array([ 62,  64,  65,  66,  67,  68,  69,  70,  ..., 615, 616, 617, 618, 619, 620, 621,
       622]), test=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1... 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63]), verbose=2, parameters={'gamma': 0, 'learning_rate': 0.2, 'loss': 'exponential', 'max_depth': 10, 'n_estimators': 200, 'reg_lambda': 5.0}, fit_params={'early_stopping_rounds': 50, 'eval_metric': 'mlogloss', 'eval_set': [(           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns],     Survived
787        0
99         0
279      ...548        0
527        0

[623 rows x 1 columns])]}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method XGBClassifier.fit of XGBClassifier...t=1, seed=None, silent=True,
       subsample=1)>
        X_train =            Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[560 rows x 24 columns]
        y_train = 660    1
74     1
82     1
267    1
591    1
463...60, dtype: category
Categories (2, int64): [0, 1]
        fit_params = {'early_stopping_rounds': 50, 'eval_metric': 'mlogloss', 'eval_set': [(           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns],     Survived
787        0
99         0
279      ...548        0
527        0

[623 rows x 1 columns])]}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\sklearn.py in fit(self=XGBClassifier(base_score=0.5, booster='gbtree', ...ht=1, seed=None, silent=True,
       subsample=1), X=           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[560 rows x 24 columns], y=660    1
74     1
82     1
267    1
591    1
463...60, dtype: category
Categories (2, int64): [0, 1], sample_weight=None, eval_set=[(           Age      Fare  Embarked_C  Embarked_Q...     0        0        0

[623 rows x 24 columns],     Survived
787        0
99         0
279      ...548        0
527        0

[623 rows x 1 columns])], eval_metric='mlogloss', early_stopping_rounds=50, verbose=True, xgb_model=None, sample_weight_eval_set=[None], callbacks=None)
    695         self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
    696                               evals=evals,
    697                               early_stopping_rounds=early_stopping_rounds,
    698                               evals_result=evals_result, obj=obj, feval=feval,
    699                               verbose_eval=verbose, xgb_model=None,
--> 700                               callbacks=callbacks)
        callbacks = None
    701 
    702         self.objective = xgb_options["objective"]
    703         if evals_result:
    704             for val in evals_result.items():

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\training.py in train(params={'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'eval_metric': 'mlogloss', 'gamma': 0, 'learning_rate': 0.2, 'loss': 'exponential', 'max_delta_step': 0, 'max_depth': 10, ...}, dtrain=<xgboost.core.DMatrix object>, num_boost_round=200, evals=[(<xgboost.core.DMatrix object>, 'validation_0')], obj=None, feval=None, maximize=False, early_stopping_rounds=50, evals_result={}, verbose_eval=True, xgb_model=None, callbacks=[<function print_evaluation.<locals>.callback>, <function early_stop.<locals>.callback>, <function record_evaluation.<locals>.callback>], learning_rates=None)
    211 
    212     return _train_internal(params, dtrain,
    213                            num_boost_round=num_boost_round,
    214                            evals=evals,
    215                            obj=obj, feval=feval,
--> 216                            xgb_model=xgb_model, callbacks=callbacks)
        xgb_model = None
        callbacks = [<function print_evaluation.<locals>.callback>, <function early_stop.<locals>.callback>, <function record_evaluation.<locals>.callback>]
    217 
    218 
    219 class CVPack(object):
    220     """"Auxiliary datastruct to hold one fold of CV."""

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\training.py in _train_internal(params={'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'eval_metric': 'mlogloss', 'gamma': 0, 'learning_rate': 0.2, 'loss': 'exponential', 'max_delta_step': 0, 'max_depth': 10, ...}, dtrain=<xgboost.core.DMatrix object>, num_boost_round=200, evals=[(<xgboost.core.DMatrix object>, 'validation_0')], obj=None, feval=None, xgb_model=None, callbacks=[<function print_evaluation.<locals>.callback>, <function early_stop.<locals>.callback>, <function record_evaluation.<locals>.callback>])
     79 
     80         nboost += 1
     81         evaluation_result_list = []
     82         # check evaluation result.
     83         if len(evals) != 0:
---> 84             bst_eval_set = bst.eval_set(evals, i, feval)
        bst_eval_set = undefined
        bst.eval_set = <bound method Booster.eval_set of <xgboost.core.Booster object>>
        evals = [(<xgboost.core.DMatrix object>, 'validation_0')]
        i = 0
        feval = None
     85             if isinstance(bst_eval_set, STRING_TYPES):
     86                 msg = bst_eval_set
     87             else:
     88                 msg = bst_eval_set.decode()

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\core.py in eval_set(self=<xgboost.core.Booster object>, evals=[(<xgboost.core.DMatrix object>, 'validation_0')], iteration=0, feval=None)
   1101         evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
   1102         msg = ctypes.c_char_p()
   1103         _check_call(_LIB.XGBoosterEvalOneIter(self.handle, ctypes.c_int(iteration),
   1104                                               dmats, evnames,
   1105                                               c_bst_ulong(len(evals)),
-> 1106                                               ctypes.byref(msg)))
        msg = c_char_p(None)
   1107         res = msg.value.decode()
   1108         if feval is not None:
   1109             for dmat, evname in evals:
   1110                 feval_ret = feval(self.predict(dmat), dmat)

...........................................................................
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\core.py in _check_call(ret=-1)
    160     ----------
    161     ret : int
    162         return value from API calls
    163     """
    164     if ret != 0:
--> 165         raise XGBoostError(_LIB.XGBGetLastError())
    166 
    167 
    168 def ctypes2numpy(cptr, length, dtype):
    169     """Convert a ctypes pointer array to a numpy array.

XGBoostError: b'[15:38:33] C:\\Users\\Administrator\\Desktop\\xgboost\\src\\metric\\multiclass_metric.cc:53: Check failed: label_error >= 0 && label_error < static_cast<int>(nclass) MultiClassEvaluation: label must be in [0, num_class), num_class=1 but found 1 in label'
___________________________________________________________________________

In [104]:
rs_gbc.best_score_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [None]:
rs_gbc.predict(x_val)

In [None]:
rs_gbc.best_params_