# Handle Categorical Data | Handling Binning Data | Titanic Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/train-data/train.csv
/kaggle/input/titanic/titanicdata.csv


In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

#warnings filter

warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import KBinsDiscretizer

In [4]:
df= pd.read_csv('/kaggle/input/titanic/titanicdata.csv', usecols= ['Age', 'Fare', 'Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Age       714 non-null    float64
 2   Fare      891 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 21.0 KB


In [6]:
df.describe()

Unnamed: 0,Survived,Age,Fare
count,891.0,714.0,891.0
mean,0.383838,29.699118,32.204208
std,0.486592,14.526497,49.693429
min,0.0,0.42,0.0
25%,0.0,20.125,7.9104
50%,0.0,28.0,14.4542
75%,1.0,38.0,31.0
max,1.0,80.0,512.3292


In [7]:
df.isna().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [8]:
df['Age']= df['Age'].fillna(df['Age'].median())

In [9]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.head(2)

Unnamed: 0,Age,Fare
331,45.5,28.5
733,23.0,13.0


In [11]:
clf= DecisionTreeClassifier(random_state= 0)
clf.fit(X_train, y_train)

y_pred= clf.predict(X_test)

In [12]:
accuracy_score(y_test, y_pred)

0.6536312849162011

In [13]:
print(np.mean(cross_val_score(DecisionTreeClassifier(), X,y, cv= 15, scoring= 'accuracy')))

0.6643879472693031


In [14]:
#using binning

kbins_age= KBinsDiscretizer(n_bins= 15, encode= 'ordinal', strategy='quantile')
kbins_fare= KBinsDiscretizer(n_bins=15,encode='ordinal',strategy='quantile')

In [15]:
trf= ColumnTransformer([
    ('first', kbins_age,[0]),
    ('second', kbins_fare,[1])
])

In [16]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [17]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42,  8.4 , 18.  , 21.  , 23.  , 25.  , 28.  , 31.  , 34.  ,
              38.  , 44.  , 51.  , 80.  ])                                  ],
      dtype=object)

In [18]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42,  8.4 , 18.  , 21.  , 23.  , 25.  , 28.  , 31.  , 34.  ,
              38.  , 44.  , 51.  , 80.  ])                                  ],
      dtype=object)

In [19]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.     ,   7.2292 ,   7.75   ,   7.88916,   7.925  ,   8.6625 ,
               10.5    ,  13.     ,  15.54834,  21.045  ,  26.     ,  29.355  ,
               39.6875 ,  65.     ,  90.     , 512.3292 ])                     ],
      dtype=object)

In [20]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trf[:,1]
})

In [21]:
output['age_labels'] = pd.cut(x=X_train['Age'],
                                    bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=X_train['Fare'],
                                    bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [22]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
337,41.0,9.0,134.5,14.0,"(38.0, 44.0]","(90.0, 512.329]"
563,28.0,6.0,8.05,4.0,"(25.0, 28.0]","(7.925, 8.662]"
636,32.0,7.0,7.925,4.0,"(31.0, 34.0]","(7.889, 7.925]"
639,28.0,6.0,16.1,8.0,"(25.0, 28.0]","(15.548, 21.045]"
3,35.0,8.0,53.1,12.0,"(34.0, 38.0]","(39.688, 65.0]"


In [23]:
clf = DecisionTreeClassifier(random_state= 0)
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

In [24]:
accuracy_score(y_test,y_pred2)

0.6368715083798883

In [25]:
X_trf = trf.fit_transform(X)
print(np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy')))

0.6610611735330838
