## Categories

- Labels 

- The number of labels in the dataset are different 

- __high cardinality__ refers to uniqueness of data values 

- The lower the cardinality, the more duplicated elements in a column

-  A column with the lowest possible cardinality would have the same value for every row

- Highly cardinal variables dominate tree based algorithms

- Labels may only be present in the training data set, but not in the test data set

- Labels may appear in the test set that were not present in the training set


__Tree methods are biased towards variables with many labels__


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [108]:
from google.colab import drive
drive.mount('/content/gdrive')
data = pd.read_csv("gdrive/My Drive/Colab Notebooks/FeatureEngineering/train_date.csv")


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [109]:
cat_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

for i in cat_cols:
  print('Number of categories in the variable {}: {}'.format(i,len(data[i].unique())))

print('Total rows: {}'.format(len(data)))

Number of categories in the variable Name: 891
Number of categories in the variable Sex: 2
Number of categories in the variable Ticket: 681
Number of categories in the variable Cabin: 148
Number of categories in the variable Embarked: 4
Total rows: 891


In [110]:
data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [111]:
data['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [112]:
data['Cabin_processed'] = data['Cabin'].astype(str).str[0]
data[['Cabin', 'Cabin_processed']].head()

Unnamed: 0,Cabin,Cabin_processed
0,,n
1,C85,C
2,,n
3,C123,C
4,,n


In [113]:
cat_cols = ['Cabin_processed', 'Cabin']

for i in cat_cols:
  print('Number of categories in the variable {}: {}'.format(i,len(data[i].unique())))


Number of categories in the variable Cabin_processed: 9
Number of categories in the variable Cabin: 148


In [114]:
from sklearn.model_selection import train_test_split
use_cols = ['Cabin', 'Sex', 'Cabin_processed']
X_train, X_test, y_train, y_test = train_test_split(data[use_cols], 
                                                    data['Survived'],  
                                                    test_size=0.2)

X_train.shape, X_test.shape

((712, 3), (179, 3))

In [115]:
# Labels in training set that are not in testing set
len([x for x in X_train.Cabin.unique() if x not in X_test['Cabin'].unique()])

112

In [116]:
# Labels in testing set that are not in training set
len([x for x in X_test.Cabin.unique() if x not in X_train['Cabin'].unique()])

20

In [117]:
type(X_train)

pandas.core.frame.DataFrame

In [0]:
mapper = {k:i for i, k in enumerate(X_train['Cabin'].unique(), 0)} 

In [119]:
# replace the labels in Cabin, using the dic created above
X_train.loc[:, 'Cabin_mapped'] = X_train.loc[:, 'Cabin'].map(mapper)
X_test.loc[:, 'Cabin_mapped'] = X_test.loc[:, 'Cabin'].map(mapper)

X_train[['Cabin_mapped', 'Cabin']].head(10)

Unnamed: 0,Cabin_mapped,Cabin
319,0,E34
201,1,
206,1,
882,1,
304,1,
221,1,
531,1,
144,1,
846,1,
641,2,B35


In [120]:
mapper = {k: i for i, k in enumerate(X_train['Cabin_processed'].unique(), 0)}

# replace labels by numbers with dictionary
X_train.loc[:, 'Cabin_processed'] = X_train.loc[:, 'Cabin_processed'].map(mapper)
X_test.loc[:, 'Cabin_processed'] = X_test.loc[:, 'Cabin_processed'].map(mapper)

X_train[['Cabin_processed', 'Cabin', 'Cabin_mapped']].head(10)

Unnamed: 0,Cabin_processed,Cabin,Cabin_mapped
319,0,E34,0
201,1,,1
206,1,,1
882,1,,1
304,1,,1
221,1,,1
531,1,,1
144,1,,1
846,1,,1
641,2,B35,2


In [121]:
X_train.loc[:, 'Sex'] = X_train.loc[:, 'Sex'].map({'male': 0, 'female': 1})
X_test.loc[:, 'Sex'] = X_test.loc[:, 'Sex'].map({'male': 0, 'female': 1})

X_train.Sex.head()

319    1
201    0
206    0
882    1
304    0
Name: Sex, dtype: int64

In [122]:
X_train[['Cabin_mapped','Cabin_processed', 'Sex']].isnull().sum()

Cabin_mapped       0
Cabin_processed    0
Sex                0
dtype: int64

In [123]:
X_test[['Cabin_mapped','Cabin_processed', 'Sex']].isnull().sum()

Cabin_mapped       21
Cabin_processed     0
Sex                 0
dtype: int64

In [124]:
len(X_train['Cabin_mapped'].unique()), len(X_train['Cabin_processed'].unique())

(129, 9)

In [125]:
X_train.columns

Index(['Cabin', 'Sex', 'Cabin_processed', 'Cabin_mapped'], dtype='object')

In [0]:
X_test = X_test.fillna(0)

In [0]:
from sklearn.metrics import accuracy_score

In [128]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.770949720670391
0.776536312849162


In [129]:
from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.776536312849162
0.776536312849162


In [130]:
from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.776536312849162
0.776536312849162


In [131]:
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.6871508379888268
0.776536312849162


In [132]:
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.770949720670391
0.776536312849162




In [133]:
from sklearn.svm import LinearSVC
classifier = LinearSVC()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.770949720670391
0.776536312849162




In [134]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.7932960893854749
0.776536312849162


In [135]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.7932960893854749
0.7877094972067039


In [136]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.776536312849162
0.7877094972067039


In [137]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.776536312849162
0.33519553072625696


In [138]:
from sklearn.linear_model import Perceptron
classifier = Perceptron()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.7430167597765364
0.776536312849162


In [139]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.770949720670391
0.7486033519553073


In [140]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(X_train[['Sex', 'Cabin_mapped']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_mapped']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train[['Sex', 'Cabin_processed']],y_train)
y_pred = classifier.predict(X_test[['Sex', 'Cabin_processed']])
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

0.770949720670391
0.7597765363128491
