# Data exploration

In [38]:
# Imports
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from math import pow

print('Tensorflow version:', tf.__version__)

Tensorflow version: 2.6.0


In [39]:
# Load data
training_data = pd.read_csv('../input/tdt05-2021-challenge-2/challenge2_train.csv', index_col=0)
X_test = pd.read_csv('../input/tdt05-2021-challenge-2/challenge2_test.csv', index_col=0)

print('Shape training:', training_data.shape)
print('Shape test:', X_test.shape)
summary_statistics = pd.DataFrame(
    {
        '#nan': training_data.isnull().sum(axis = 0),
        'categories': training_data.nunique(), 
        'dtype': training_data.dtypes
    }, 
    index=training_data.columns,
)

print(summary_statistics)
training_data.head(5)

Shape training: (50000, 30)
Shape test: (50000, 29)
         #nan  categories    dtype
target      0           2    int64
f0       1459           2  float64
f1       1487         190   object
f2       1439           6   object
f3       1488           3  float64
f4       1498           2   object
f5      11617          13  float64
f6       1490           2  float64
f7       1525           6  float64
f8       1490        2177   object
f9       1489           5   object
f10      1501          26   object
f11       145        7633  float64
f12      1541         222   object
f13      1447          15   object
f14      1451        1204   object
f15      1477         222   object
f16      1460          12  float64
f17      9762       19209  float64
f18      1556           6   object
f19      1437           6  float64
f20      1464           4  float64
f21      1510           7  float64
f22      1513           3   object
f23      1516        1495   object
f24      2696        4148  float64
f25

Unnamed: 0_level_0,target,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1.0,gL,e,3.0,A,,0.0,6.0,96ae67d3e,...,0.5,0.0,3.0,R,328b0cf4e,0.834041,T,N,1.0,14.2364
1,0,0.0,Rj,c,1.0,A,7.0,1.0,4.0,9fcf422f2,...,0.4,0.0,1.0,,328b0cf4e,0.686021,T,N,1.0,
2,0,,In,a,1.0,A,10.0,1.0,6.0,a5adff44e,...,0.5,1.0,3.0,G,0c67fcbbd,1.141271,T,N,3.0,
3,1,1.0,rA,c,3.0,A,7.0,1.0,1.0,15c90ab2e,...,0.6,1.0,1.0,G,fee4e3007,0.662382,T,N,3.0,
4,0,1.0,pE,c,3.0,A,7.0,0.0,6.0,b36490559,...,0.5,0.0,1.0,B,587e040bd,-1.0,T,N,1.0,13.9537


## Feature description
- target: 0 or 1 -> binary classification
- f0: 0.0 or 1.0
- f1: weird combination of lower and uppercase letters. 173 unique pairs (categories) with occurences from ranging from 8 to 874
- f2: letters a-f with distribution (a=24%, b=11%, c=14%, d=16%, e=11%, f=21%, null=3%)
- f3: 1.0, 2.0, or 3.0 with distribution (1.0=38%, 2.0=26%, 3.0=33%, null=3%)
- f4: A or B with distribution (A=88%, B=9%, null=3%)
- f5: -1 to 11 with distribution (~0%, 1%, ~0%, ~0%, 1%, 3%, 2%, 8%, 23%, 2%, 3%, 6%, 27%, null=23%)

The features labeled:
- f8 
- f12 
- f14 
- f15 
- f23

All seem to be hexadecimal. These might just be an id of sorts, or they can be the hex representation of a number.  
An idea can be to covert these into decimal and see if they are important somehow.

In [40]:
hexadecimal_columns = ['f8', 'f12', 'f14', 'f15', 'f23']
merged_datasets = pd.concat([training_dataraining_data, X_test])[hexadecimal_columns]

merged_statistics = pd.DataFrame(
    {
        '#categories before merge': training_data.nunique(), 
        '#categories after merge': merged_datasets.nunique(), 
    }, 
    index=merged_datasets.columns,
)

print(merged_statistics)

NameError: name 'training_dataraining_data' is not defined

## Data cleaning
Data cleaning consists of the following steps
- Removal of unwanted observations
- Fixing structural errors
- Managing unwanted outliers
- Handeling missing data

In [None]:
# Partition the data
X = training_data.drop(columns=['target']).copy()
y = training_data[['target']].copy()

### Making the data types *correct*
- Boolean features should be `boolean`
- Categorical features should be `categorical`
- Numerical features should be `int64` or `float64`

### Dealing with missing values
XBoost and CatBoost handles missing data differently. This needs to be taken into account before training.
- XGBoost: missing values should be zero
- CatBoost: missing values should be way off the distribution, like -999

In [43]:
# CatBoost specific preprocessing
fill_value = -999

X.fillna(fill_value, inplace=True), X_test.fillna(fill_value, inplace=True)

X['f0'] = X['f0'].astype(str)
X_test['f0'] = X_test['f0'].astype(str)

missing_values = pd.DataFrame(
    {
        '#nan_train': X.isnull().sum(axis = 0),
        '#nan_test': X.isnull().sum(axis = 0),
    }, 
    index=X.columns,
)
print(missing_values)

     #nan_train  #nan_test
f0            0          0
f1            0          0
f2            0          0
f3            0          0
f4            0          0
f5            0          0
f6            0          0
f7            0          0
f8            0          0
f9            0          0
f10           0          0
f11           0          0
f12           0          0
f13           0          0
f14           0          0
f15           0          0
f16           0          0
f17           0          0
f18           0          0
f19           0          0
f20           0          0
f21           0          0
f22           0          0
f23           0          0
f24           0          0
f25           0          0
f26           0          0
f27           0          0
f28           0          0


## CatBoost

In [41]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# categorical_features_indices = np.where(X.dtypes != np.float)[0]
categorical_features_indices = [0, 1, 2, 4, 8, 9, 10, 12, 13, 14, 15, 18, 22, 23, 25, 26]

X['f0'] = X['f0'].astype(str)
X_test['f0'] = X_test['f0'].astype(str)

# X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

model.fit(
    X, y,
    cat_features=categorical_features_indices,
    # eval_set=(X_validation, y_validation),
    # plot=True
)

# cv_params = model.get_params()
# cv_params.update({
#     'loss_function': metrics.Logloss()
# })

# cv_data = cv(
#     Pool(X, y, cat_features=categorical_features_indices),
#     cv_params,
#     plot=True
# )

print(model.get_best_score())


<catboost.core.CatBoostClassifier at 0x7f02c1f168b0>

## Make predictions

Make predictions from `X_test` and save to file.

In [57]:
y_pred = model.predict(X_test, prediction_type='Probability')
print(model.get_best_score())
predictions = pd.DataFrame({'id': X_test.index, 'target': y_pred[:,1]})

predictions.to_csv('../output/prediction.txt', index=False)

{'learn': {'Accuracy': 0.84024, 'Logloss': 0.3674780978985}}


## One-hot encoding
The features labeled:
- f0 (Boolean)
- f1
- f2
- f4
- f9
- f10
- f13
- f18
- f22
- f25 (Boolean)
- f26 (Boolean)

Toghether with the hex features discussed above:
- f8 
- f12 
- f14 
- f15 
- f23


All seem to be categorical and should therefore be converted to one-hot encoding (depending on the learning algorithm).

In [None]:
# One-hot encode categorical features
categorical_features = ['f0', 'f1', 'f2', 'f4', 'f8', 'f9', 'f10', 'f12', 'f13', 'f14', 'f15', 'f18', 'f22', 'f23', 'f25', 'f26']

X_encoded = pd.get_dummies(X, columns=categorical_features)
X_encoded.head()

In [None]:
# Normalize numerical 

In [None]:
# Turn boolean and categorical features into type category
# boolean_features = ['f0', 'f25', 'f26']
# categorical_features = ['f1', 'f2', 'f4', 'f9', 'f10', 'f13', 'f18', 'f22']
# non_numeric_features = boolean_features + categorical_features
# 
# X[non_numeric_features] = X[non_numeric_features].astype('category')
# X[boolean_features] = X[boolean_features].apply(lambda x: x.cat.codes)
# X[boolean_features[1:]] = X[boolean_features[1:]].replace({1: 0, 2: 1})
# 
# # Turn categorical columns into categorical 
# #categorical_features = ['f1', 'f2', 'f4', 'f9', 'f10', 'f13', 'f18', 'f22']
# #X_train[categorical_features] = X_train[categorical_features].astype("category")
# #X_test[categorical_features] = X_test[categorical_features].astype("category")
# 
# print(X[boolean_features].head())
# print(X[boolean_features].nunique())

In [None]:
# fill_value = 0
# 
# X_train.fillna(fill_value, inplace=True), X_test.fillna(fill_value, inplace=True)
# missing_values = pd.DataFrame(
#     {
#         '#nan_train': X_train.isnull().sum(axis = 0),
#         '#nan_test': X_test.isnull().sum(axis = 0),
#     }, 
#     index=X_train.columns,
# )
# print(missing_values)

In [None]:
# plt.figure(figsize=(32, 12))
# sns.heatmap(training_data.corr(), annot=True, cmap='viridis')

In [None]:
# # Cell for just testing out random stuff
# f8 = training_data[['f8']].fillna('0')
# print(f8.isnull().sum(axis = 0))
# print(f8.isna().sum(axis = 0))
# print(f8.dtypes)

## Feature engineering
The features labeled:
- f0 (0.0 or 1.0 / Boolean)
- f1
- f2
- f4
- f9
- f10
- f13
- f18
- f22
- f25 (Boolean)
- f26 (Boolean)

All seem to be categorical and should be converted to numbers (depending on the learning algorithm).

---

The features labeled:
- f8 
- f12 
- f14 
- f15 
- f23

All seem to be hexadecimal. These might just be an id of sorts, or they can be the hex representation of a number.  
An idea can be to covert these into decimal and see if they are important somehow.

In [None]:
radix = 16
hexadecimal_columns = ['f8', 'f12', 'f14', 'f15', 'f23']

X = training_data.copy()
X[hexadecimal_columns] = X[hexadecimal_columns].fillna('-1').transform(lambda x: x.astype(str).map(lambda x: int(x, base=radix)))
X = X.replace(-1, float('nan'))

print(X[hexadecimal_columns].nunique())
X[hexadecimal_columns].hist(bins=2203, figsize=(25, 15), layout=(2, 7))

Converting categorical data to numbers

In [None]:
# categorical_columns = ['f1', 'f2', 'f4', 'f9', 'f10', 'f13', 'f18', 'f22', 'f25', 'f26']

# transformed_training_data[categorical_columns] = transformed_training_data[categorical_columns].apply(lambda x: x.cat.codes)
# transformed_training_data

# TODOS:

* Prøve å konvertere null-verdier til 0, og å ha null som en egen kategori
* Teste med både OHE og annen type category encoding 

# Data exploration