## Import data

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer

# Load the scikit-learn breast cancer dataset.
dataset = load_breast_cancer()

# The feature names label the columns of the feature matrix; `columns` is
# reused by the plotting cells further down.
columns = dataset.feature_names

# Build the dataframe with its column labels in a single step.
dataset_df = pd.DataFrame(dataset.data, columns=columns)

dataset_df.head()

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple.
dataset_df.shape

In [None]:
# Append the class labels as a 'target' column; this modifies dataset_df in place.
dataset_df['target']=dataset.target

In [None]:
# Display the dataframe (features plus the 'target' column when cells run in order).
dataset_df

# Remove low variance variables (be careful: variance depends on each variable's scale!)

In [None]:
# Per-column summary statistics; the std row gives a first look at how spread out each column is.
dataset_df.describe()

In [None]:
from sklearn import feature_selection

selector = feature_selection.VarianceThreshold(threshold=0.02)
selector.fit_transform(dataset_df).shape

In [None]:
selector.get_support()

In [None]:
dataset_df.columns[selector.get_support()]

In [None]:
dataset_df[dataset_df.columns[selector.get_support()]]

In [None]:
%matplotlib inline
dataset_df.hist('mean smoothness')

## Univariate distributions

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Split the samples by class label so both distributions can be overlaid.
X0 = dataset_df[dataset_df['target'] == 0]
X1 = dataset_df[dataset_df['target'] == 1]

# One panel per feature: 6 rows x 5 columns.
fig, axes = plt.subplots(ncols=5, nrows=6, figsize=(20, 15))

# Pair every feature name with its own axis; class 0 is drawn in blue and
# class 1 in red. Columns are selected by name rather than by position so the
# plot does not depend on the column order of the dataframe.
# NOTE(review): recent seaborn releases flag distplot as deprecated in favour
# of histplot(..., kde=True) -- confirm against the installed version.
for feature, ax in zip(columns, axes.flat):
    sns.distplot(X0[feature], color="blue", ax=ax)
    sns.distplot(X1[feature], color="red", ax=ax)

# Adjust the layout only after drawing, so the spacing accounts for the axis
# labels and tick labels added by the plots.
fig.tight_layout()
plt.show()

In [None]:
%matplotlib inline

fig, axes = plt.subplots(ncols=5, nrows=6, figsize=(12,15))
fig.tight_layout()

for i, col in enumerate(columns):
    sns.boxplot(y = col, x = "target",data=dataset_df, orient='v', ax=axes[int(i/5),i%5])




## Univariate test

In [None]:
from sklearn import feature_selection
from sklearn.feature_selection import SelectKBest

# Feature matrix: every column except the last one (the appended 'target').
X = dataset_df.iloc[:, :-1]
y = dataset.target

# Available score functions
#   classification: chi2, f_classif, mutual_info_classif
#   regression: f_regression, mutual_info_regression
# Each selector keeps the 5 best-scoring features and is fitted immediately.
selector_chi = SelectKBest(feature_selection.chi2, k=5).fit(X, y)
selector_f = SelectKBest(feature_selection.f_classif, k=5).fit(X, y)

# Reduced feature matrices, relabelled with the names of the kept columns.
X_chi = pd.DataFrame(
    selector_chi.transform(X),
    columns=X.columns[selector_chi.get_support()],
)
X_f = pd.DataFrame(
    selector_f.transform(X),
    columns=X.columns[selector_f.get_support()],
)


In [None]:
# Show the feature names retained by each univariate selector (chi2 first, then f_classif).
for fitted_selector in (selector_chi, selector_f):
    print(X.columns[fitted_selector.get_support()])


In [None]:
# Column labels of the feature matrix X.
X.columns

In [None]:
# Report the chi-squared score of every feature, in column order.
for position, (feature_name, chi_score) in enumerate(zip(X.columns, selector_chi.scores_)):
    print('Feature %i %s: %f' % (position, feature_name, chi_score))


## Bivariate 

In [None]:
# Preview the first rows of the full dataframe.
dataset_df.head()

In [None]:
# Display the full dataframe.
dataset_df

In [None]:
# Take a sample of columns (the first six) to plot together with the target.
# The slice is copied *after* selecting the columns, so df_subset owns its
# data: adding a column to it later neither touches dataset_df nor raises
# pandas' SettingWithCopyWarning. It also avoids copying the unused columns.
df_subset = dataset_df.iloc[:, :6].copy()

In [None]:
# Add the class label next to the sampled feature columns, then display the result.
df_subset['target']=dataset_df['target']
df_subset

In [None]:
%matplotlib inline

import seaborn as sns
sns.pairplot(df_subset,hue='target')

In [None]:
## Contingency Matrix

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Discretize 'mean radius' into 5 bins, encoded as ordinal bin indices.
# (strategy='quantile' is the alternative binning strategy.)
mean_radius = dataset_df[['mean radius']]
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
discrete_uniform = discretizer.fit(mean_radius).transform(mean_radius)


In [None]:
# Contingency table: discretized 'mean radius' bin (rows) versus class label (columns).
pd.crosstab(index=discrete_uniform.ravel(), columns=dataset_df['target'])

In [None]:
import seaborn as sns
# Build the bin-versus-class contingency table first, then draw it as a heatmap.
contingency = pd.crosstab(discrete_uniform.flatten(), dataset_df['target'])
sns.heatmap(contingency)