In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder ### available? Imho should be, like cmon
from sklearn.model_selection import train_test_split ###hopefully we're allowed to use this too
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

Function that removes features from the dataset based on pairwise correlation

In [None]:
def DeleteCorrelated(X,thresh=0.75):
    """Return a copy of ``X`` without the columns that duplicate an earlier one.

    A column is removed when its absolute correlation with any column that
    appears before it in ``X`` is greater than or equal to ``thresh``.
    The input frame itself is left untouched.
    """
    abs_corr = X.corr().abs()
    # Strict upper triangle: every pair is looked at once and the diagonal
    # (a column against itself) is masked out as NaN.
    strict_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(strict_upper)

    redundant = []
    for name in pairwise.columns:
        # NaN compares as False, so masked cells never trigger a drop.
        if (pairwise[name] >= thresh).any():
            redundant.append(name)
    return X.drop(columns=redundant)

Class that removes features from the dataset based on the Variance Inflation Factor (VIF)

In [None]:
#source: https://www.kaggle.com/remilpm/how-to-remove-multicollinearity

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Transformer that repeatedly drops the column with the highest VIF.

    Parameters
    ----------
    thresh : float, default=5
        Columns are dropped one at a time while the largest variance
        inflation factor in the frame is greater than this value.
    impute : bool, default=True
        If True, missing values are filled with a ``SimpleImputer`` before
        the VIFs are computed.
    impute_strategy : str, default='median'
        Strategy passed to ``SimpleImputer``.

    Notes
    -----
    ``fit`` only fits the imputer; the column selection is performed inside
    ``transform`` on whatever frame is passed in, so transforming two
    different frames can keep different columns.
    """
    def __init__(self, thresh=5, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh

        # Every constructor argument is stored under its own name; without
        # this BaseEstimator.get_params() (and therefore clone() and the
        # estimator repr) raises AttributeError.
        self.impute = impute
        self.impute_strategy = impute_strategy

        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = SimpleImputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        """Fit the imputer (when enabled) on ``X`` and return ``self``."""
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        """Impute ``X`` (when enabled) and return it without high-VIF columns."""
        print('ReduceVIF transform')
        if hasattr(self, 'imputer'):
            # Keep the original row labels: the imputer returns a bare array,
            # and rebuilding the frame without `index=` would silently reset
            # the index and break later joins / label-based alignment.
            X = pd.DataFrame(self.imputer.transform(X), columns=X.columns, index=X.index)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh):
        """Drop the highest-VIF column until every VIF is <= ``thresh``.

        Returns the reduced DataFrame. Frames with fewer than two columns are
        returned unchanged, because a VIF needs at least one other column to
        regress against.
        """
        while X.shape[1] > 1:
            values = X.values
            vif = [variance_inflation_factor(values, position) for position in range(X.shape[1])]

            max_vif = max(vif)
            # Written as "not >" so that a NaN maximum stops the loop.
            if not max_vif > thresh:
                break
            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
            X = X.drop(columns=[X.columns[maxloc]])
        print(X.shape[1]," features left in dataset")
        return X

In [None]:
class Preprocessor:
    """Hand-written preprocessing helpers (split, collinearity check, encoding stub)."""

    @staticmethod
    def train_test_split(X, y, train_subset_proportion=0.75, keep_y_balance=True, random_state=None):
        """Randomly split ``X`` and ``y`` into train and test parts.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame; must carry the same index labels as ``y``.
        y : pandas.Series
            Target; must contain only 0/1 values when ``keep_y_balance`` is True.
        train_subset_proportion : float, default=0.75
            Fraction of rows (of each class, when balancing) put in the train part.
        keep_y_balance : bool, default=True
            If True, positives and negatives are sampled separately so both
            parts keep the class proportions of ``y``.
        random_state : int or None, default=None
            Seed for a private generator. With None the global ``np.random``
            state is used, exactly as before this parameter existed.

        Returns
        -------
        X_train, X_test, y_train, y_test

        Raises
        ------
        AttributeError
            If ``X`` and ``y`` do not have the same set of index labels.
        ValueError
            If ``keep_y_balance`` is True and ``y`` holds values other than 0 and 1.
        """
        if set(X.index) != set(y.index):
            raise AttributeError('Indices in X and y are not indetical')
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        def held_out(labels, chosen):
            # Labels that were not sampled, in their original order.
            chosen = set(chosen)
            return labels[[label not in chosen for label in labels]]

        if keep_y_balance:
            classes = y.unique()
            if ((classes != 0) & (classes != 1)).any():
                raise ValueError('Using keep_y_balance requires y values to be 0 and 1.')
            train_parts, test_parts = [], []
            # Positives are drawn first, then negatives.
            for label in (1, 0):
                label_index = y[y == label].index
                n_train = int(train_subset_proportion * len(label_index))
                chosen = rng.choice(label_index, n_train, replace=False)
                train_parts.append(chosen)
                test_parts.append(held_out(label_index, chosen))
            train_index = np.concatenate(train_parts)
            test_index = np.concatenate(test_parts)
        else:
            train_rows_n = int(train_subset_proportion * X.shape[0])
            train_index = rng.choice(y.index, train_rows_n, replace=False)
            # A bare np.array(set(...)) yields a 0-d object array that cannot
            # be used with .loc; select the remaining labels explicitly.
            test_index = held_out(y.index, train_index)
        return X.loc[train_index, :], X.loc[test_index, :], y.loc[train_index], y.loc[test_index]
    
    @staticmethod
    def remove_multicollinearity(X):
        """
        Print the eigenvalues of the feature correlation matrix (unfinished).

        Nothing is removed yet and nothing is returned.

        https://stackoverflow.com/questions/25676145/capturing-high-multi-collinearity-in-statsmodels
        https://en.wikipedia.org/wiki/Multicollinearity#Detection
        """
        # rowvar=False: the columns of X are the variables. The numpy default
        # treats every ROW as a variable and would build an
        # n_samples x n_samples matrix.
        corr_m = np.corrcoef(X, rowvar=False)
        # The correlation matrix is symmetric, so the symmetric solver applies
        # and returns real eigenvalues in ascending order.
        eigenvalues = np.linalg.eigvalsh(corr_m)
        #TODO
        print(eigenvalues)
    
    def one_hot_encoding(self):
        """Placeholder, not implemented."""
        #TODO
        pass        
            

# 1. Bank data

In [None]:
# Load the bank dataset (semicolon-separated CSV) and preview the first rows.
bank_df = pd.read_csv('data/bank-additional-full.csv', sep=';')
bank_df.head()

In [None]:
# Column dtypes and non-null counts of the bank data.
bank_df.info()

In [None]:
# Show the distinct values of every object-dtype (text) column.
object_columns = [c for c in bank_df.columns if bank_df[c].dtype == 'O']
for c in object_columns:
    print(c, bank_df[c].unique())

At first, it looks like there are no missing values in the data. However, most of the categorical variables have a special value `unknown`, which is actually a missing value. When one-hot encoding the data, we can either treat it like any other category or drop its indicator column to keep the encoded columns linearly independent.

In [None]:
# Positive class = 'yes', the same convention the hand-written split cell
# further down uses for this column (this cell previously flagged 'no').
y_bank = bank_df['y'] == 'yes'
# `drop` returns a new frame and leaves bank_df untouched; show only a short
# preview of the feature columns instead of dumping the whole frame.
bank_df.drop(columns='y').head()

In [None]:
# One-hot encode every text column except the target.
X_bank_raw_categorical = bank_df.drop(columns='y').select_dtypes(include='object')
# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so neither spelling works everywhere.
# Fit with the default sparse output and densify explicitly instead.
encoder = OneHotEncoder()
X_bank_cat_enc = encoder.fit_transform(X_bank_raw_categorical).toarray()

In [None]:
# Numerical features: remove multicollinear columns with the VIF filter.
X_bank_numerical = bank_df.select_dtypes(exclude='object')
Mult_Coll = ReduceVIF()
X_bank_numerical = Mult_Coll.fit_transform(X_bank_numerical)

# Prefix each indicator with its source feature. Raw category labels such as
# 'unknown', 'yes' and 'no' occur in several features, so using them directly
# would create duplicate column names.
bank_cat_columns = [
    f'{feature}_{category}'
    for feature, categories in zip(X_bank_raw_categorical.columns, encoder.categories_)
    for category in categories
]
# Rows are in the same order in both blocks, so reuse the numerical index.
X_bank_cat_df = pd.DataFrame(X_bank_cat_enc, columns=bank_cat_columns, index=X_bank_numerical.index)
X_bank_enc = X_bank_numerical.join(X_bank_cat_df)

In [None]:
# Hold out 25% of the rows; the fixed seed makes the split and the score repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_bank_enc, y_bank, test_size=0.25, random_state=42)

# max_iter must be an integer: recent scikit-learn releases validate
# constructor arguments and reject a float such as 1e20.
model2 = LogisticRegression(max_iter=10_000)
model2.fit(X_train, y_train)
print(X_train.shape[1])
# Accuracy on the held-out rows (never negative, so no abs() is needed).
model2.score(X_test, y_test)

In [None]:
# Try the hand-written splitter on the bank data: x1/y1 are the train part,
# x2/y2 the test part; the target is True where y == 'yes'.
x1, x2, y1, y2 = Preprocessor.train_test_split(bank_df.drop(columns='y'), bank_df['y']=='yes')

In [None]:
# Size of the train target produced by the hand-written split.
y1.shape

In [None]:
# Size of the test target produced by the hand-written split.
y2.shape

# 2. Diabetic Retinopathy

In [None]:
# `arff` and `pd` are imported once, in the import cell at the top of the notebook.
data = arff.loadarff('data/messidor_features.arff')
# loadarff returns (records, metadata); only the records go into the frame.
retinopathy_df = pd.DataFrame(data[0])
retinopathy_df.head()

In [None]:
# Column dtypes and non-null counts of the retinopathy data.
retinopathy_df.info()

In [None]:
# Per-column summary. The null count is taken from the column itself; the
# previous version printed the total for the whole frame on every line.
for c in retinopathy_df.columns:
    column = retinopathy_df[c]
    print("column name: ",c,"number of unique values: ",len(column.unique()),"number of nulls",column.isna().sum())

A description of the dataset can be found [here](https://archive.ics.uci.edu/ml/datasets/Diabetic+Retinopathy+Debrecen+Data+Set#).

In [None]:
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# safe to re-run; the in-place version raised KeyError on the second run.
# NOTE(review): the dataset description linked above appears to list 'Class'
# as the label and attribute 18 as a classifier output. Confirm that dropping
# 'Class' and predicting '18' is intended.
retinopathy_df = retinopathy_df.drop(columns='Class', errors='ignore')

In [None]:
# Column '18' is used as the target, every other remaining column as a feature.
# NOTE(review): presumably '18' is meant to be the label here; the dataset
# description suggests it may be a feature instead. Verify.
X_retinopathy=retinopathy_df.drop(columns='18')
y_retinopathy=retinopathy_df['18']

In [None]:
# Correlation heatmap of the raw retinopathy features.
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(X_retinopathy.corr(), ax=ax)
plt.show()

In [None]:
# VIF filter with the default threshold (5) applied to the retinopathy features.
Mult_Coll = ReduceVIF()

X_retinopathy_clean = Mult_Coll.fit_transform(X_retinopathy)

In [None]:
# Correlation heatmap of the features that survived the VIF filter.
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(X_retinopathy_clean.corr(), ax=ax)
plt.show()

#### Results without VIF reduction

In [None]:
# Fixed seed so the split and the reported accuracy are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_retinopathy, y_retinopathy, test_size=0.2, random_state=42)

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model = LogisticRegression(max_iter=10_000)
model.fit(X_train, y_train)
print(X_train.shape[1])
model.score(X_test, y_test)

#### Results with VIF reduction

In [None]:
# Same seed as the other retinopathy experiments, so they share one train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X_retinopathy_clean, y_retinopathy, test_size=0.2, random_state=42)

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model = LogisticRegression(max_iter=10_000)
model.fit(X_train, y_train)
print(X_train.shape[1])
model.score(X_test, y_test)

#### Results with feature selection based on correlation matrix

In [None]:
# Feature selection by pairwise correlation (default threshold 0.75).
X_no_corr = DeleteCorrelated(X_retinopathy)

# Fixed seed so the split and the reported accuracy are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_no_corr, y_retinopathy, test_size=0.2, random_state=42)

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model = LogisticRegression(max_iter=10_000)
model.fit(X_train, y_train)
print(X_train.shape[1])
model.score(X_test, y_test)

# 3. Breast Cancer Wisconsin

In [None]:
# Load the Breast Cancer Wisconsin data and preview the first rows.
wdbc_df=pd.read_csv('data/wdbc.csv')

wdbc_df.head()

In [None]:
# Target: True where the diagnosis is "M".
y_wdbc = wdbc_df['diagnosis'].eq("M")
# Everything except the identifier, the target and the 'Unnamed: 32' column is a feature.
non_feature_columns = ["id", "diagnosis", "Unnamed: 32"]
X_wdbc = wdbc_df.drop(columns=non_feature_columns)

X_wdbc.info()

As we can see, all features are non-null numeric variables, which means that one-hot encoding is not needed in this case. The only things left to do are to remove collinear and multicollinear features (and maybe some outliers) and to split the data into training and testing sets.

Correlation matrix showing that we should probably remove a fair number of variables

In [None]:
# Correlation heatmap of the WDBC features.
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(X_wdbc.corr(), ax=ax)
plt.show()

Removal of variables based only on correlation

In [None]:
# Drop every feature whose absolute correlation with an earlier column is at
# least 0.8, then list the columns that remain.
X_wdbc_cleaned_corr=DeleteCorrelated(X_wdbc,0.8)
X_wdbc_cleaned_corr.columns

Removal of variables using Variance Inflation Factor (VIF)

In [None]:
# VIF filter with the default threshold (5); show the last rows of the result.
Mult_Coll = ReduceVIF()
X_wdbc_cleaned = Mult_Coll.fit_transform(X_wdbc)
X_wdbc_cleaned.tail()

In [None]:
# Fixed seed so the split and the reported accuracy are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_wdbc_cleaned, y_wdbc, test_size=0.2, random_state=42)

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model = LogisticRegression(max_iter=10_000)
model.fit(X_train, y_train)
model.score(X_test, y_test)

# 4. Ethereum frauds

In [None]:
# Load the Ethereum transaction dataset and preview the first rows.
etherneum_df=pd.read_csv('data/transaction_dataset.csv')

etherneum_df.head()

In [None]:
# Show the values of every column that has fewer than 10 distinct entries
# (NaN counts as an entry).
for c in etherneum_df.columns:
    distinct = etherneum_df[c].unique()
    if len(distinct) < 10:
        print(c, distinct)

In [None]:
# fillna returns a new Series; without the assignment the call had no effect.
erc20_uniq_sent_col = ' ERC20 uniq sent addr.1'
etherneum_df[erc20_uniq_sent_col] = etherneum_df[erc20_uniq_sent_col].fillna(0)
etherneum_df.describe().T

In [None]:
# Columns removed before modelling.
to_drop=['Unnamed: 0',
         'Index',
         'Address',
         ' ERC20 avg time between sent tnx',
         ' ERC20 avg time between rec tnx',
         ' ERC20 avg time between rec 2 tnx',
         ' ERC20 avg time between contract tnx',
         ' ERC20 min val sent contract',
         ' ERC20 max val sent contract',
         ' ERC20 avg val sent contract']
# Rebinding with errors='ignore' keeps the cell safe to re-run; the in-place
# drop raised KeyError once the columns were already gone.
etherneum_df = etherneum_df.drop(columns=to_drop, errors='ignore')

In [None]:
# Number of distinct values in every object-dtype (text) column.
for c, dtype in etherneum_df.dtypes.items():
    if dtype == 'O':
        print(c, len(etherneum_df[c].unique()))

In [None]:
# 'FLAG' is the target; all remaining columns are candidate features.
y_eth=etherneum_df['FLAG']
X_eth_raw=etherneum_df.drop(columns='FLAG')

In [None]:
# X_eth_raw still holds the two text token-type columns. DataFrame.corr()
# raises on non-numeric columns in pandas >= 2, so restrict the heatmap to
# the numeric features explicitly.
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(X_eth_raw.select_dtypes(include='number').corr(), ax=ax)
plt.show()

In [None]:
# VIF filter with threshold 7. The two token-type columns are left out here;
# a later cell one-hot encodes them separately.
Mult_Coll = ReduceVIF(thresh=7)
X_eth_cleaned = Mult_Coll.fit_transform(X_eth_raw.drop(columns=[' ERC20_most_rec_token_type',' ERC20 most sent token type']))

In [None]:
# Correlation heatmap of the features that survived the VIF filter.
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(X_eth_cleaned.corr(), ax=ax)
plt.show()
# One could still consider dropping some of the variables because of high correlation.

#### Results when using only numerical features
without VIF reduction

In [None]:
# Numerical features only: drop the two token-type columns and fill gaps with 0.
X_eth_no_categorical = X_eth_raw.drop(columns=[' ERC20 most sent token type',' ERC20_most_rec_token_type']).fillna(0)
# Fixed seed so the split and the reported accuracy are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_eth_no_categorical,
                                                    y_eth,
                                                    test_size=0.2,
                                                    random_state=42
                                                   )

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model0 = LogisticRegression(max_iter=10_000)
model0.fit(X_train, y_train)
print(X_train.shape[1])
model0.score(X_test, y_test)

#### Results when using only numerical features
with VIF reduction

In [None]:
# Fixed seed so the split and the reported accuracy are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_eth_cleaned, y_eth, test_size=0.2, random_state=42)

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model1 = LogisticRegression(max_iter=10_000)
model1.fit(X_train, y_train)
print(X_eth_cleaned.shape[1])
model1.score(X_test, y_test)

In [None]:
# Missing token types become their own 'unknown' category before encoding.
X_eth_raw_categorical = X_eth_raw[[' ERC20 most sent token type',' ERC20_most_rec_token_type']].fillna("unknown")
# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so neither spelling works everywhere.
# Fit with the default sparse output and densify explicitly instead.
encoder = OneHotEncoder()
X_eth_cat_enc = encoder.fit_transform(X_eth_raw_categorical).toarray()

Fun fact:

#### Results when using only categorical features encoded

In [None]:
# Prefix each indicator with its source feature: the same token name can occur
# in both token-type columns, and raw labels would give duplicate column names.
eth_cat_columns = [
    f'{feature}_{category}'
    for feature, categories in zip(X_eth_raw_categorical.columns, encoder.categories_)
    for category in categories
]
# Fixed seed so the split and the reported gap are repeatable.
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_eth_cat_enc, columns=eth_cat_columns),
                                                    y_eth,
                                                    test_size=0.25,
                                                    random_state=42
                                                   )

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model2 = LogisticRegression(max_iter=10_000)
model2.fit(X_train, y_train)
print(X_train.shape[1])
# Absolute gap between test and train accuracy.
abs(model2.score(X_test, y_test)-model2.score(X_train, y_train))

#### Results with categorical features encoded

In [None]:
# The encoded block is already a dense ndarray when the encoder was asked for
# dense output, and ndarrays have no .toarray(); densify only if it is sparse.
eth_cat_dense = X_eth_cat_enc.toarray() if hasattr(X_eth_cat_enc, 'toarray') else X_eth_cat_enc
# Prefix each indicator with its source feature so column names stay unique.
eth_cat_columns = [
    f'{feature}_{category}'
    for feature, categories in zip(X_eth_raw_categorical.columns, encoder.categories_)
    for category in categories
]
# Rows are in the same order in both blocks, so reuse the numerical index for the join.
X_eth_enc = X_eth_cleaned.join(pd.DataFrame(eth_cat_dense, columns=eth_cat_columns, index=X_eth_cleaned.index))

In [None]:
# Fixed seed so the split and the reported gap are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_eth_enc, y_eth, test_size=0.2, random_state=42)

# max_iter must be an integer; recent scikit-learn releases reject the float 1e20.
model2 = LogisticRegression(max_iter=10_000)
model2.fit(X_train, y_train)
print(X_train.shape[1])
# Absolute gap between test and train accuracy.
abs(model2.score(X_test, y_test)-model2.score(X_train, y_train))