# TODOs

1. Data Generation:
    - numerical data generation - improvements in distribution
    - business rules adjustments (only after numerical data generation is fixed)
    - check the distribution of each numerical feature of the merged data
    - categorical data - add a check that tells you whether the distribution of the synthetic data is OK (e.g. a chi-square goodness-of-fit test; Kolmogorov–Smirnov is designed for continuous data, so it fits the numerical features better)
<br>
<br>
2. Feature Engineering:
    - Generate new features (**custom made**, statistical) - ***Girls***
    - Balance data 
        - research other methods besides SMOTE; **Multiclass problem handling - should we resample for each target class individually or not? Describe all possible prediction scenarios** - ***ALL***
<br>
<br>
3. Feature Selection 
    - research methods for numerical and categorical feature selections
<br>
<br>
4. Modeling:
    - try different models - with at least 5k examples!!! 
        - black box and explainable ones
    - visualizations - compare performances
    - try to explain the black box models (OPTIONAL)
    - hyperparam optimization (OPTIONAL)

# Inputs and tools

## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression



# Increase the maximum number of rows and columns to be displayed
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 1000)

## Categorical data distributions

In [None]:
dists = {
    'sex':{'labels':['M', 'F'], 'values':[0.4854368932,0.5145631068]},
    'lv_educ':{'labels':['Incomplete', 'Primary', 'Basic', 'Secondary', 'Higher'], 'values':[0.0595,0.07788016474,0.2309254283,0.4359496722,0.1957067711]},
    'empl_stat':{'labels':['Employers', 'Self-employed', 'Employed in private sector', 'Employed in public sector', 'Unpaid family workers'], 'values':[0.03631598652,0.07272557095,0.6708785723,0.2126544365,0.00742543367]},
    'marit_stat':{'labels':['Single', 'Married', 'Divorced', 'Widowed'], 'values':[0.397,0.443,0.058,0.102]},
    'house_memb':{'labels':['1', '2', '3', '4', '5', '6', '7+'], 'values':[0.1805,0.3778,0.2387,0.1157,0.0525,0.0238,0.011]},
    'chil_u_18_y':{'labels':['No children under 18', 'One child under 18', 'Two children under 18', 'Three children under 18', 'Four children under 18', 'Five children under 18', 'Six or more children under 18'], 'values':[0.422602157,0.36552047,0.183222339,0.020674764,0.004993779,0.001875149,0.001111341]},
    'nation':{'labels':['Bulgaria', 'EU', 'Other'], 'values':[0.9950198043,0.001146570676,0.003833625045]},
    'religion':{'labels':['Protestant', 'Catholic', 'Orthodox', 'Muslim', 'Other', 'No religion', 'I do not identify myself'], 'values':[0.011,0.008,0.76,0.1,0.002,0.047,0.072]},
    'soc_econ_stat':{'labels':['Economically active', 'Economically inactive'], 'values':[0.6151643031,0.3848356969]},
    'prof_ind':{'labels':['Agriculture, forestry and fisheries', 'Mining and processing industry', 'Utilities (electricity distribution and water supply)', 'Construction', 'Trade, automobile and motorcycle repair', 'Transportation, warehousing and mail', 'Hospitality and restaurant services', 'Creation and distribution of information and creative products, Telecommunications', 'Financial and administrative activities', 'Public administration', 'Education and research', 'Human health and social work', 'Other activities'], 'values':[0.03090815115,0.2353,0.029,0.05523651408,0.1645618594,0.06439111505,0.05161626582,0.03936261795,0.07356911161,0.04836124844,0.104946474,0.06006423384,0.04269692032]},
    'prof_stat':{'labels':['Management contract', 'Employment contract', 'Civil contract', 'Self-employed', 'Unemployed', 'Pensioner'], 'values':[0.01783393631,0.4732428049,0.02497602302,0.0385148509,0.167699009,0.277733376]},
    'count_house':{'labels':['0', '1', '2+'], 'values':[0.37,0.6,0.03]},
    'own_field':{'labels':['YES', 'NO'], 'values':[0.2621335023,0.737866497676384]},
    'num_car_house':{'labels':['0', '1', '2', '3+'], 'values':[0.5714285714,0.36,0.06428571429,0.004285714286]},
    'own_rent_house':{'labels':['my own', 'rented'], 'values':[0.843,0.157]},
    'edu':{'labels':['Educational Sciences', 'Humanities', 'Social, Economic and Legal Sciences', 'Natural Sciences, Mathematics and Informatics', 'Technical Sciences', 'Agricultural Sciences and Veterinary Medicine', 'Health and Sports', 'Arts', 'Security and Defense'], 'values':[0.07591254907,0.0461889827,0.5266633332,0.04571641724,0.1533297557,0.01776640163,0.0930038303,0.02247374859,0.01891291637]},
    'temp':{'labels':['Choleric', 'Phlegmatic', 'Sanguine', 'Melancholic'], 'values':[0.38,0.11,0.23,0.28]},
    'invest_exp':{'labels':['0', '1-5', '6-10', '11-15', '16-25'], 'values':[0.7,0.2,0.06,0.03,0.01]},
    'shares':{'labels':['YES', 'NO'], 'values':[0.003394353314,0.9966056467]},
    'corp_oblig':{'labels':['YES', 'NO'], 'values':[0.0003792213936,0.9996207786]},
    'oth':{'labels':['YES', 'NO'], 'values':[0.000592597502012084,0.999407402497988]},
    'inv_fund':{'labels':['YES', 'NO'], 'values':[0.06491199709,0.9350880029]},
    'cash':{'labels':['YES', 'NO'], 'values':[0.04105169923,0.9589483008]},
    'crypto':{'labels':['YES', 'NO'], 'values':[0.003284135938,0.9967158641]},
    'gov_bond':{'labels':['YES', 'NO'], 'values':[0.06835666691,0.9316433331]},
    'deposits':{'labels':['YES', 'NO'], 'values':[0.8180293286,0.1819706714]},
    'banking':{'labels':['Online', 'Offline'], 'values':[0.09,0.91]},
    'bk_oprat':{'labels':['Up to 7', 'From 8 to 10', 'From 11 to 13', 'From 14 to 18', 'From 19 to more'], 'values':[0.0084,0.2424,0.4729,0.2615,0.0148]},
    'bk_dc':{'labels':['Under one', 'One', 'Two', 'Three'], 'values':[0.01,0.57,0.38,0.04]},
    'bk_cc':{'labels':['YES', 'NO'], 'values':[0.17,0.83]},
    'bk_acc':{'labels':['YES', 'NO'], 'values':[0.8634087377,0.1365912623]},
    'ins_prop':{'labels':['YES', 'NO'], 'values':[0.05,0.95]},
    'ins_life':{'labels':['YES', 'NO'], 'values':[0.09,0.91]},
    'ins_casco':{'labels':['YES', 'NO'], 'values':[0.03,0.97]},
    'health_ins':{'labels':['YES', 'NO'], 'values':[0.02,0.98]},
    'overdraft':{'labels':['YES', 'NO'], 'values':[0.19,0.81]},
    'cons_cred':{'labels':['YES', 'NO'], 'values':[0.26,0.74]},
    'mortgage':{'labels':['YES', 'NO'], 'values':[0.02,0.98]},
    'car_leas':{'labels':['YES', 'NO'], 'values':[0.2,0.8]},
    'pens_ins':{'labels':['YES', 'NO'], 'values':[0.11,0.89]},
    'overdraft_app':{'labels':['YES', 'NO'], 'values':[0.2439,0.7561]},
    'cons_cred_app':{'labels':['YES', 'NO'], 'values':[0.305299502487562,0.694700497512438]},
    'mortgage_app':{'labels':['YES', 'NO'], 'values':[0.03,0.97]},
    'bk_cc_app':{'labels':['YES', 'NO'], 'values':[0.21,0.79]}
}

In [None]:
len(dists)

## Correlation of Numerical data 

In [None]:
corr = {
    'features': ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby'],
    'age': [1, -0.00665947056405372, 0.00291644965339247, 0.0107779942638097, 0.00698674581731255, 0.00729153655132963, 0.0099866509330216, 0.00931630696561133],
    'ind_risk': [-0.00665947056405372, 1, 0.0039918072709289, 0.00806259039194059, 0.00457023635440603, 0.0061985340641631, 0.00768699810849585, -0.00332322616613201],
    'income': [0.00291644965339247, 0.0039918072709289, 1, 0.560949334881676, 0.58892666343229, 0.581907424628933, 0.562946509689962, 0.352350802339294],
    'pers_exp': [0.0107779942638097, 0.00806259039194059, 0.560949334881676, 1, 0.928449923861951, 0.929598634668897, 0.934775947642248, 0.714298364869941],
    'house_exp': [0.00698674581731255, 0.00457023635440603, 0.58892666343229, 0.928449923861951, 1, 0.93031279279417, 0.927846735467478, 0.679286362990223],
    'taxes': [0.00729153655132963, 0.0061985340641631, 0.581907424628933, 0.929598634668897, 0.93031279279417, 1, 0.92920510128812, 0.689442053350162],
    'transp_telecom': [0.0099866509330216, 0.00768699810849585, 0.562946509689962, 0.934775947642248, 0.927846735467478, 0.92920510128812, 1, 0.714114127908189],
    'hobby': [0.00931630696561133, -0.00332322616613201, 0.352350802339294, 0.714298364869941, 0.679286362990223, 0.689442053350162, 0.714114127908189, 1]
}

## Extract distributions from dictionary

In [None]:
def extract_dists(x, dists):
    '''
    Return the distribution of one feature as a single-row DataFrame.

    x is the name of the feature (a key of dists)
    dists is the dictionary with all distributions; dists[x] must hold
    the lists 'labels' and 'values'

    The columns of the result are the labels, the only row holds the
    values and is indexed by the feature name.
    '''
    import pandas as pd

    feature_spec = dists[x]
    labels = feature_spec['labels']
    shares = feature_spec['values']
    return pd.DataFrame(data=[shares], columns=labels, index=pd.Index([x]))

## Convert the dictionary with correlation matrix to dataframe

In [None]:
def corr2df(corr):
    '''
    A function to create correlation dataframe from dictionary corr, where
    corr is the dictionary with the correlation matrix.

    corr must contain the key 'features' (list of row labels) plus one
    list of correlation values per feature. The returned DataFrame uses
    the feature names as an unnamed index and as column names.
    '''
    # Bind pandas under the alias the body actually uses, so the function
    # does not silently depend on a module-level `pd`.
    import pandas as pd

    corr_df = pd.DataFrame(corr).set_index('features')
    # Drop the leftover "features" index label so the frame prints like a
    # plain correlation matrix.
    corr_df.index.name = None
    return corr_df

# Data Generation (Data synthesis)

## Categorical Data

In [None]:
import numpy as np

# Given dictionary
dists = {
    'sex':{'labels':['M', 'F'], 'values':[0.4854368932,0.5145631068]},
    'lv_educ':{'labels':['Incomplete', 'Primary', 'Basic', 'Secondary', 'Higher'], 'values':[0.0595,0.07788016474,0.2309254283,0.4359496722,0.1957067711]},
    'empl_stat':{'labels':['Employers', 'Self-employed', 'Employed in private sector', 'Employed in public sector', 'Unpaid family workers'], 'values':[0.03631598652,0.07272557095,0.6708785723,0.2126544365,0.00742543367]},
    'marit_stat':{'labels':['Single', 'Married', 'Divorced', 'Widowed'], 'values':[0.397,0.443,0.058,0.102]},
    'house_memb':{'labels':['1', '2', '3', '4', '5', '6', '7+'], 'values':[0.1805,0.3778,0.2387,0.1157,0.0525,0.0238,0.011]},
    'chil_u_18_y':{'labels':['No children under 18', 'One child under 18', 'Two children under 18', 'Three children under 18', 'Four children under 18', 'Five children under 18', 'Six or more children under 18'], 'values':[0.422602157,0.36552047,0.183222339,0.020674764,0.004993779,0.001875149,0.001111341]},
    'nation':{'labels':['Bulgaria', 'EU', 'Other'], 'values':[0.9950198043,0.001146570676,0.003833625045]},
    'religion':{'labels':['Protestant', 'Catholic', 'Orthodox', 'Muslim', 'Other', 'No religion', 'I do not identify myself'], 'values':[0.011,0.008,0.76,0.1,0.002,0.047,0.072]},
    'soc_econ_stat':{'labels':['Economically active', 'Economically inactive'], 'values':[0.6151643031,0.3848356969]},
    'prof_ind':{'labels':['Agriculture, forestry and fisheries', 'Mining and processing industry', 'Utilities (electricity distribution and water supply)', 'Construction', 'Trade, automobile and motorcycle repair', 'Transportation, warehousing and mail', 'Hospitality and restaurant services', 'Creation and distribution of information and creative products, Telecommunications', 'Financial and administrative activities', 'Public administration', 'Education and research', 'Human health and social work', 'Other activities'], 'values':[0.03090815115,0.2353,0.029,0.05523651408,0.1645618594,0.06439111505,0.05161626582,0.03936261795,0.07356911161,0.04836124844,0.104946474,0.06006423384,0.04269692032]},
    'prof_stat':{'labels':['Management contract', 'Employment contract', 'Civil contract', 'Self-employed', 'Unemployed', 'Pensioner'], 'values':[0.01783393631,0.4732428049,0.02497602302,0.0385148509,0.167699009,0.277733376]},
    'count_house':{'labels':['0', '1', '2+'], 'values':[0.37,0.6,0.03]},
    'own_field':{'labels':['YES', 'NO'], 'values':[0.2621335023,0.737866497676384]},
    'num_car_house':{'labels':['0', '1', '2', '3+'], 'values':[0.5714285714,0.36,0.06428571429,0.004285714286]},
    'own_rent_house':{'labels':['my own', 'rented'], 'values':[0.843,0.157]},
    'edu':{'labels':['Educational Sciences', 'Humanities', 'Social, Economic and Legal Sciences', 'Natural Sciences, Mathematics and Informatics', 'Technical Sciences', 'Agricultural Sciences and Veterinary Medicine', 'Health and Sports', 'Arts', 'Security and Defense'], 'values':[0.07591254907,0.0461889827,0.5266633332,0.04571641724,0.1533297557,0.01776640163,0.0930038303,0.02247374859,0.01891291637]},
    'temp':{'labels':['Choleric', 'Phlegmatic', 'Sanguine', 'Melancholic'], 'values':[0.38,0.11,0.23,0.28]},
    'invest_exp':{'labels':['0', '1-5', '6-10', '11-15', '16-25'], 'values':[0.7,0.2,0.06,0.03,0.01]},
    'shares':{'labels':['YES', 'NO'], 'values':[0.003394353314,0.9966056467]},
    'corp_oblig':{'labels':['YES', 'NO'], 'values':[0.0003792213936,0.9996207786]},
    'oth':{'labels':['YES', 'NO'], 'values':[0.000592597502012084,0.999407402497988]},
    'inv_fund':{'labels':['YES', 'NO'], 'values':[0.06491199709,0.9350880029]},
    'cash':{'labels':['YES', 'NO'], 'values':[0.04105169923,0.9589483008]},
    'crypto':{'labels':['YES', 'NO'], 'values':[0.003284135938,0.9967158641]},
    'gov_bond':{'labels':['YES', 'NO'], 'values':[0.06835666691,0.9316433331]},
    'deposits':{'labels':['YES', 'NO'], 'values':[0.8180293286,0.1819706714]},
    'banking':{'labels':['Online', 'Offline'], 'values':[0.09,0.91]},
    'bk_oprat':{'labels':['Up to 7', 'From 8 to 10', 'From 11 to 13', 'From 14 to 18', 'From 19 to more'], 'values':[0.0084,0.2424,0.4729,0.2615,0.0148]},
    'bk_dc':{'labels':['Under one', 'One', 'Two', 'Three'], 'values':[0.01,0.57,0.38,0.04]},
    'bk_cc':{'labels':['YES', 'NO'], 'values':[0.17,0.83]},
    'bk_acc':{'labels':['YES', 'NO'], 'values':[0.8634087377,0.1365912623]},
    'ins_prop':{'labels':['YES', 'NO'], 'values':[0.05,0.95]},
    'ins_life':{'labels':['YES', 'NO'], 'values':[0.09,0.91]},
    'ins_casco':{'labels':['YES', 'NO'], 'values':[0.03,0.97]},
    'health_ins':{'labels':['YES', 'NO'], 'values':[0.02,0.98]},
    'overdraft':{'labels':['YES', 'NO'], 'values':[0.19,0.81]},
    'cons_cred':{'labels':['YES', 'NO'], 'values':[0.26,0.74]},
    'mortgage':{'labels':['YES', 'NO'], 'values':[0.02,0.98]},
    'car_leas':{'labels':['YES', 'NO'], 'values':[0.2,0.8]},
    'pens_ins':{'labels':['YES', 'NO'], 'values':[0.11,0.89]},
    'overdraft_app':{'labels':['YES', 'NO'], 'values':[0.2439,0.7561]},
    'cons_cred_app':{'labels':['YES', 'NO'], 'values':[0.305299502487562,0.694700497512438]},
    'mortgage_app':{'labels':['YES', 'NO'], 'values':[0.03,0.97]},
    'bk_cc_app':{'labels':['YES', 'NO'], 'values':[0.21,0.79]}
}

# Size of the synthetic categorical sample
num_rows = 250000


def _sample_feature(spec, size):
    """Draw `size` labels for one feature according to its listed shares."""
    weights = np.asarray(spec['values'])
    # Re-normalise so the probabilities passed to the sampler sum to 1
    weights = weights / np.sum(weights)
    return np.random.choice(spec['labels'], size=size, p=weights)


# One independently sampled column per feature, in the order of `dists`
dataset = {
    feature: _sample_feature(spec, num_rows)
    for feature, spec in dists.items()
}

# Printing the first 10 rows of the synthetic dataset
# for key in dataset:
#     print(f"{key}: {dataset[key][:10]}")


In [None]:
import pandas as pd

cat_df = pd.DataFrame(dataset)

In [None]:
cat_df.head()

In [None]:
cat_df.shape

In [None]:
cat_df["sex"].value_counts(normalize=True)

## Save

In [None]:
cat_df.to_csv("df_cat_No_Br_250k_v1.csv")

## Numerical Data (to be replaced by better approach)

In [None]:
import numpy as np

corr = {
    'features': ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby'],
    'age': [1, -0.00665947056405372, 0.00291644965339247, 0.0107779942638097, 0.00698674581731255, 0.00729153655132963, 0.0099866509330216, 0.00931630696561133],
    'ind_risk': [-0.00665947056405372, 1, 0.0039918072709289, 0.00806259039194059, 0.00457023635440603, 0.0061985340641631, 0.00768699810849585, -0.00332322616613201],
    'income': [0.00291644965339247, 0.0039918072709289, 1, 0.560949334881676, 0.58892666343229, 0.581907424628933, 0.562946509689962, 0.352350802339294],
    'pers_exp': [0.0107779942638097, 0.00806259039194059, 0.560949334881676, 1, 0.928449923861951, 0.929598634668897, 0.934775947642248, 0.714298364869941],
    'house_exp': [0.00698674581731255, 0.00457023635440603, 0.58892666343229, 0.928449923861951, 1, 0.93031279279417, 0.927846735467478, 0.679286362990223],
    'taxes': [0.00729153655132963, 0.0061985340641631, 0.581907424628933, 0.929598634668897, 0.93031279279417, 1, 0.92920510128812, 0.689442053350162],
    'transp_telecom': [0.0099866509330216, 0.00768699810849585, 0.562946509689962, 0.934775947642248, 0.927846735467478, 0.92920510128812, 1, 0.714114127908189],
    'hobby': [0.00931630696561133, -0.00332322616613201, 0.352350802339294, 0.714298364869941, 0.679286362990223, 0.689442053350162, 0.714114127908189, 1]
}

In [None]:
corr.keys()

In [None]:
# Collect the per-feature rows of `corr` (every entry except the 'features'
# label list), keeping the dictionary's insertion order, and stack them
# into a 2-D array.
l_corr_matrix = [row for name, row in corr.items() if name != "features"]
arr_corr_matrix = np.array(l_corr_matrix)

In [None]:
print(arr_corr_matrix)

## Generate synthetic data

In [None]:
# [lower, upper] range of every numerical feature
possible_values = {
    'age': [20, 86],
    'ind_risk': [0, 1],
    'income': [0, 150000],
    'pers_exp': [0, 6000],
    'house_exp': [0, 4000],
    'taxes': [0, 2500],
    'transp_telecom': [0, 2500],
    'hobby': [0, 3000],
}

# The same ranges as (lower, upper) tuples, in the dictionary's order
feature_bounds = [tuple(bounds) for bounds in possible_values.values()]


import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler

def generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds):
    """
    Generate correlated synthetic features with a Gaussian copula.

    Samples are drawn from a zero-mean multivariate normal whose covariance
    is `correlation_matrix`, mapped through the standard normal CDF and
    then stretched linearly onto the requested range of every feature.

    Parameters:
        correlation_matrix (array-like): Square, symmetric, positive
            definite matrix. The standard normal CDF yields uniform
            marginals only when its diagonal is 1.
        num_samples (int): Number of rows to generate.
        feature_bounds (sequence): One (lower, upper) pair per feature, in
            the same order as the rows of `correlation_matrix`.

    Returns:
        np.ndarray: Array of shape (num_samples, num_features).

    Raises:
        ValueError: If the matrix is not square, not symmetric, not
            positive definite, or if the number of bounds does not match
            the number of features.
    """
    correlation_matrix = np.asarray(correlation_matrix, dtype=float)
    if correlation_matrix.ndim != 2 or correlation_matrix.shape[0] != correlation_matrix.shape[1]:
        raise ValueError("Correlation matrix must be square.")
    num_features = correlation_matrix.shape[0]

    bounds = [tuple(pair) for pair in feature_bounds]
    if len(bounds) != num_features:
        raise ValueError(
            f"Expected {num_features} (lower, upper) bounds, got {len(bounds)}."
        )

    # Check if the correlation matrix is valid (symmetric and positive definite)
    if not np.allclose(correlation_matrix, correlation_matrix.T):
        raise ValueError("Correlation matrix must be symmetric.")
    # eigvalsh is the solver for symmetric matrices and always returns real
    # eigenvalues, so the sign test below is well defined.
    if not np.all(np.linalg.eigvalsh(correlation_matrix) > 0):
        raise ValueError("Correlation matrix must be positive definite.")

    lower_bounds = np.array([pair[0] for pair in bounds], dtype=float)
    upper_bounds = np.array([pair[1] for pair in bounds], dtype=float)

    # Generate synthetic data using multivariate normal distribution
    mean = np.zeros(num_features)
    normal_samples = np.random.multivariate_normal(mean, correlation_matrix, num_samples)

    # Gaussian copula step: the normal CDF turns every column into values
    # in (0, 1) while keeping the dependence between the columns.
    unit_samples = norm.cdf(normal_samples)

    # Scale every column to its own [lower, upper] range (broadcast over rows)
    return lower_bounds + unit_samples * (upper_bounds - lower_bounds)

# Example usage: draw the synthetic numerical sample.
# The target correlation matrix is rebuilt from the row list `l_corr_matrix`.
correlation_matrix = np.array(l_corr_matrix)

# Number of synthetic rows to generate
num_samples = 250000

# One column per entry of `feature_bounds`, scaled to that entry's range
synthetic_data = generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds)
print(synthetic_data)


In [None]:
import pandas as pd

adjusted_df = pd.DataFrame(synthetic_data, columns=possible_values.keys())
adjusted_df

In [None]:
pd.DataFrame(correlation_matrix)

## Plot distributions

In [None]:
import plotly.express as px

def plot_distribution(data, column_name):
    """
    Show a 50-bin Plotly histogram of one DataFrame column.

    Parameters:
        data (pd.DataFrame): Frame that holds the column.
        column_name (str): Column whose distribution is drawn.

    Returns:
        None

    Raises:
        ValueError: If `column_name` is not a column of `data`.
    """
    if column_name in data.columns:
        histogram = px.histogram(
            data,
            x=column_name,
            nbins=50,
            title=f'Distribution of {column_name}',
        )
        histogram.show()
        return None

    # Unknown column: fail loudly instead of plotting nothing
    raise ValueError(f"Column '{column_name}' not found in the DataFrame.")


In [None]:
for col in adjusted_df.columns:
    plot_distribution(adjusted_df, col)

## Plot the two correlation matrices

In [None]:
possible_values.keys()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ... (Code from the previous answer)

# Correlation matrix realised by the synthetic numerical data
correlation_matrix_adjusted = adjusted_df.corr()

# Display the correlation matrix
print("Adjusted Correlation Matrix:")
print(correlation_matrix_adjusted)

# Convert the original correlation data (corr) to a DataFrame
original_corr_df = pd.DataFrame(corr)
original_corr_df.set_index('features', inplace=True)

# Display the original correlation data
print("\nOriginal Correlation Matrix:")
print(original_corr_df)

# Plot the correlation matrix heatmaps for both adjusted and original data side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices", fontsize=16)

panels = (
    (axes[0], correlation_matrix_adjusted, "Adjusted Correlation Matrix"),
    (axes[1], original_corr_df, "Original Correlation Matrix"),
)
for ax, matrix, title in panels:
    # Pin the colour scale to the full correlation range [-1, 1]. Without it
    # each panel autoscales to its own min/max, so the same colour would
    # mean different values in the two heatmaps.
    image = ax.imshow(matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    ticks = np.arange(len(matrix))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(matrix.columns, rotation=45)
    ax.set_yticklabels(matrix.columns)
    ax.set_title(title)

# Both panels share one scale, so a single colorbar documents both
fig.colorbar(image, ax=axes, label="Correlation")

plt.show()


In [None]:
# Distance between two correlation matrices, measured as the Frobenius
# norm of their element-wise difference
def correlation_distance(matrix1, matrix2):
    difference = matrix1 - matrix2
    return np.linalg.norm(difference, ord='fro')


current_distance = correlation_distance(matrix1=correlation_matrix_adjusted, matrix2=original_corr_df)

In [None]:
current_distance

## Save

In [None]:
adjusted_df.to_csv("adjusted_df_num_No_BR_250k_v1.csv")

## Combine the two datasets

In [None]:
import pandas as pd

In [None]:
# Re-load the numerical and categorical parts written by the two "Save"
# cells earlier in the notebook. The file names must match the ones passed
# to `to_csv` there (they carry the `_v1` suffix); the first CSV column is
# the row index written by `to_csv`.
num_df = pd.read_csv("adjusted_df_num_No_BR_250k_v1.csv", index_col=[0])
cat_df = pd.read_csv("df_cat_No_Br_250k_v1.csv", index_col=[0])

In [None]:
df_merged = pd.concat([num_df, cat_df], axis=1)
df_merged.shape

In [None]:
df_merged.head()

## Apply the business rules

In [None]:
df_merged.columns

**INVESTMENT IN STOCKS NOT FOUND!!!**
<br> Does "currency investments" mean cash??
<br> Investment in government securities??
<br> Investment in stocks??

In [None]:
import pandas as pd


# Business rules as a list of dictionaries.
#
# Each rule selects the rows where BOTH conditions hold:
#   dataframe[<Independent feature>] <Independent feature value>
#   dataframe[<Dependent feature>]   <Dependent feature value filter>
# The two "value" strings are Python expression fragments that are appended
# to the column reference, so they must use the exact labels of the
# categorical features ('YES'/'NO', 'No children under 18', ...).
#
# NOTE: ordering comparisons on string labels (e.g. house_memb >'2',
# invest_exp >'0') are lexicographic. They work for the current labels
# because every label starts with a single distinguishing digit, but they do
# not work for descriptive labels such as those of `chil_u_18_y`, which are
# therefore matched with `.isin([...])`.
full_business_rules = [
    {"Independent feature": "marit_stat", "Independent feature value": "=='Married'", "Dependent feature": "house_memb", "Dependent feature value filter": ">'2'", "Note": "The number of household members in family households is more likely to be greater than 2"},
    {"Independent feature": "prof_ind", "Independent feature value": "=='Financial and administrative activities'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "They are more likely to own a bank account"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "invest_exp", "Dependent feature value filter": "=='0'", "Note": "Under 24s are less likely to have investment experience. Between 35-44 and 45-54 are more likely to have extensive investment experience"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "lv_educ", "Dependent feature value filter": "!='Higher'", "Note": "Under 24s are less likely to have a college degree"},
    # Fewer than two children: match the descriptive labels explicitly
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "chil_u_18_y", "Dependent feature value filter": ".isin(['No children under 18', 'One child under 18'])", "Note": "From 20-24, it is less likely to have more than 1 child under 18"},
    {"Independent feature": "invest_exp", "Independent feature value": ">'0'", "Dependent feature": "deposits", "Dependent feature value filter": "=='YES'", "Note": "They are more likely to own a bank account"},
    {"Independent feature": "shares", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "corp_oblig", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "oth", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "inv_fund", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "cash", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "crypto", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "gov_bond", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "bk_acc", "Dependent feature value filter": "=='NO'", "Note": "Under 24s are less likely to have a checking account"},
    # NOTE(review): `age` is generated in [20, 86], so this rule currently
    # matches no rows - confirm whether it is still needed.
    {"Independent feature": "age", "Independent feature value": "<18", "Dependent feature": "bk_acc", "Dependent feature value filter": "=='NO'", "Note": "Under 18 is not possible to have a current account"},
    {"Independent feature": "lv_educ", "Independent feature value": "=='Higher'", "Dependent feature": "income", "Dependent feature value filter": ">27601", "Note": "A higher level of education implies earnings in the upper range"},
    # Two or more children: match the descriptive labels explicitly
    {"Independent feature": "chil_u_18_y", "Independent feature value": ".isin(['Two children under 18', 'Three children under 18', 'Four children under 18', 'Five children under 18', 'Six or more children under 18'])", "Dependent feature": "house_memb", "Dependent feature value filter": ">'3'", "Note": "The number of household members is directly dependent on the number of children under 18"},
    {"Independent feature": "lv_educ", "Independent feature value": "=='Higher'", "Dependent feature": "soc_econ_stat", "Dependent feature value filter": "=='Economically active'", "Note": "A higher level of education implies an economically active status"},
    # NOTE(review): `taxes` is generated in [0, 2500], so '>2500' currently
    # matches no rows - confirm the intended threshold.
    {"Independent feature": "income", "Independent feature value": ">27601", "Dependent feature": "taxes", "Dependent feature value filter": ">2500", "Note": "Earnings in the upper range correspond to higher taxes and insurance"},
]


# Function to apply a single business rule
def apply_business_rule(rule, dataframe):
    """
    Return the rows of `dataframe` that satisfy both conditions of a rule.

    Parameters:
        rule (dict): Needs the keys "Independent feature",
            "Independent feature value", "Dependent feature" and
            "Dependent feature value filter". The two feature entries are
            column names; the two value entries are Python expression
            fragments (e.g. "<25", "=='Married'") that are appended to the
            column reference.
        dataframe (pd.DataFrame): Data to filter.

    Returns:
        pd.DataFrame: The subset of `dataframe` where both conditions hold.
    """
    independent_feature = rule["Independent feature"]
    independent_feature_value = rule["Independent feature value"]
    dependent_feature = rule["Dependent feature"]
    dependent_feature_value_filter = rule["Dependent feature value filter"]

    # Boolean-mask expression, e.g.
    # (dataframe['marit_stat'] =='Married') & (dataframe['house_memb'] >'2')
    mask_expression = (
        f"(dataframe['{independent_feature}'] {independent_feature_value})"
        f" & (dataframe['{dependent_feature}'] {dependent_feature_value_filter})"
    )

    print("filter_condition: ", f"[{mask_expression}]")

    # NOTE(security): eval executes the rule text as Python code. This is
    # acceptable only because the rules are hard-coded in this notebook;
    # never feed rules from an untrusted source through this function. The
    # namespace is limited to `dataframe` so a rule cannot reach other
    # notebook variables.
    row_mask = eval(mask_expression, {}, {"dataframe": dataframe})

    # Filter the frame that was passed in, not a notebook-level global
    return dataframe[row_mask]

# Apply all business rules to the DataFrame: every rule contributes the
# subset of rows of `df_merged` that satisfies both of its conditions.
filtered_dfs = []
for rule in full_business_rules:
    df_filtered = apply_business_rule(rule, df_merged)

    # TODO: compare the distribution of the rule's independent feature in
    # `df_filtered` with its distribution before filtering; if they differ,
    # adjust the filtered data so it follows the original distribution.
    # TODO: do the same for the dependent feature.

    filtered_dfs.append(df_filtered)

# Concatenate all filtered DataFrames. A row selected by several rules shows
# up once per rule, so rows with identical values are collapsed afterwards.
all_filtered_dfs = pd.concat(filtered_dfs)
df_BR_applied = all_filtered_dfs.drop_duplicates()

# print(final_df)


In [None]:
df_BR_applied

## Save

In [None]:
df_BR_applied.to_csv("df_BR_applied_v1.csv")

# Feature Engineering

In [None]:
df = pd.read_csv("df_BR_applied_v1.csv", index_col="Unnamed: 0")
data_sample = df.sample(n=100)

In [None]:
df["mortgage"].value_counts()

## Generate new features

In [None]:
def generate_new_features(data):
    """Placeholder for feature generation; not implemented yet (returns None)."""
    pass

## Adjust current features (cleaning, imputing, deleting)

In [None]:
def clean_features(data):
    """Placeholder for feature cleaning; not implemented yet (returns None)."""
    pass

def impute_features(data):
    """Placeholder for feature imputation; not implemented yet (returns None)."""
    pass

def del_features(data):
    """Placeholder for feature deletion; not implemented yet (returns None)."""
    pass

## Split on categorical and numerical

## Data encoding

In [None]:
# Every feature described in `dists` is categorical
cat_cols = [*dists]
print("few categorical columns: ", cat_cols[:5])

# All remaining columns of `df` are treated as numerical
num_cols = list(filter(lambda name: name not in cat_cols, df.columns))
print("few numerical columns: ", num_cols[:5])

In [None]:
def data_encoding(data, categorical_cols):
    """One-hot encode `categorical_cols` of `data`; other columns are kept as they are."""
    encoded = pd.get_dummies(data, columns=categorical_cols)
    return encoded


In [None]:
encoded_data_sample = data_encoding(data_sample, categorical_cols=cat_cols)

In [None]:
encoded_data_sample

In [None]:
encoded_data_sample.shape

## Feature scaling (data standardization)

Two common options are min-max scaling and standardization (z-score normalization).

### Split the data into numerical and categorical

In [None]:
def data_split_cat_num(data, numerical_cols):
    """
    Split `data` column-wise into a non-numerical and a numerical part.

    Parameters:
        data (pd.DataFrame): Frame to split; it is not modified.
        numerical_cols (list): Names of the numerical columns.

    Returns:
        tuple: (cat_data, num_data). `num_data` holds `numerical_cols`,
        `cat_data` holds every other column. Both get a fresh 0..n-1 index
        so they line up row by row when concatenated again.
    """
    numerical_cols = list(numerical_cols)
    other_cols = [col for col in data.columns if col not in numerical_cols]

    # drop=True discards the old index instead of inserting it as an extra
    # "index" column, which would otherwise be standardised and end up among
    # the predictors. Returning new frames (no inplace) also avoids
    # modifying a slice of `data`.
    num_data = data[numerical_cols].reset_index(drop=True)
    cat_data = data[other_cols].reset_index(drop=True)
    return cat_data, num_data


In [None]:
encoded_cat_data, encoded_num_data = data_split_cat_num(data=encoded_data_sample, 
                                                       numerical_cols=num_cols)

In [None]:
encoded_cat_data.head()

In [None]:
encoded_num_data

### TODO adjust the function below; Should work only with Num features

In [None]:
def data_standardization(data):
    """
    Standardise every column of `data` with scikit-learn's StandardScaler.

    Parameters:
        data (pd.DataFrame): Numerical columns to standardise.

    Returns:
        pd.DataFrame: Scaled values with the same column names and the same
        index as `data`.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(data)

    # fit_transform returns a bare array. Re-attach the input's index as
    # well as its columns, so a later pd.concat(axis=1) with other parts of
    # the same data aligns on the right rows.
    standardized_df = pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

    return standardized_df

test standardization with all features (not just the numerical ones)

In [None]:
encoded_data_sample_standardized_numerical = data_standardization(encoded_num_data)

In [None]:
encoded_cat_data

In [None]:
encoded_data_sample_standardized_numerical

In [None]:
encoded_data_sample_standardized = pd.concat([encoded_cat_data, encoded_data_sample_standardized_numerical],
                                            axis=1)

In [None]:
encoded_data_sample_standardized.shape

## Delete binary cols (columns that have only two possible options as values)

Will keep only the "YES" features (e.g. mortgage_YES - a value of 1 here means the client wants to have a mortgage, 0 - otherwise) to reduce the size of the dataset

In [None]:
def del_NO_cols(data,
                additional_cols_to_del=('banking_Offline',
                                        'own_rent_house_my own',
                                        'soc_econ_stat_Economically inactive',
                                        'sex_F')):
    """Drop the redundant half of every one-hot encoded binary feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded frame. It is not modified.
    additional_cols_to_del : iterable of str, str or None, optional
        Extra columns to drop on top of the ``*_NO`` columns. A single column
        name may be passed as a plain string; ``None`` means "no extra columns".

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without the ``*_NO`` columns and the extra columns.

    Raises
    ------
    KeyError
        If one of the extra columns is not present in ``data``.
    """
    # Normalise the argument: a bare string would otherwise be iterated
    # character by character.
    if additional_cols_to_del is None:
        extra_cols = []
    elif isinstance(additional_cols_to_del, str):
        extra_cols = [additional_cols_to_del]
    else:
        extra_cols = list(additional_cols_to_del)

    # select all "_NO" columns
    cols_to_del = list(data.filter(regex='_NO$').columns)

    # and extend the list with additional columns to be deleted
    cols_to_del.extend(extra_cols)

    # De-duplicate while keeping order (an extra column may itself end in "_NO").
    cols_to_del = list(dict.fromkeys(cols_to_del))

    reduced_data = data.drop(cols_to_del, axis=1)

    return reduced_data

In [None]:
encoded_data_sample_reduced = del_NO_cols(encoded_data_sample_standardized)

In [None]:
encoded_data_sample_reduced.shape

In [None]:
encoded_data_sample_reduced.head()

## Split data - predictors and target

Target features:

    Overdraft
    Consumer credit
    Mortgage loan
    Credit card


In [None]:
encoded_data_sample_reduced.columns

In [None]:
target_columns = ["overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES"]

def split_pred_target(data, target_cols):
    """Separate the predictor columns from the target columns.

    Every column of ``data`` that is not listed in ``target_cols`` is a
    predictor. Returns the pair ``(X_data, y_data)``, where ``X_data`` keeps
    the predictors in their original order and ``y_data`` holds
    ``target_cols`` in the order given.
    """
    feature_names = []
    for name in data.columns:
        if name in target_cols:
            continue
        feature_names.append(name)

    return data[feature_names], data[target_cols]



In [None]:
X, y = split_pred_target(data=encoded_data_sample_reduced, target_cols=target_columns)

In [None]:
y.shape

In [None]:
y

In [None]:
X.shape

In [None]:
y.columns

## Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state is fixed so the
# split is reproducible between runs.
# NOTE(review): the numerical features were standardized on the full sample
# before this split, so test-set statistics influence the scaler. Consider
# fitting the scaler on the training part only - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Balance the data (target)

Multiple ways to do that:
- SMOTE
- **TODO** RandomOverSampler with correlation-aware sampling (ROS-CAS)


We split `y_train` (and `y_test`) into four separate target vectors, one per target column. The idea is to end up with four separate models, each making predictions for one target.

### SMOTE

In [None]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter


# One target Series per product, taken from the train and the test partition.
_per_target_cols = ("overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES")

(y_train_overdraft,
 y_train_cons_cred,
 y_train_mortgage,
 y_train_bk_cc) = (y_train[col] for col in _per_target_cols)

(y_test_overdraft,
 y_test_cons_cred,
 y_test_mortgage,
 y_test_bk_cc) = (y_test[col] for col in _per_target_cols)


#### Try with 4 different models

In [None]:
y_train_overdraft.name

In [None]:
def oversample_with_SMOTE(train_X, train_y):
    """Oversample the training data with SMOTE (``random_state=42``).

    ``train_X`` is the predictor DataFrame and ``train_y`` a named target
    Series. Prints the class counts of the target before and after
    resampling, and returns ``(df_X, df_y)``: the resampled predictors and the
    resampled target as DataFrames (the target as a single column named after
    ``train_y``).
    """
    sampler = SMOTE(random_state=42)
    resampled_X, resampled_y = sampler.fit_resample(train_X, train_y)
    # TODO ...the rest to follow later

    balanced_X = pd.DataFrame(data=resampled_X, columns=train_X.columns)
    balanced_y = pd.DataFrame(data=resampled_y, columns=[train_y.name])

    # Report the class counts before and after resampling.
    print("Original data target distributions:")
    print(train_y.value_counts())
    print("\n")
    print("Oversampled data target distributions:")
    print(balanced_y.value_counts())

    return balanced_X, balanced_y

In [None]:
df_os_X_train_overdraft, df_os_y_train_overdraft = oversample_with_SMOTE(train_X=X_train, train_y=y_train_overdraft)

In [None]:
df_os_y_train_overdraft.values.flatten()

# Feature selection

In [None]:
def feature_sel():
    """Placeholder for the feature-selection step; not implemented yet."""
    return None

# Modeling 

In [None]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
# Baseline classifier: logistic regression with a fixed seed, up to 1000
# solver iterations and no class re-weighting (class_weight=None).
lg1 = LogisticRegression(random_state=42, max_iter=1000, class_weight=None)


def model_predict(model, train_X, train_y, test_X, test_y):
    """Fit ``model`` on the training data and report its test performance.

    Prints accuracy, confusion matrix, ROC AUC and recall for the test set.
    The metrics are used with their binary defaults, so a binary target is
    assumed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for the ROC AUC when available.
    train_X, test_X : pandas.DataFrame or array-like
        Predictors of the training / test partition.
    train_y : pandas.Series, single-column DataFrame or array-like
        Training target. It is flattened to 1-D before fitting.
    test_y : array-like
        True labels of the test partition.

    Returns
    -------
    numpy.ndarray
        Predicted class labels for ``test_X``.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score

    # Estimators expect a 1-D target; train_y may arrive as a one-column frame.
    train_y_values = np.asarray(train_y).ravel()

    # fit it
    model.fit(train_X, train_y_values)

    # test
    pred_y = model.predict(test_X)

    # ROC AUC is a ranking metric and needs continuous scores; computing it
    # from hard 0/1 predictions collapses the curve to a single threshold.
    # Fall back to the hard predictions only if the model offers no scores.
    if hasattr(model, "predict_proba"):
        scores_y = model.predict_proba(test_X)[:, 1]
    elif hasattr(model, "decision_function"):
        scores_y = model.decision_function(test_X)
    else:
        scores_y = pred_y

    # performance
    print(f'Accuracy Score: {accuracy_score(test_y, pred_y)}')
    print(f'Confusion Matrix: \n{confusion_matrix(test_y, pred_y)}')
    print(f'Area Under Curve: {roc_auc_score(test_y, scores_y)}')
    print(f'Recall score: {recall_score(test_y, pred_y)}')

    return pred_y

In [None]:
pred_y_overdraft = model_predict(model=lg1,
                                train_X=df_os_X_train_overdraft,
                                train_y=df_os_y_train_overdraft,
                                test_X=X_test,
                                test_y=y_test_overdraft)

## Analyse feature importances

# Old tests

## Apply business rules

In [None]:
# Experiment: express a business rule as a string and turn it into a boolean
# row mask for df_merged.
# test = f"[(df_merged['marit_stat'] =='Married') & (df_merged['house_memb'] >'2')]"
test = f"[(df_merged['age'] <25) & (df_merged['lv_educ'] !='Higher')]"

# NOTE(review): eval() executes arbitrary code. It is fine for this hard-coded
# literal, but must not be used on rule strings from an untrusted source.
# The string evaluates to a one-element list, hence the [0] below.
list_mask = eval(test)
df_merged[list_mask[0]]

In [None]:
df_merged.columns

In [None]:
# Rows of married clients whose 'house_memb' value compares greater than '2'.
# NOTE(review): the comparison is against the *string* '2'. If house_memb holds
# strings the ordering is lexicographic (e.g. '10' > '2' is False); if it is
# numeric, compare against the number 2 instead - TODO confirm the dtype.
df_merged.loc[(df_merged['marit_stat'] =='Married') & (df_merged['house_memb'] >'2')]