# Load Conf and Credentials

## Load Directory Locations

In [2]:
import json
import os

# Load the directory-locations config (a JSON object mapping names to paths)
# into the `locations_data` dict that the later cells index into.
# NOTE(review): this absolute path is machine-specific; consider reading it
# from an environment variable or a path relative to the notebook.
file_path = r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json'
if not os.path.exists(file_path):
    # Fail fast: without the config `locations_data` is never defined, and the
    # following cells would only fail later with an unrelated NameError.
    raise FileNotFoundError(f"File not found: {file_path}")

# JSON text is UTF-8, so do not rely on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    locations_data = json.load(f)
print(locations_data)

{'Common_Funcs_Dir': '/Users/mike/Develop/Projects/Code Notebook/Common/Functions', 'Credentials_Dir': '/Users/mike/Develop/Projects/Code Notebook/Credentials', 'Rel_Pickes_Dir': '../.pickles', 'Pub_Data_Dir': "'/Users/mike/Data/Public", 'BQ_Service_Key': '/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json'}


### Get the Common Funcs Dir into the Sys Path
This appears to be required because the functions are .py files rather than .ipynb files.

In [3]:
import sys

# Make the shared-functions directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is run more than once.
common_funcs_dir = locations_data['Common_Funcs_Dir']
if common_funcs_dir not in sys.path:
    sys.path.append(common_funcs_dir)

# NOTE(review): the wildcard import hides which helpers this notebook actually
# uses; prefer importing the needed names explicitly once they are known.
from func_Load_Data_to_Frame import *

# Load The Source Data to a DF

In [4]:
import pandas as pd

# Read the banking workbook from the public data directory into `df`.
source_file = 'Banking_Data.xlsx'
# The configured directory value can carry a stray quote character, so strip
# it before building the path.
pub_data_dir = locations_data['Pub_Data_Dir'].strip("'")
source_path = os.path.join(pub_data_dir, source_file)
df = pd.read_excel(source_path)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 23 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   CLIENTNUM                                                                                                                           10127 non-null  int64  
 1   Attrition_Flag                                                                                                                      10127 non-null  object 
 2   Age                                                                                                                                 10127 non-null  int64  
 3   Gender                                                                           

# Create Dimensions for Categories

### Age Dim

In [5]:
# Age band edges and their labels. Intervals are right-closed, so 17 falls in
# 'Age: 0-17' and 18 in 'Age: 18-24'. The final edge is infinite so that the
# open-ended 'Age: 75+' label really has no upper limit.
age_bins = [0, 17, 24, 34, 44, 54, 64, 74, float('inf')]
age_labels = ['Age: 0-17', 'Age: 18-24', 'Age: 25-34', 'Age: 35-44', 'Age: 45-54', 'Age: 55-64', 'Age: 65-74', 'Age: 75+']

# One row per distinct age found in the source data.
dim_age = pd.DataFrame({'Age': df['Age'].unique()})
# include_lowest=True puts an age of exactly 0 into the first band instead of
# leaving it unassigned (NaN).
dim_age['Age_Group'] = pd.cut(
    dim_age['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True
)

print(dim_age)

    Age   Age_Group
0    45  Age: 45-54
1    49  Age: 45-54
2    51  Age: 45-54
3    40  Age: 35-44
4    44  Age: 35-44
5    32  Age: 25-34
6    37  Age: 35-44
7    48  Age: 45-54
8    42  Age: 35-44
9    65  Age: 65-74
10   56  Age: 55-64
11   35  Age: 35-44
12   57  Age: 55-64
13   41  Age: 35-44
14   61  Age: 55-64
15   47  Age: 45-54
16   62  Age: 55-64
17   54  Age: 45-54
18   59  Age: 55-64
19   63  Age: 55-64
20   53  Age: 45-54
21   58  Age: 55-64
22   55  Age: 55-64
23   66  Age: 65-74
24   50  Age: 45-54
25   38  Age: 35-44
26   46  Age: 45-54
27   52  Age: 45-54
28   39  Age: 35-44
29   43  Age: 35-44
30   64  Age: 55-64
31   68  Age: 65-74
32   67  Age: 65-74
33   60  Age: 55-64
34   73  Age: 65-74
35   70  Age: 65-74
36   36  Age: 35-44
37   34  Age: 25-34
38   33  Age: 25-34
39   26  Age: 25-34
40   31  Age: 25-34
41   29  Age: 25-34
42   30  Age: 25-34
43   28  Age: 25-34
44   27  Age: 25-34


### Dim Inactivity Months

In [6]:
# Band edges for months of inactivity. Bins are right-closed and the first edge
# is -1, so a value of 0 lands in the first band.
months_inactive_bins = [-1, 1, 3, 6, float('inf')]
months_inactive_labels = [
    'Months Inactive: 0-1',
    'Months Inactive: 2-3',
    'Months Inactive: 4-6',
    'Months Inactive: >6',
]

# One row per distinct inactivity value, tagged with the band it falls into.
dim_inactive = pd.DataFrame(
    {'Months_Inactive': df['Months_Inactive_12_mon'].unique()}
).assign(
    Months_Inactive_Group=lambda frame: pd.cut(
        frame['Months_Inactive'],
        months_inactive_bins,
        right=True,
        labels=months_inactive_labels,
    )
)

print(dim_inactive)

   Months_Inactive Months_Inactive_Group
0                1  Months Inactive: 0-1
1                4  Months Inactive: 4-6
2                2  Months Inactive: 2-3
3                3  Months Inactive: 2-3
4                6  Months Inactive: 4-6
5                0  Months Inactive: 0-1
6                5  Months Inactive: 4-6


### Dim Contacts

In [7]:
# Band edges for the 12-month contact count. Bins are right-closed and start
# at -1 so that a count of 0 is captured by the first band.
contacts_count_bins = [-1, 1, 3, 5, float('inf')]
contacts_count_labels = [
    'Total 12 Mon Contacts: 0-1',
    'Total 12 Mon Contacts: 2-3',
    'Total 12 Mon Contacts: 4-5',
    'Total 12 Mon Contacts: >5',
]

# Distinct contact counts from the source data, each mapped to its band.
dim_contact = pd.DataFrame({'Contacts_Count': df['Contacts_Count_12_mon'].unique()})
contacts_count_groups = pd.cut(
    dim_contact['Contacts_Count'],
    contacts_count_bins,
    right=True,
    labels=contacts_count_labels,
)
dim_contact = dim_contact.assign(Contacts_Count_Group=contacts_count_groups)
del contacts_count_groups  # temporary only; keep the notebook namespace unchanged

print(dim_contact)

   Contacts_Count        Contacts_Count_Group
0               3  Total 12 Mon Contacts: 2-3
1               2  Total 12 Mon Contacts: 2-3
2               0  Total 12 Mon Contacts: 0-1
3               1  Total 12 Mon Contacts: 0-1
4               4  Total 12 Mon Contacts: 4-5
5               5  Total 12 Mon Contacts: 4-5
6               6   Total 12 Mon Contacts: >5


### Dim Credit Limits

In [8]:
# Credit-limit band edges and labels. Intervals are right-closed, so a limit of
# exactly 5000 belongs to 'Credit Limit: $0-$5K'.
credit_limit_bins = [0, 5000, 10000, 20000, 50000, float('inf')]
credit_limit_labels = ['Credit Limit: $0-$5K', 'Credit Limit: $5K-$10K', 'Credit Limit: $10K-$20K', 'Credit Limit: $20K-$50K', 'Credit Limit: >$50K']

# One row per distinct credit limit found in the source data.
dim_credit_limits = pd.DataFrame({'Credit_Limit': df['Credit_Limit'].unique()})
# include_lowest=True makes the first band cover a limit of exactly 0, as its
# '$0-$5K' label promises; without it such a value would be left as NaN.
dim_credit_limits['Credit_Limit_Group'] = pd.cut(
    dim_credit_limits['Credit_Limit'],
    bins=credit_limit_bins,
    labels=credit_limit_labels,
    right=True,
    include_lowest=True
)

print(dim_credit_limits)

      Credit_Limit       Credit_Limit_Group
0          12691.0  Credit Limit: $10K-$20K
1           8256.0   Credit Limit: $5K-$10K
2           3418.0     Credit Limit: $0-$5K
3           3313.0     Credit Limit: $0-$5K
4           4716.0     Credit Limit: $0-$5K
...            ...                      ...
6200        3688.0     Credit Limit: $0-$5K
6201        4003.0     Credit Limit: $0-$5K
6202        5409.0   Credit Limit: $5K-$10K
6203        5281.0   Credit Limit: $5K-$10K
6204       10388.0  Credit Limit: $10K-$20K

[6205 rows x 2 columns]


### Dim Revolving Bals

In [9]:
# Revolving-balance band edges and labels. Intervals are right-closed, so a
# balance of exactly 1000 belongs to 'Total Revolving Bal: $0-$1K'.
revolving_bal_bins = [0, 1000, 5000, 10000, 20000, float('inf')]
# Plain list literal: the labels need no wrapper function.
revolving_bal_labels = [
    'Total Revolving Bal: $0-$1K',
    'Total Revolving Bal: $1K-$5K',
    'Total Revolving Bal: $5K-$10K',
    'Total Revolving Bal: $10K-$20K',
    'Total Revolving Bal: >$20K',
]

# One row per distinct revolving balance found in the source data.
dim_revolving_bal = pd.DataFrame({'Total_Revolving_Bal': df['Total_Revolving_Bal'].unique()})
# include_lowest=True keeps a balance of exactly 0 in the first band.
dim_revolving_bal['Revolving_Bal_Group'] = pd.cut(
    dim_revolving_bal['Total_Revolving_Bal'],
    bins=revolving_bal_bins,
    labels=revolving_bal_labels,
    right=True,
    include_lowest=True
)

print(dim_revolving_bal)

      Total_Revolving_Bal           Revolving_Bal_Group
0                     777   Total Revolving Bal: $0-$1K
1                     864   Total Revolving Bal: $0-$1K
2                       0   Total Revolving Bal: $0-$1K
3                    2517  Total Revolving Bal: $1K-$5K
4                    1247  Total Revolving Bal: $1K-$5K
...                   ...                           ...
1969                 1768  Total Revolving Bal: $1K-$5K
1970                  779   Total Revolving Bal: $0-$1K
1971                  534   Total Revolving Bal: $0-$1K
1972                  476   Total Revolving Bal: $0-$1K
1973                 2241  Total Revolving Bal: $1K-$5K

[1974 rows x 2 columns]


### Dim Transaction Amount

In [10]:
# Transaction-amount band edges and labels. Intervals are right-closed, so an
# amount of exactly 2500 belongs to 'Total Trans Amt: 0-2.5K'.
total_trans_amt_bins = [0, 2500, 5000, 10000, 20000, float('inf')]
total_trans_amt_labels = ['Total Trans Amt: 0-2.5K', 'Total Trans Amt: 2.5K-5K', 'Total Trans Amt: 5K-10K', 'Total Trans Amt: 10K-20K', 'Total Trans Amt: >20K']

# One row per distinct total transaction amount found in the source data.
dim_trans_amt = pd.DataFrame({'Total_Trans_Amt': df['Total_Trans_Amt'].unique()})
# include_lowest=True makes the first band cover an amount of exactly 0, as its
# '0-2.5K' label states; without it such a value would be left as NaN.
dim_trans_amt['Total_Trans_Amt_Group'] = pd.cut(
    dim_trans_amt['Total_Trans_Amt'],
    bins=total_trans_amt_bins,
    labels=total_trans_amt_labels,
    right=True,
    include_lowest=True
)

print(dim_trans_amt)

      Total_Trans_Amt     Total_Trans_Amt_Group
0                1144   Total Trans Amt: 0-2.5K
1                1291   Total Trans Amt: 0-2.5K
2                1887   Total Trans Amt: 0-2.5K
3                1171   Total Trans Amt: 0-2.5K
4                 816   Total Trans Amt: 0-2.5K
...               ...                       ...
5028            15476  Total Trans Amt: 10K-20K
5029             8764   Total Trans Amt: 5K-10K
5030            10291  Total Trans Amt: 10K-20K
5031             8395   Total Trans Amt: 5K-10K
5032            10294  Total Trans Amt: 10K-20K

[5033 rows x 2 columns]


### Dim Transaction Counts

In [11]:
# Transaction-count band edges and labels. Intervals are right-closed, so a
# count of exactly 20 belongs to 'Total Transactions: 0-20'.
total_trans_ct_bins = [0, 20, 40, 60, 80, float('inf')]
total_trans_ct_labels = [
    'Total Transactions: 0-20',
    'Total Transactions: 21-40',
    'Total Transactions: 41-60',
    'Total Transactions: 61-80',
    'Total Transactions: >80'
]

# One row per distinct transaction count found in the source data.
dim_trans_cnt = pd.DataFrame({'Total_Trans_Ct': df['Total_Trans_Ct'].unique()})
# include_lowest=True makes the first band cover a count of exactly 0, as its
# '0-20' label states; without it such a value would be left as NaN.
dim_trans_cnt['Total_Trans_Ct_Group'] = pd.cut(
    dim_trans_cnt['Total_Trans_Ct'],
    bins=total_trans_ct_bins,
    labels=total_trans_ct_labels,
    right=True,
    include_lowest=True
)

print(dim_trans_cnt)

     Total_Trans_Ct       Total_Trans_Ct_Group
0                42  Total Transactions: 41-60
1                33  Total Transactions: 21-40
2                20   Total Transactions: 0-20
3                28  Total Transactions: 21-40
4                24  Total Transactions: 21-40
..              ...                        ...
121             139    Total Transactions: >80
122             123    Total Transactions: >80
123             130    Total Transactions: >80
124             138    Total Transactions: >80
125             132    Total Transactions: >80

[126 rows x 2 columns]


### Dim Utilization Ratios

In [12]:
# Utilization-ratio band edges (fractions of 1) and their percentage labels.
utilization_ratio_bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
utilization_ratio_labels = [
    'Avg Util Ratio: 0-20%',
    'Avg Util Ratio: 21-40%',
    'Avg Util Ratio: 41-60%',
    'Avg Util Ratio: 61-80%',
    'Avg Util Ratio: 81-100%',
]

# Distinct utilization ratios from the source data, each tagged with its band.
# Bins are right-closed; include_lowest keeps a ratio of exactly 0 in the
# first band.
dim_utilization = pd.DataFrame(
    {'Avg_Utilization_Ratio': df['Avg_Utilization_Ratio'].unique()}
).assign(
    Utilization_Ratio_Group=lambda frame: pd.cut(
        frame['Avg_Utilization_Ratio'],
        utilization_ratio_bins,
        right=True,
        labels=utilization_ratio_labels,
        include_lowest=True,
    )
)

print(dim_utilization)

     Avg_Utilization_Ratio  Utilization_Ratio_Group
0                    0.061    Avg Util Ratio: 0-20%
1                    0.105    Avg Util Ratio: 0-20%
2                    0.000    Avg Util Ratio: 0-20%
3                    0.760   Avg Util Ratio: 61-80%
4                    0.311   Avg Util Ratio: 21-40%
..                     ...                      ...
959                  0.909  Avg Util Ratio: 81-100%
960                  0.005    Avg Util Ratio: 0-20%
961                  0.007    Avg Util Ratio: 0-20%
962                  0.014    Avg Util Ratio: 0-20%
963                  0.009    Avg Util Ratio: 0-20%

[964 rows x 2 columns]


### Dim Naive Bayes Probabilities

In [13]:
# Quartile-style band edges and labels for the Naive Bayes column.
naive_bayes_bins = [0, 0.25, 0.5, 0.75, 1.0]
naive_bayes_labels = [
    'Naive Bayes: Low Probability',
    'Naive Bayes: Medium Probability',
    'Naive Bayes: High Probability',
    'Naive Bayes: Very High Probability',
]

# Distinct values of the long-named source column, exposed here under the
# shorter 'Naive_Bayes' name.
dim_naive_bayes = pd.DataFrame({
    'Naive_Bayes': df[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1'
    ].unique()
})

# Bins are right-closed; include_lowest keeps a value of exactly 0 in the
# first band.
dim_naive_bayes = dim_naive_bayes.assign(
    Naive_Bayes_Group=lambda frame: pd.cut(
        frame['Naive_Bayes'],
        naive_bayes_bins,
        right=True,
        labels=naive_bayes_labels,
        include_lowest=True,
    )
)

print(dim_naive_bayes)

      Naive_Bayes                   Naive_Bayes_Group
0        0.000093        Naive Bayes: Low Probability
1        0.000057        Naive Bayes: Low Probability
2        0.000021        Naive Bayes: Low Probability
3        0.000134        Naive Bayes: Low Probability
4        0.000022        Naive Bayes: Low Probability
...           ...                                 ...
1586     0.000634        Naive Bayes: Low Probability
1587     0.000868        Naive Bayes: Low Probability
1588     0.000896        Naive Bayes: Low Probability
1589     0.994380  Naive Bayes: Very High Probability
1590     0.996620  Naive Bayes: Very High Probability

[1591 rows x 2 columns]


# Fact Table

In [16]:
# Build the fact table in one chain: pick the source columns, append a boolean
# 'is_Attrited' flag, then give the long-named Naive Bayes column a short name.
fact_table = (
    df[[
        'Attrition_Flag', 'Age', 'Gender', 'Dependent_count', 'Education_Level',
        'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book',
        'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
        'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
        'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1'
    ]]
    # True where the customer is flagged as attrited; added as the last column.
    .assign(is_Attrited=lambda frame: frame['Attrition_Flag'] == 'Attrited Customer')
    # Renaming keeps the column in its original position.
    .rename(columns={
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1': 'Naive_Bayes'
    })
)

# Show the resulting fact table
print(fact_table)

          Attrition_Flag  Age Gender  Dependent_count Education_Level  \
0      Existing Customer   45      M                3     High School   
1      Existing Customer   49      F                5        Graduate   
2      Existing Customer   51      M                3        Graduate   
3      Existing Customer   40      F                4     High School   
4      Existing Customer   40      M                3      Uneducated   
...                  ...  ...    ...              ...             ...   
10122  Existing Customer   50      M                2        Graduate   
10123  Attrited Customer   41      M                2         Unknown   
10124  Attrited Customer   44      F                1     High School   
10125  Attrited Customer   30      M                2        Graduate   
10126  Attrited Customer   43      F                2        Graduate   

      Marital_Status Income_Category Card_Category  Months_on_book  \
0            Married     $60K - $80K          Blue   

# Pickle & Zip each Dimension and Fact Table
This is for downstream processing

In [17]:
import pickle
import zipfile
import os

# Directory that receives the zipped pickles, taken from the locations config.
output_dir = locations_data['Rel_Pickes_Dir']

# Create the directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Archive member name -> DataFrame to serialize.
data_to_pickle = {
    'dim_age.pkl': dim_age,
    'dim_contact.pkl': dim_contact,
    'dim_credit_limits.pkl': dim_credit_limits,
    'dim_inactive.pkl': dim_inactive,
    'dim_naive_bayes.pkl': dim_naive_bayes,
    'dim_revolving_bal.pkl': dim_revolving_bal,
    'dim_trans_amt.pkl': dim_trans_amt,
    'dim_trans_cnt.pkl': dim_trans_cnt,
    'dim_utilization.pkl': dim_utilization,
    'fact_table.pkl': fact_table
}

# Write each table as '<name>.pkl.zip' holding a single '<name>.pkl' member.
# The pickle bytes go straight into the archive, so no uncompressed .pkl file
# is created on disk (and none can be left behind if zipping fails).
# NOTE(review): pickle files must only be loaded back from trusted locations.
for file_name, data in data_to_pickle.items():
    zip_path = os.path.join(output_dir, f"{file_name}.zip")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr(file_name, pickle.dumps(data))

print(f"Pickled and zipped files are saved to {output_dir}")

Pickled and zipped files are saved to ../.pickles
