In [4]:
# Importing Libraries
import pyodbc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Database Connection
conn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server};'
    'SERVER=MOMIN;'
    'DATABASE=banking_project;'
    'Trusted_Connection=yes;'
)

In [6]:
# Loading Data
query = "SELECT * FROM customer"
df = pd.read_sql(query, conn)

  df = pd.read_sql(query, conn)


In [4]:
# Shape
print("Shape (Rows, Columns):", df.shape)

Shape (Rows, Columns): (3000, 25)


In [5]:
# Info
print("\n--- Data Info ---")
df.info()


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Client_ID                 3000 non-null   object 
 1   Name                      3000 non-null   object 
 2   Age                       3000 non-null   int64  
 3   Location_ID               3000 non-null   int64  
 4   Joined_Bank               3000 non-null   object 
 5   Banking_Contact           3000 non-null   object 
 6   Nationality               3000 non-null   object 
 7   Occupation                3000 non-null   object 
 8   Fee_Structure             3000 non-null   object 
 9   Loyalty_Classification    3000 non-null   object 
 10  Estimated_Income          3000 non-null   float64
 11  Superannuation_Savings    3000 non-null   float64
 12  Amount_of_Credit_Cards    3000 non-null   int64  
 13  Credit_Card_Balance       3000 non-null   fl

In [6]:
# Preview
print("\n--- First 5 Rows ---")
display(df.head())


--- First 5 Rows ---


Unnamed: 0,Client_ID,Name,Age,Location_ID,Joined_Bank,Banking_Contact,Nationality,Occupation,Fee_Structure,Loyalty_Classification,...,Bank_Deposits,Checking_Accounts,Saving_Accounts,Foreign_Currency_Account,Business_Lending,Properties_Owned,Risk_Weighting,BRId,GenderId,IAId
0,IND81288,Raymond Mills,24,34324,2019-06-05,Anthony Torres,American,Safety Technician IV,High,Jade,...,1485829.0,603617.9,607332.4375,12249.959961,1134475.0,1,2,1,1,1
1,IND65833,Julia Spencer,23,42205,2001-10-12,Jonathan Hawkins,African,Software Consultant,High,Jade,...,641482.8,229521.4,344635.15625,61162.308594,2000526.0,1,3,2,1,2
2,IND47499,Stephen Murray,27,7314,2010-01-25,Anthony Berry,European,Help Desk Operator,High,Gold,...,1033402.0,652674.7,203054.34375,79071.78125,548137.6,1,3,3,2,3
3,IND72498,Virginia Garza,40,34594,2019-03-28,Steve Diaz,American,Geologist II,Mid,Silver,...,1048158.0,1048158.0,234685.015625,57513.648438,1148402.0,0,4,4,1,4
4,IND60181,Melissa Sanders,46,41269,2012-07-20,Shawn Long,American,Assistant Professor,Mid,Platinum,...,487782.5,446644.2,128351.453125,30012.140625,1674412.0,0,3,1,2,5


In [14]:
# Loading lookup tables
gender_df = pd.read_sql("SELECT * FROM gender", conn)
br_df = pd.read_sql("SELECT * FROM banking_relationship", conn)
ia_df = pd.read_sql("SELECT * FROM investment_advisor", conn)

# Joining Gender
df = df.merge(gender_df, on='GenderId', how='left')

# Joining Banking Relationship
df = df.merge(br_df, on='BRId', how='left')

# Joining Investment Advisor
df = df.merge(ia_df, on='IAId', how='left')

# Preview the joined DataFrame
display(df.head())


  gender_df = pd.read_sql("SELECT * FROM gender", conn)
  br_df = pd.read_sql("SELECT * FROM banking_relationship", conn)
  ia_df = pd.read_sql("SELECT * FROM investment_advisor", conn)


Unnamed: 0,Client_ID,Name,Age,Location_ID,Joined_Bank,Banking_Contact,Nationality,Occupation,Fee_Structure,Loyalty_Classification,...,Foreign_Currency_Account,Business_Lending,Properties_Owned,Risk_Weighting,BRId,GenderId,IAId,Gender,Banking_Relationship,Investment_Advisor
0,IND81288,Raymond Mills,24,34324,2019-06-05,Anthony Torres,American,Safety Technician IV,High,Jade,...,12249.959961,1134475.0,1,2,1,1,1,Male,Retail,Victor Dean
1,IND65833,Julia Spencer,23,42205,2001-10-12,Jonathan Hawkins,African,Software Consultant,High,Jade,...,61162.308594,2000526.0,1,3,2,1,2,Male,Institutional,Jeremy Porter
2,IND47499,Stephen Murray,27,7314,2010-01-25,Anthony Berry,European,Help Desk Operator,High,Gold,...,79071.78125,548137.6,1,3,3,2,3,Female,Private Bank,Ernest Knight
3,IND72498,Virginia Garza,40,34594,2019-03-28,Steve Diaz,American,Geologist II,Mid,Silver,...,57513.648438,1148402.0,0,4,4,1,4,Male,Commercial,Eric Shaw
4,IND60181,Melissa Sanders,46,41269,2012-07-20,Shawn Long,American,Assistant Professor,Mid,Platinum,...,30012.140625,1674412.0,0,3,1,2,5,Female,Retail,Kevin Kim


In [7]:
# Missing Values
print("\n--- Missing Values Per Column ---")
print(df.isnull().sum())


--- Missing Values Per Column ---
Client_ID                   0
Name                        0
Age                         0
Location_ID                 0
Joined_Bank                 0
Banking_Contact             0
Nationality                 0
Occupation                  0
Fee_Structure               0
Loyalty_Classification      0
Estimated_Income            0
Superannuation_Savings      0
Amount_of_Credit_Cards      0
Credit_Card_Balance         0
Bank_Loans                  0
Bank_Deposits               0
Checking_Accounts           0
Saving_Accounts             0
Foreign_Currency_Account    0
Business_Lending            0
Properties_Owned            0
Risk_Weighting              0
BRId                        0
GenderId                    0
IAId                        0
dtype: int64


In [8]:
# Duplicates
num_duplicates = df.duplicated().sum()
print(f"\n--- Number of Duplicate Rows: {num_duplicates} ---")
if num_duplicates > 0:
    display(df[df.duplicated()])


--- Number of Duplicate Rows: 0 ---


In [7]:
# Descriptive Stats
print("\n--- Descriptive Statistics (Numeric Columns) ---")
display(df.describe().T)


--- Descriptive Statistics (Numeric Columns) ---


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,3000.0,51.039667,19.85476,17.0,34.0,51.0,69.0,85.0
Location_ID,3000.0,21563.323,12462.273017,12.0,10803.5,21129.5,32054.5,43369.0
Estimated_Income,3000.0,171305.034184,111935.80818,15919.480469,82906.597656,142313.476562,242290.3,522330.2
Superannuation_Savings,3000.0,25531.599685,16259.950768,1482.030029,12513.774902,22357.355469,35464.74,75963.9
Amount_of_Credit_Cards,3000.0,1.463667,0.676387,1.0,1.0,1.0,2.0,3.0
Credit_Card_Balance,3000.0,3176.206944,2497.094709,1.17,1236.630005,2560.804932,4522.633,13991.99
Bank_Loans,3000.0,591386.156003,457557.036884,0.0,239628.136719,479793.40625,825813.0,2667557.0
Bank_Deposits,3000.0,671560.193962,645716.857281,0.0,204400.375,463316.46875,942754.6,3890598.0
Checking_Accounts,3000.0,321092.948707,282079.552753,0.0,119947.53125,242815.65625,434874.9,1969923.0
Saving_Accounts,3000.0,232908.353518,230007.777585,0.0,74794.404297,164086.554688,315575.0,1724118.0


In [12]:
# Unique Values
print("\n--- Unique Value Counts Per Column ---")
print(df.nunique())


--- Unique Value Counts Per Column ---
Client_ID                   2940
Name                        2913
Age                           69
Location_ID                 2890
Joined_Bank                 2579
Banking_Contact               49
Nationality                    5
Occupation                   195
Fee_Structure                  3
Loyalty_Classification         4
Estimated_Income            3000
Superannuation_Savings      2997
Amount_of_Credit_Cards         3
Credit_Card_Balance         2996
Bank_Loans                  2973
Bank_Deposits               2966
Checking_Accounts           2967
Saving_Accounts             2967
Foreign_Currency_Account    3000
Business_Lending            2987
Properties_Owned               4
Risk_Weighting                 5
BRId                           4
GenderId                       2
IAId                          22
dtype: int64
