### 1. Load the data
Load the training data and display the first few rows to get an overview of the dataset

In [14]:
import pandas as pd

# Load the training data
df = pd.read_csv("data/train.csv")

# Show the first 5 rows
print(df.head())

   id  CustomerId         Surname  CreditScore Geography Gender   Age  Tenure  \
0   0    15674932  Okwudilichukwu          668    France   Male  33.0       3   
1   1    15749177   Okwudiliolisa          627    France   Male  33.0       1   
2   2    15694510           Hsueh          678    France   Male  40.0      10   
3   3    15741417             Kao          581    France   Male  34.0       2   
4   4    15766172       Chiemenam          716     Spain   Male  33.0       5   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  \
0       0.00              2        1.0             0.0        181449.97   
1       0.00              2        1.0             1.0         49503.50   
2       0.00              2        1.0             0.0        184866.69   
3  148882.54              1        1.0             1.0         84560.88   
4       0.00              2        1.0             1.0         15068.83   

   Exited  
0       0  
1       0  
2       0  
3       0  
4 

### 2. Explore the Dataset
We'll check data types, look for missing values, and start understanding the structure of our dataset.

In [15]:
# Check data types and non-null counts
print(df.info())

# Check for missing values in each column
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB
None
id                 0
CustomerId         0
Surname            0
Credit

In [16]:
# Show unique values and how many times each appears
print(df['Geography'].value_counts())

Geography
France     94215
Spain      36213
Germany    34606
Name: count, dtype: int64


#### 3. Clean the Data
We’ll drop irrelevant columns, and convert categorical columns Gender and Geography into numeric values to make the data usable for modelling.

In [17]:
# Make a copy of the original DataFrame to keep things safe
df_clean = df.copy()

# Drop unnecessary columns
df_clean = df_clean.drop(['id', 'CustomerId', 'Surname'], axis=1)

# Convert 'Gender' to binary: Male = 1, Female = 0
df_clean['Gender'] = df_clean['Gender'].map({'Male': 1, 'Female': 0})

# Convert 'Geography' to dummy variables (e.g. France, Germany, Spain)
df_clean = pd.get_dummies(df_clean, columns=['Geography'], drop_first=True)

# View the cleaned DataFrame
print(df_clean.head())

   CreditScore  Gender   Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          668       1  33.0       3       0.00              2        1.0   
1          627       1  33.0       1       0.00              2        1.0   
2          678       1  40.0      10       0.00              2        1.0   
3          581       1  34.0       2  148882.54              1        1.0   
4          716       1  33.0       5       0.00              2        1.0   

   IsActiveMember  EstimatedSalary  Exited  Geography_Germany  Geography_Spain  
0             0.0        181449.97       0              False            False  
1             1.0         49503.50       0              False            False  
2             0.0        184866.69       0              False            False  
3             1.0         84560.88       0              False            False  
4             1.0         15068.83       0              False             True  


#### 4. Feature Engineering
To help the model, we create new features based on existing ones, like balance-to-salary ratio and a flag for having any balance.

In [18]:
df_clean['BalanceSalaryRatio'] = df_clean['Balance'] / (df_clean['EstimatedSalary'] + 1)

In [19]:
df_clean['HasBalance'] = (df_clean['Balance'] > 0).astype(int)

In [20]:
print(df_clean[['Balance', 'EstimatedSalary', 'BalanceSalaryRatio', 'HasBalance']].head())
print(df_clean.isnull().sum())

     Balance  EstimatedSalary  BalanceSalaryRatio  HasBalance
0       0.00        181449.97            0.000000           0
1       0.00         49503.50            0.000000           0
2       0.00        184866.69            0.000000           0
3  148882.54         84560.88            1.760634           1
4       0.00         15068.83            0.000000           0
CreditScore           0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Geography_Germany     0
Geography_Spain       0
BalanceSalaryRatio    0
HasBalance            0
dtype: int64
