# Data Description

SHORT DESCRIPTION ABOUT CHURN MODELLING DATASET


* CustomerID: A unique identifier for each customer.

* Surname: The customer's last name.

* CreditScore: A numeric score representing the creditworthiness of the customer.

* Geography: The country or region where the customer resides.

* Gender: The gender of the customer (e.g., Male, Female).

* Age: The age of the customer.

* Tenure: The number of years the customer has been with the company.

* Balance : The amount of money the customer currently has in their account(s).

* NumOfProducts : The number of products the customer has purchased or is subscribed to.

* HasCrCard: A binary indicator (Yes/No or 1/0) indicating whether the customer has a credit card.

* IsActiveMember: A binary indicator showing if the customer is an active member (Yes/No or 1/0).

* EstimatedSalary : The estimated annual salary of the customer.

* Exited : The target variable, indicating whether the customer has left the company (Yes/No or 1/0).


# Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('Churn_Modelling.csv')

In [3]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# Basic Info

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [6]:
df.shape

(10000, 14)

In [7]:
df.size

140000

In [8]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [10]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [11]:
df.describe(include='object')

Unnamed: 0,Surname,Geography,Gender
count,10000,10000,10000
unique,2932,3,2
top,Smith,France,Male
freq,32,5014,5457


In [12]:
df.describe(include='all')

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000,10000.0,10000,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
unique,,,2932,,3,2,,,,,,,,
top,,,Smith,,France,Male,,,,,,,,
freq,,,32,,5014,5457,,,,,,,,
mean,5000.5,15690940.0,,650.5288,,,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,,96.653299,,,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,,350.0,,,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,,584.0,,,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,,652.0,,,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,,718.0,,,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0


In [13]:
df.nunique()

Unnamed: 0,0
RowNumber,10000
CustomerId,10000
Surname,2932
CreditScore,460
Geography,3
Gender,2
Age,70
Tenure,11
Balance,6382
NumOfProducts,4


In [14]:
df.ndim

2

In [15]:
df.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,count
RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Unnamed: 14_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1,1
6671,15667932,Bellucci,758,Spain,Female,43,10,0.00,2,1,1,55313.44,0,1
6664,15766185,She,850,Germany,Male,31,4,146587.30,1,1,1,89874.82,0,1
6665,15667632,Birdseye,703,France,Female,42,7,0.00,2,0,1,72500.68,0,1
6666,15599024,Hope,506,Spain,Male,32,8,0.00,2,0,1,182692.80,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3334,15599078,Yang,619,Germany,Female,41,5,92467.58,1,1,0,38270.47,0,1
3335,15702300,Walker,671,France,Male,27,5,0.00,2,0,0,120893.07,0,1
3336,15660735,T'ang,581,Spain,Female,31,6,0.00,2,1,0,188377.21,0,1
3337,15671390,Chukwukere,690,Spain,Male,36,10,0.00,2,1,0,55902.93,0,1


## Checking Null Values

In [16]:
df.isnull().sum()

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


In [17]:
df.isna().sum()

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


* Cleaned data





In [18]:
df.copy()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


## Checking datatypes of columns

In [19]:
df.dtypes

Unnamed: 0,0
RowNumber,int64
CustomerId,int64
Surname,object
CreditScore,int64
Geography,object
Gender,object
Age,int64
Tenure,int64
Balance,float64
NumOfProducts,int64


## Display object datatypes

In [20]:
df.select_dtypes('object')

Unnamed: 0,Surname,Geography,Gender
0,Hargrave,France,Female
1,Hill,Spain,Female
2,Onio,France,Female
3,Boni,France,Female
4,Mitchell,Spain,Female
...,...,...,...
9995,Obijiaku,France,Male
9996,Johnstone,France,Male
9997,Liu,France,Female
9998,Sabbatini,Germany,Male


## Display Numerical Features

In [21]:
df.select_dtypes(['int64','float64'])

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,619,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,502,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,699,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,850,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,771,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,516,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,709,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,772,42,3,75075.31,2,1,0,92888.52,1


## Checking Duplicates

In [22]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
9995,False
9996,False
9997,False
9998,False


In [23]:
df.duplicated().sum()

0

* No duplicate values

# Convert Data Types

In [24]:
df.dtypes

Unnamed: 0,0
RowNumber,int64
CustomerId,int64
Surname,object
CreditScore,int64
Geography,object
Gender,object
Age,int64
Tenure,int64
Balance,float64
NumOfProducts,int64


In [25]:
df['Balance']=df['Balance'].astype(int)
df.dtypes

Unnamed: 0,0
RowNumber,int64
CustomerId,int64
Surname,object
CreditScore,int64
Geography,object
Gender,object
Age,int64
Tenure,int64
Balance,int64
NumOfProducts,int64


# Feature Engineering

In [26]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [27]:
df[['Balance','Tenure']]

Unnamed: 0,Balance,Tenure
0,0,2
1,83807,1
2,159660,8
3,0,1
4,125510,2
...,...,...
9995,0,5
9996,57369,10
9997,0,7
9998,75075,3


## What is the average balance per year of tenure for each customer?

In [28]:
df['Balance per year of tenure']=df['Balance']/df['Tenure']

In [29]:
df

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,1,1,101348.88,1,0.0
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,0,1,112542.58,0,83807.0
2,3,15619304,Onio,502,France,Female,42,8,159660,3,1,0,113931.57,1,19957.5
3,4,15701354,Boni,699,France,Female,39,1,0,2,0,0,93826.63,0,0.0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,1,1,79084.10,0,62755.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0,2,1,0,96270.64,0,0.0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369,1,1,1,101699.77,0,5736.9
9997,9998,15584532,Liu,709,France,Female,36,7,0,1,0,1,42085.58,1,0.0
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075,2,1,0,92888.52,1,25025.0


In [30]:
df[['Tenure','EstimatedSalary']]

Unnamed: 0,Tenure,EstimatedSalary
0,2,101348.88
1,1,112542.58
2,8,113931.57
3,1,93826.63
4,2,79084.10
...,...,...
9995,5,96270.64
9996,10,101699.77
9997,7,42085.58
9998,3,92888.52


## Group customers into age categories such as Young, Middle-aged, and Senior.

In [31]:
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 60, 100], labels=['Young', 'Middle-aged', 'Senior'])

In [33]:
df.head()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,1,1,101348.88,1,0.0,Middle-aged
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,0,1,112542.58,0,83807.0,Middle-aged
2,3,15619304,Onio,502,France,Female,42,8,159660,3,1,0,113931.57,1,19957.5,Middle-aged
3,4,15701354,Boni,699,France,Female,39,1,0,2,0,0,93826.63,0,0.0,Middle-aged
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,1,1,79084.1,0,62755.0,Middle-aged


## Calculate the ratio of the customer’s balance to their estimated salary.

In [35]:
df['Balance_to_salary_ratio']=df['Balance']/df['EstimatedSalary']

In [36]:
df.head()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,1,1,101348.88,1,0.0,Middle-aged,0.0
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,0,1,112542.58,0,83807.0,Middle-aged,0.744669
2,3,15619304,Onio,502,France,Female,42,8,159660,3,1,0,113931.57,1,19957.5,Middle-aged,1.401368
3,4,15701354,Boni,699,France,Female,39,1,0,2,0,0,93826.63,0,0.0,Middle-aged,0.0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,1,1,79084.1,0,62755.0,Middle-aged,1.587045


## Create a Loyalty Score Based on Tenure and Number of Products

In [37]:
df['LoyaltyScore']=df['Tenure']*df['NumOfProducts']

In [38]:
df.head()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,1,1,101348.88,1,0.0,Middle-aged,0.0,2
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,0,1,112542.58,0,83807.0,Middle-aged,0.744669,1
2,3,15619304,Onio,502,France,Female,42,8,159660,3,1,0,113931.57,1,19957.5,Middle-aged,1.401368,24
3,4,15701354,Boni,699,France,Female,39,1,0,2,0,0,93826.63,0,0.0,Middle-aged,0.0,2
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,1,1,79084.1,0,62755.0,Middle-aged,1.587045,2


## Categorize CreditScore into risk levels such as Low, Medium, and High.

In [39]:
df['CreditRisk']=pd.cut(df['CreditScore'],bins=[300,600,800,1000],labels=['Low','Medium','High'])

In [40]:
df.head()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore,CreditRisk
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,1,1,101348.88,1,0.0,Middle-aged,0.0,2,Medium
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,0,1,112542.58,0,83807.0,Middle-aged,0.744669,1,Medium
2,3,15619304,Onio,502,France,Female,42,8,159660,3,1,0,113931.57,1,19957.5,Middle-aged,1.401368,24,Low
3,4,15701354,Boni,699,France,Female,39,1,0,2,0,0,93826.63,0,0.0,Middle-aged,0.0,2,Medium
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,1,1,79084.1,0,62755.0,Middle-aged,1.587045,2,High


## Create a new feature that combines IsActiveMember and HasCrCard.

In [42]:
df['Active_member']=df['IsActiveMember']*df['HasCrCard']

In [49]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore,CreditRisk,Active_member,DaysSinceJoining
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,...,1,101348.88,1,0.0,Middle-aged,0.0,2,Medium,1,730
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,...,1,112542.58,0,83807.0,Middle-aged,0.744669,1,Medium,0,365
2,3,15619304,Onio,502,France,Female,42,8,159660,3,...,0,113931.57,1,19957.5,Middle-aged,1.401368,24,Low,0,2920
3,4,15701354,Boni,699,France,Female,39,1,0,2,...,0,93826.63,0,0.0,Middle-aged,0.0,2,Medium,0,365
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,...,1,79084.1,0,62755.0,Middle-aged,1.587045,2,High,1,730


## Calculate the total number of days since the customer joined based on their tenure.

In [43]:
df['DaysSinceJoining'] = df['Tenure'] * 365


In [48]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore,CreditRisk,Active_member,DaysSinceJoining
0,1,15634602,Hargrave,619,France,Female,42,2,0,1,...,1,101348.88,1,0.0,Middle-aged,0.0,2,Medium,1,730
1,2,15647311,Hill,608,Spain,Female,41,1,83807,1,...,1,112542.58,0,83807.0,Middle-aged,0.744669,1,Medium,0,365
2,3,15619304,Onio,502,France,Female,42,8,159660,3,...,0,113931.57,1,19957.5,Middle-aged,1.401368,24,Low,0,2920
3,4,15701354,Boni,699,France,Female,39,1,0,2,...,0,93826.63,0,0.0,Middle-aged,0.0,2,Medium,0,365
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510,1,...,1,79084.1,0,62755.0,Middle-aged,1.587045,2,High,1,730


# Sorting

## Identify the most loyal customers by sorting the data based on the Tenure column in descending order.

In [44]:
df_sorted_by_tenure=df.sort_values(by='Tenure',ascending=False)

In [45]:
df_sorted_by_tenure

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore,CreditRisk,Active_member,DaysSinceJoining
7565,7566,15623369,Clifton,708,France,Male,52,10,105355,1,...,0,123.07,1,10535.5,Middle-aged,856.057528,10,Medium,0,3650
9302,9303,15752534,Mironov,744,France,Male,36,10,0,2,...,1,182867.84,0,0.0,Middle-aged,0.000000,20,Medium,1,3650
376,377,15583456,Gardiner,745,Germany,Male,45,10,117231,3,...,1,122381.02,1,11723.1,Middle-aged,0.957918,30,Medium,1,3650
7596,7597,15794868,Nnonso,599,Germany,Male,40,10,137456,2,...,1,14113.11,0,13745.6,Middle-aged,9.739597,20,Low,1,3650
6827,6828,15760216,Pokrovskaya,718,France,Female,49,10,0,1,...,0,184474.72,1,0.0,Middle-aged,0.000000,10,Medium,0,3650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5150,5151,15614716,Okwudilichukwu,515,France,Female,37,0,196853,1,...,1,132770.11,0,inf,Middle-aged,1.482661,0,Low,1,0
1685,1686,15713826,Ferguson,613,Germany,Female,20,0,117356,1,...,0,113557.70,1,inf,Young,1.033448,0,Medium,0,0
1682,1683,15662758,Watson,620,France,Male,41,0,97925,1,...,0,85000.32,0,inf,Middle-aged,1.152054,0,Medium,0,0
8869,8870,15733597,Y?an,669,France,Female,41,0,150219,2,...,0,107839.03,0,inf,Middle-aged,1.392993,0,Medium,0,0


## Find customers with the highest creditworthiness and account balance by sorting first by CreditScore and then by Balance.

In [46]:
df_sorted=df.sort_values(by=['CreditScore','Balance'],ascending=[False,False])

In [47]:
df_sorted.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore,CreditRisk,Active_member,DaysSinceJoining
1533,1534,15769818,Moore,850,France,Female,37,3,212778,1,...,1,69372.88,0,70926.0,Middle-aged,3.067164,3,High,0,1095
520,521,15671256,Macartney,850,France,Female,35,1,211774,1,...,0,188574.12,1,211774.0,Middle-aged,1.123028,1,High,0,365
4533,4534,15607275,Ch'ang,850,Spain,Male,39,6,206014,2,...,1,42774.84,1,34335.666667,Middle-aged,4.816242,12,High,0,2190
4167,4168,15737509,Morrison,850,Spain,Male,34,8,199229,1,...,0,68106.29,0,24903.625,Middle-aged,2.925266,8,High,0,2920
4674,4675,15689492,Benjamin,850,Germany,Male,41,1,176958,2,...,1,125806.3,0,176958.0,Middle-aged,1.406591,2,High,0,365


## Sort by Balance and HasCrCard to Identify High-Balance Customers with Credit Cards

In [50]:
sorted_by_balance_card = df.sort_values(by=['Balance', 'HasCrCard'], ascending=[False, False])


In [51]:
sorted_by_balance_card.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Balance per year of tenure,AgeGroup,Balance_to_salary_ratio,LoyaltyScore,CreditRisk,Active_member,DaysSinceJoining
2092,2093,15757408,Lo,655,Spain,Male,38,3,250898,3,...,1,81054.0,1,83632.666667,Middle-aged,3.095443,9,Medium,0,1095
3280,3281,15715622,To Rot,583,France,Female,57,3,238387,1,...,1,147964.99,1,79462.333333,Middle-aged,1.611104,3,Low,0,1095
8733,8734,15714241,Haddon,749,Spain,Male,42,9,222267,1,...,0,101108.85,1,24696.333333,Middle-aged,2.198294,9,Medium,0,3285
3588,3589,15571958,McIntosh,489,Spain,Male,40,3,221532,1,...,0,171867.08,0,73844.0,Middle-aged,1.288973,3,Low,0,1095
6717,6718,15586674,Shaw,663,Spain,Female,58,5,216109,1,...,1,74176.71,1,43221.8,Middle-aged,2.913435,5,Medium,0,1825


## value_counts

## What is the distribution of customers across different AgeGroup categories?





In [52]:
df['AgeGroup'].value_counts()

Unnamed: 0_level_0,count
AgeGroup,Unnamed: 1_level_1
Middle-aged,7568
Young,1968
Senior,464


## What is the distribution of customers across different credit risk categories?

In [56]:
df['CreditRisk'].value_counts()

Unnamed: 0_level_0,count
CreditRisk,Unnamed: 1_level_1
Medium,6289
Low,3066
High,645


## What is the average account balance for each age group?

In [55]:
df.groupby('AgeGroup')['Balance'].mean()

  df.groupby('AgeGroup')['Balance'].mean()


Unnamed: 0_level_0,Balance
AgeGroup,Unnamed: 1_level_1
Young,73198.463923
Middle-aged,77385.930233
Senior,75742.280172


## What is the distribution of customers across different geographies?

In [60]:
df['Geography'].value_counts()

Unnamed: 0_level_0,count
Geography,Unnamed: 1_level_1
France,5014
Germany,2509
Spain,2477


## What is the average balance of customers across different geographies and age groups?

In [59]:
df.groupby(['Geography','AgeGroup'])['Balance'].mean()

  df.groupby(['Geography','AgeGroup'])['Balance'].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Balance
Geography,AgeGroup,Unnamed: 2_level_1
France,Young,59631.646435
France,Middle-aged,62913.01255
France,Senior,59845.450216
Germany,Young,119186.476821
Germany,Middle-aged,119860.74421
Germany,Senior,119652.309735
Spain,Young,59047.073375
Spain,Middle-aged,62318.118085
Spain,Senior,64995.066667
