In [1]:
#import libraries
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('Churndata.csv')

In [None]:
#Show sample data
data.head()

Unnamed: 0,CustomerID,Gender,Age,SignupDate,LastPurchaseDate,MonthlySpend,ContractType,Country,Churn
0,CUST-0001,Female,20.0,07-01-2023,04-04-2025,1625.01,Quarterly,UK,0
1,CUST-0002,Female,33.0,22-10-2020,14-07-2024,3395.11,Monthly,India,1
2,CUST-0003,Female,40.0,03-03-2023,06-11-2024,1592.27,Annual,India,0
3,CUST-0004,Female,50.0,25-11-2020,14-10-2024,2015.18,Monthly,USA,0
4,CUST-0005,Male,33.0,26-06-2022,29-12-2024,1472.24,Monthly,USA,1


In [31]:
#show null columns
data.isnull().sum()

CustomerID          0
Gender              0
Age                 0
SignupDate          0
LastPurchaseDate    0
MonthlySpend        0
ContractType        0
Country             0
Churn               0
dtype: int64

In [13]:
data.isnull().mean() * 100

CustomerID          0.0
Gender              0.0
Age                 3.0
SignupDate          0.0
LastPurchaseDate    0.0
MonthlySpend        5.0
ContractType        0.0
Country             0.0
Churn               0.0
dtype: float64

Age has 3 missing values and monthlyspend has 5. Total 8 rows have missing data. We will impute them and fill them with Mean.

In [19]:
data.describe()

Unnamed: 0,Age,MonthlySpend,Churn
count,97.0,95.0,100.0
mean,38.164948,2184.706842,0.35
std,13.699666,1254.952965,0.479372
min,18.0,996.36,0.0
25%,26.0,1683.775,0.0
50%,37.0,2003.62,0.0
75%,50.0,2389.725,1.0
max,63.0,10000.0,1.0


In [None]:
data['Age'].fillna(data['Age'].mean(),inplace =True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df=data['Age'].fillna(data['Age'].mean(),inplace =True)


In [30]:
data['MonthlySpend'].fillna(data['MonthlySpend'].mean(),inplace =True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['MonthlySpend'].fillna(data['MonthlySpend'].mean(),inplace =True)


Customer ID is not required, we will drop it. 

In [33]:
data = data.drop('CustomerID', axis =1)

In [34]:
data

Unnamed: 0,Gender,Age,SignupDate,LastPurchaseDate,MonthlySpend,ContractType,Country,Churn
0,Female,20.0,07-01-2023,04-04-2025,1625.01,Quarterly,UK,0
1,Female,33.0,22-10-2020,14-07-2024,3395.11,Monthly,India,1
2,Female,40.0,03-03-2023,06-11-2024,1592.27,Annual,India,0
3,Female,50.0,25-11-2020,14-10-2024,2015.18,Monthly,USA,0
4,Male,33.0,26-06-2022,29-12-2024,1472.24,Monthly,USA,1
...,...,...,...,...,...,...,...,...
95,Male,35.0,18-06-2022,07-01-2025,1033.15,Monthly,UK,0
96,Male,58.0,18-11-2020,18-12-2024,2760.64,Quarterly,Germany,0
97,Female,41.0,25-08-2022,15-03-2025,2454.69,Monthly,Germany,1
98,Male,51.0,17-05-2024,25-04-2025,1838.53,Monthly,India,1


In [None]:
#Finding unique values for categorical columns:
data['Gender'].value_counts()  #Male, Female

Gender
Male      54
Female    46
Name: count, dtype: int64

In [37]:
data['ContractType'].value_counts()

ContractType
Quarterly    36
Monthly      35
Annual       29
Name: count, dtype: int64

In [38]:
data['Country'].value_counts()

Country
India      29
USA        26
Germany    23
UK         22
Name: count, dtype: int64

Encoding categorical columns

In [41]:
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

In [43]:
data['ContractType'] = data['ContractType'].map({'Quarterly': 0, 'Monthly': 1, 'Annual': 2})

In [44]:
data['Country'] = data['Country'].map({'India': 0, 'USA': 1, 'Germany': 2, 'UK':3})

In [45]:
data

Unnamed: 0,Gender,Age,SignupDate,LastPurchaseDate,MonthlySpend,ContractType,Country,Churn
0,1,20.0,07-01-2023,04-04-2025,1625.01,0,3,0
1,1,33.0,22-10-2020,14-07-2024,3395.11,1,0,1
2,1,40.0,03-03-2023,06-11-2024,1592.27,2,0,0
3,1,50.0,25-11-2020,14-10-2024,2015.18,1,1,0
4,0,33.0,26-06-2022,29-12-2024,1472.24,1,1,1
...,...,...,...,...,...,...,...,...
95,0,35.0,18-06-2022,07-01-2025,1033.15,1,3,0
96,0,58.0,18-11-2020,18-12-2024,2760.64,0,2,0
97,1,41.0,25-08-2022,15-03-2025,2454.69,1,2,1
98,0,51.0,17-05-2024,25-04-2025,1838.53,1,0,1


#Transforming datetime colums to numerical columns

In [55]:
data['LastPurchaseDate'] = pd.to_datetime(data['LastPurchaseDate'],dayfirst =True)

In [57]:
data['SignupDate'] = pd.to_datetime(data['SignupDate'],dayfirst =True)

In [59]:
data['Pyear'] = data['LastPurchaseDate'].dt.year

In [61]:
data['Pmonth'] = data['LastPurchaseDate'].dt.month

In [62]:
data['Syear'] = data['SignupDate'].dt.year

In [63]:
data['Smonth'] = data['SignupDate'].dt.month

In [64]:
today = pd.Timestamp.today()

In [None]:
#calculating customer's tenure(new feature)
data['CustomerTenure'] = (today - data['SignupDate']).dt.days

In [67]:
df = data.drop(['SignupDate','LastPurchaseDate'],axis =1)

In [71]:
#Split into train -test and scale the dataset
X = df.drop(['Churn'],axis =1)
y= df['Churn']

In [78]:
#import train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [76]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [79]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Handled Missing values and imputed with mean. 
Encoded categorical variables and scaled numerical features.
Create feature such as customer tenure but feature for average spend does not make sense as it is monthly spend value per customer. 