Install required libraries in notebook

In [35]:
!pip install pandas numpy matplotlib seaborn scikit-learn

Defaulting to user installation because normal site-packages is not writeable


Verify Installation and Import Libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


Load and Explore Dataset

In [37]:
import pandas as pd
df = pd.read_csv('Dataset 104.csv')
df.head()

Unnamed: 0,CustomerID,Gender,Age,Tenure,SubscriptionType,MonthlyCharges,TotalCharges,PaymentMethod,ContractType,StreamingServiceUsed,SupportCalls,Churn
0,1,Male,34,11,Premium,90.12,3080.77,Bank Transfer,Two Year,Yes,5,1
1,2,Female,26,56,Standard,35.87,1477.14,Bank Transfer,Month-to-Month,Yes,7,1
2,3,Male,50,67,Standard,43.1,950.13,Bank Transfer,Month-to-Month,Yes,1,0
3,4,Male,37,29,Standard,15.23,1960.78,Bank Transfer,Month-to-Month,No,3,0
4,5,Male,30,9,Premium,20.04,4027.08,PayPal,Two Year,No,6,0


Get basic information about the dataset

In [38]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CustomerID            1000 non-null   int64  
 1   Gender                1000 non-null   object 
 2   Age                   1000 non-null   int64  
 3   Tenure                1000 non-null   int64  
 4   SubscriptionType      1000 non-null   object 
 5   MonthlyCharges        1000 non-null   float64
 6   TotalCharges          1000 non-null   float64
 7   PaymentMethod         1000 non-null   object 
 8   ContractType          1000 non-null   object 
 9   StreamingServiceUsed  1000 non-null   object 
 10  SupportCalls          1000 non-null   int64  
 11  Churn                 1000 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 93.9+ KB


Summarize numerical columns

In [39]:

df.describe()


Unnamed: 0,CustomerID,Age,Tenure,MonthlyCharges,TotalCharges,SupportCalls,Churn
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,43.799,34.569,54.57893,2527.43377,4.59,0.177
std,288.819436,15.133857,20.348103,26.029768,1398.473052,2.993804,0.38186
min,1.0,18.0,1.0,10.0,100.15,0.0,0.0
25%,250.75,31.0,17.0,31.8775,1330.375,2.0,0.0
50%,500.5,44.0,33.5,54.72,2552.1,5.0,0.0
75%,750.25,57.0,52.0,77.35,3697.6025,7.0,0.0
max,1000.0,69.0,71.0,99.96,4988.97,9.0,1.0


Check for any missing values available in dataset

In [40]:
df.isnull().sum()


CustomerID              0
Gender                  0
Age                     0
Tenure                  0
SubscriptionType        0
MonthlyCharges          0
TotalCharges            0
PaymentMethod           0
ContractType            0
StreamingServiceUsed    0
SupportCalls            0
Churn                   0
dtype: int64

Dropping the rows with missing values also cleaning the dataset in this way

In [41]:
df.dropna(inplace=True)

Converting categorical variables into numerical format using one-hot encoding.

In [42]:
df = pd.get_dummies(df, drop_first=True)


Standardizing the numerical features to ensure they have a mean of 0 and a standard deviation of 1

In [43]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(df[['MonthlyCharges', 'TotalCharges']])


Defining Features and Target variable

In [44]:
# Features
X = df.drop('Churn', axis=1)

# Target variable
y = df['Churn']


splitting the data into training and testing sets

In [45]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Now building the model

In [46]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf_model = RandomForestClassifier(random_state=42)


Train the model

In [47]:
rf_model.fit(X_train, y_train)


Use the trained model to make predictions on the test data

In [48]:
y_pred = rf_model.predict(X_test)


Access and Evaluate Model Performance

In [49]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
print(confusion_matrix(y_test, y_pred))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


              precision    recall  f1-score   support

           0       0.81      1.00      0.89       161
           1       0.00      0.00      0.00        39

    accuracy                           0.81       200
   macro avg       0.40      0.50      0.45       200
weighted avg       0.65      0.81      0.72       200

[[161   0]
 [ 39   0]]
Accuracy: 0.805


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Identifying Important Features

In [50]:
# Feature importances
feature_importances = rf_model.feature_importances_
features = X.columns

# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top features
feature_importance_df.head(10)


Unnamed: 0,Feature,Importance
0,CustomerID,0.168186
4,TotalCharges,0.154505
3,MonthlyCharges,0.152839
2,Tenure,0.137312
1,Age,0.129166
5,SupportCalls,0.084409
7,SubscriptionType_Premium,0.024372
13,StreamingServiceUsed_Yes,0.023377
6,Gender_Male,0.022983
11,ContractType_One Year,0.022093
