In [115]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc,accuracy_score,recall_score
import matplotlib.pyplot as plt
import numpy as np

In [116]:
df1 = pd.read_csv("Telco-Customer-Churn.csv")

In [117]:
df2 = pd.read_csv('Customer_Churn.csv')

## Merging Two Dataframes based on Customer_ID

In [118]:
df2.isnull().sum()

Unnamed: 0                              0
Customer ID                             0
Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges     682
Multiple Lines                        682
Internet Service                        0
Internet Type                        1526
Avg Monthly GB Download              1526
Online Security                      1526
Online Backup                        1526
Device Protection Plan               1526
Premium Tech Support                 1526
Streaming TV                      

In [119]:
column_mapping = {
    'customerID': 'Customer ID',
    'gender': 'Gender',
    'Partner': 'Married',
    'Dependents': 'Number of Dependents',
    'tenure': 'Tenure in Months',
    'PhoneService': 'Phone Service',
    'MultipleLines': 'Multiple Lines',
    'InternetService': 'Internet Service',
    'OnlineSecurity': 'Online Security',
    'OnlineBackup': 'Online Backup',
    'DeviceProtection': 'Device Protection Plan',
    'TechSupport': 'Premium Tech Support',
    'StreamingTV': 'Streaming TV',
    'StreamingMovies': 'Streaming Movies',
    'Contract': 'Contract',
    'PaperlessBilling': 'Paperless Billing',
    'PaymentMethod': 'Payment Method',
    'MonthlyCharges': 'Monthly Charge',
    'TotalCharges': 'Total Charges',
    'Churn': 'Churn'
}


In [120]:
df1.rename(columns=column_mapping, inplace=True)

In [121]:
combined_df = pd.merge(df1, df2, how='inner', on='Customer ID')


In [122]:
combined_df.columns

Index(['Customer ID', 'Gender_x', 'SeniorCitizen', 'Married_x',
       'Number of Dependents_x', 'Tenure in Months_x', 'Phone Service_x',
       'Multiple Lines_x', 'Internet Service_x', 'Online Security_x',
       'Online Backup_x', 'Device Protection Plan_x', 'Premium Tech Support_x',
       'Streaming TV_x', 'Streaming Movies_x', 'Contract_x',
       'Paperless Billing_x', 'Payment Method_x', 'Monthly Charge_x',
       'Total Charges_x', 'Churn', 'Unnamed: 0', 'Gender_y', 'Age',
       'Married_y', 'Number of Dependents_y', 'City', 'Zip Code', 'Latitude',
       'Longitude', 'Number of Referrals', 'Tenure in Months_y', 'Offer',
       'Phone Service_y', 'Avg Monthly Long Distance Charges',
       'Multiple Lines_y', 'Internet Service_y', 'Internet Type',
       'Avg Monthly GB Download', 'Online Security_y', 'Online Backup_y',
       'Device Protection Plan_y', 'Premium Tech Support_y', 'Streaming TV_y',
       'Streaming Movies_y', 'Streaming Music', 'Unlimited Data', 'Contract_y',

In [123]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 58 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Gender_x                           7043 non-null   object 
 2   SeniorCitizen                      7043 non-null   int64  
 3   Married_x                          7043 non-null   object 
 4   Number of Dependents_x             7043 non-null   object 
 5   Tenure in Months_x                 7043 non-null   int64  
 6   Phone Service_x                    7043 non-null   object 
 7   Multiple Lines_x                   7043 non-null   object 
 8   Internet Service_x                 7043 non-null   object 
 9   Online Security_x                  7043 non-null   object 
 10  Online Backup_x                    7043 non-null   object 
 11  Device Protection Plan_x           7043 non-null   objec

In [124]:
combined_df.drop(combined_df.columns[3:20], axis=1, inplace=True)

In [125]:
combined_df.drop(combined_df.columns[1:2],axis=1,inplace=True)
combined_df.drop('Unnamed: 0',axis=1,inplace=True)

In [126]:
combined_df.columns = combined_df.columns.str.replace('_y', '')

In [127]:
len(combined_df.columns)

39

In [12]:
## Storing Churn Reasons for future Use

reasons_df= combined_df.loc[combined_df['Churn'] == 'Yes', ['Customer ID', 'Churn Reason']]
combined_df_copy = combined_df.copy()

In [13]:
combined_df['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [14]:
# The 1869 value indicates that customers who were not churned has reason as Null values.
combined_df['Churn Reason'].notnull().sum()

1869

In [15]:
combined_df.describe()

Unnamed: 0,SeniorCitizen,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,6361.0,5517.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,46.509726,0.468692,93486.070567,36.197455,-119.756684,1.951867,32.386767,25.420517,26.189958,63.596131,2280.381264,1.962182,6.860713,749.099262
std,0.368612,16.750352,0.962802,1856.767505,2.468929,2.154425,3.001199,24.542061,14.200374,19.586585,31.204743,2266.220462,7.902614,25.104978,846.660055
min,0.0,19.0,0.0,90001.0,32.555828,-124.301372,0.0,1.0,1.01,2.0,-10.0,18.8,0.0,0.0,0.0
25%,0.0,32.0,0.0,92101.0,33.990646,-121.78809,0.0,9.0,13.05,13.0,30.4,400.15,0.0,0.0,70.545
50%,0.0,46.0,0.0,93518.0,36.205465,-119.595293,0.0,29.0,25.69,21.0,70.05,1394.55,0.0,0.0,401.44
75%,0.0,60.0,0.0,95329.0,38.161321,-117.969795,3.0,55.0,37.68,30.0,89.75,3786.6,0.0,0.0,1191.1
max,1.0,80.0,9.0,96150.0,41.962127,-114.192901,11.0,72.0,49.99,85.0,118.75,8684.8,49.79,150.0,3564.72


## Data Pre-Processing

In [16]:
combined_df.isnull().sum()

Customer ID                             0
SeniorCitizen                           0
Churn                                   0
Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges     682
Multiple Lines                        682
Internet Service                        0
Internet Type                        1526
Avg Monthly GB Download              1526
Online Security                      1526
Online Backup                        1526
Device Protection Plan               1526
Premium Tech Support              

- Since, the number of missing values are way too high, we need to fill them with appropriate imputation techniques

In [17]:
# Filling Null Values
# If feature is object, we fill it with mode else mean
for i in combined_df.columns:
    if i != 'Churn Reason':  # Skip imputation for 'Churn Reason' column
        if combined_df[i].dtype == 'object':
            if combined_df[i].isnull().sum() != 0:
                combined_df[i].fillna(combined_df[i].mode()[0], inplace=True)
        else:
            if combined_df[i].isnull().sum() != 0:
                combined_df[i].fillna(combined_df[i].mean(), inplace=True)


In [18]:
combined_df.isnull().sum()

Customer ID                             0
SeniorCitizen                           0
Churn                                   0
Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Number of Referrals                     0
Tenure in Months                        0
Offer                                   0
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                        0
Internet Type                           0
Avg Monthly GB Download                 0
Online Security                         0
Online Backup                           0
Device Protection Plan                  0
Premium Tech Support              

In [19]:
combined_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
Age,7043.0,46.509726,16.750352,19.0,32.0,46.0,60.0,80.0
Number of Dependents,7043.0,0.468692,0.962802,0.0,0.0,0.0,0.0,9.0
Zip Code,7043.0,93486.070567,1856.767505,90001.0,92101.0,93518.0,95329.0,96150.0
Latitude,7043.0,36.197455,2.468929,32.555828,33.990646,36.205465,38.161321,41.962127
Longitude,7043.0,-119.756684,2.154425,-124.301372,-121.78809,-119.595293,-117.969795,-114.192901
Number of Referrals,7043.0,1.951867,3.001199,0.0,0.0,0.0,3.0,11.0
Tenure in Months,7043.0,32.386767,24.542061,1.0,9.0,29.0,55.0,72.0
Avg Monthly Long Distance Charges,7043.0,25.420517,13.495231,1.01,14.455,25.420517,36.395,49.99
Avg Monthly GB Download,7043.0,26.189958,17.33496,2.0,15.0,26.0,27.0,85.0


## Converting Tenure from months to years

In [20]:
combined_df['Tenure In Years']=round(combined_df['Tenure in Months']/12,2)
combined_df.drop("Tenure in Months",axis=1,inplace=True)

In [21]:
len(combined_df.columns)

39

## Calculating Customer Life Time Value

In [22]:
combined_df.columns

Index(['Customer ID', 'SeniorCitizen', 'Churn', 'Gender', 'Age', 'Married',
       'Number of Dependents', 'City', 'Zip Code', 'Latitude', 'Longitude',
       'Number of Referrals', 'Offer', 'Phone Service',
       'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Customer Status', 'Churn Category', 'Churn Reason', 'Tenure In Years'],
      dtype='object')

In [23]:
combined_df["Transaction"]=combined_df["Total Charges"]/combined_df["Monthly Charge"]
combined_df['Transaction']=round(combined_df['Transaction'],0)

In [24]:
# Calculating Total Revenue for Each Customer
combined_df['Total Revenue']= combined_df['Total Charges'] + combined_df['Total Extra Data Charges'] + combined_df['Total Long Distance Charges']-combined_df['Total Refunds']

In [25]:
combined_df.head()

Unnamed: 0,Customer ID,SeniorCitizen,Churn,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,...,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Customer Status,Churn Category,Churn Reason,Tenure In Years,Transaction,Total Revenue
0,7590-VHVEG,0,No,Female,36,Yes,0,Los Angeles,90001,33.973616,...,29.85,0.0,0,0.0,Joined,Competitor,,0.08,1.0,29.85
1,5575-GNVDE,0,No,Male,46,No,0,Los Angeles,90002,33.949255,...,1889.5,0.0,0,581.06,Stayed,Competitor,,2.83,33.0,2470.56
2,3668-QPYBK,0,Yes,Male,37,No,0,Los Angeles,90003,33.964131,...,108.15,0.0,0,20.94,Churned,Competitor,Competitor made better offer,0.17,2.0,129.09
3,7795-CFOCW,0,No,Male,53,No,0,Los Angeles,90004,34.076259,...,1840.75,0.0,0,0.0,Stayed,Competitor,,3.75,44.0,1840.75
4,9237-HQITU,0,Yes,Female,19,No,2,Los Angeles,90005,34.059281,...,151.65,0.0,0,18.24,Churned,Other,Moved,0.17,2.0,169.89


In [26]:
def cltv_c(dataframe):
    cltv_c = dataframe.groupby('Customer ID').agg({
          'Total Revenue': lambda x: x.sum(),  
          'Transaction': lambda x: x.sum(),
          'Total Charges': lambda x: x.sum(),
          'Tenure In Years': 'mean'
      })
    cltv_c.columns = ['total_revenue', 'transaction', 'total_charges', 'average_tenure']
    cltv_c['avg_order_value'] = cltv_c['total_charges'] / cltv_c['transaction']
    cltv_c["purchase_frequency"] = cltv_c['transaction'] / cltv_c.shape[0]
    cltv_c['customer_value'] = (cltv_c['avg_order_value'] * cltv_c["purchase_frequency"])
    cltv_c['cltv'] = cltv_c['customer_value'] * cltv_c['average_tenure']

    return cltv_c


In [27]:
cltv_df=cltv_c(combined_df)

In [28]:
combined_df = pd.merge(combined_df, cltv_df['cltv'], on='Customer ID', how='left')

In [29]:
combined_df.head()

Unnamed: 0,Customer ID,SeniorCitizen,Churn,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,...,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Customer Status,Churn Category,Churn Reason,Tenure In Years,Transaction,Total Revenue,cltv
0,7590-VHVEG,0,No,Female,36,Yes,0,Los Angeles,90001,33.973616,...,0.0,0,0.0,Joined,Competitor,,0.08,1.0,29.85,0.000339
1,5575-GNVDE,0,No,Male,46,No,0,Los Angeles,90002,33.949255,...,0.0,0,581.06,Stayed,Competitor,,2.83,33.0,2470.56,0.759234
2,3668-QPYBK,0,Yes,Male,37,No,0,Los Angeles,90003,33.964131,...,0.0,0,20.94,Churned,Competitor,Competitor made better offer,0.17,2.0,129.09,0.00261
3,7795-CFOCW,0,No,Male,53,No,0,Los Angeles,90004,34.076259,...,0.0,0,0.0,Stayed,Competitor,,3.75,44.0,1840.75,0.980095
4,9237-HQITU,0,Yes,Female,19,No,2,Los Angeles,90005,34.059281,...,0.0,0,18.24,Churned,Other,Moved,0.17,2.0,169.89,0.00366


## Labelling Customers based on CLTV

In [30]:
clust_df=combined_df[['Customer ID','Offer','Premium Tech Support','Contract','cltv']]

## Reasons for only considering these columns to label customer value
- No of Referrals has more 0 values, so if you calculate kmeans cluster it is giving more weightage to number of referrals and labelling low cltv values customers as high.

In [31]:
clust_df.head()

Unnamed: 0,Customer ID,Offer,Premium Tech Support,Contract,cltv
0,7590-VHVEG,Offer E,No,Month-to-Month,0.000339
1,5575-GNVDE,Offer B,No,One Year,0.759234
2,3668-QPYBK,Offer B,No,Month-to-Month,0.00261
3,7795-CFOCW,Offer B,Yes,One Year,0.980095
4,9237-HQITU,Offer B,No,Month-to-Month,0.00366


In [32]:
LE=LabelEncoder()
columns_to_encode = ['Offer','Contract','Premium Tech Support']
for column in columns_to_encode:
    clust_df[column] = LE.fit_transform(clust_df[column])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust_df[column] = LE.fit_transform(clust_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust_df[column] = LE.fit_transform(clust_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust_df[column] = LE.fit_transform(clust_df[column])


In [33]:
clust_df.head()

Unnamed: 0,Customer ID,Offer,Premium Tech Support,Contract,cltv
0,7590-VHVEG,4,0,0,0.000339
1,5575-GNVDE,1,0,1,0.759234
2,3668-QPYBK,1,0,0,0.00261
3,7795-CFOCW,1,1,1,0.980095
4,9237-HQITU,1,0,0,0.00366


In [34]:
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(clust_df.iloc[:,1:])
clust_df['Cluster'] = kmeans.fit_predict(clust_df.iloc[:,1:])
cluster_labels = {
    0: 'High',
    1: 'Medium',
    2: 'Low'
}
clust_df['Cluster'] = clust_df['Cluster'].map(cluster_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust_df['Cluster'] = kmeans.fit_predict(clust_df.iloc[:,1:])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust_df['Cluster'] = clust_df['Cluster'].map(cluster_labels)


In [35]:
clust_df.head()

Unnamed: 0,Customer ID,Offer,Premium Tech Support,Contract,cltv,Cluster
0,7590-VHVEG,4,0,0,0.000339,High
1,5575-GNVDE,1,0,1,0.759234,Low
2,3668-QPYBK,1,0,0,0.00261,Low
3,7795-CFOCW,1,1,1,0.980095,Low
4,9237-HQITU,1,0,0,0.00366,Low


In [36]:
combined_df['Cluster'] = clust_df['Cluster']

In [37]:
clust_df['Cluster'].value_counts()

Cluster
Low       4096
Medium    1540
High      1407
Name: count, dtype: int64

In [38]:
combined_df.head()

Unnamed: 0,Customer ID,SeniorCitizen,Churn,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,...,Total Extra Data Charges,Total Long Distance Charges,Customer Status,Churn Category,Churn Reason,Tenure In Years,Transaction,Total Revenue,cltv,Cluster
0,7590-VHVEG,0,No,Female,36,Yes,0,Los Angeles,90001,33.973616,...,0,0.0,Joined,Competitor,,0.08,1.0,29.85,0.000339,High
1,5575-GNVDE,0,No,Male,46,No,0,Los Angeles,90002,33.949255,...,0,581.06,Stayed,Competitor,,2.83,33.0,2470.56,0.759234,Low
2,3668-QPYBK,0,Yes,Male,37,No,0,Los Angeles,90003,33.964131,...,0,20.94,Churned,Competitor,Competitor made better offer,0.17,2.0,129.09,0.00261,Low
3,7795-CFOCW,0,No,Male,53,No,0,Los Angeles,90004,34.076259,...,0,0.0,Stayed,Competitor,,3.75,44.0,1840.75,0.980095,Low
4,9237-HQITU,0,Yes,Female,19,No,2,Los Angeles,90005,34.059281,...,0,18.24,Churned,Other,Moved,0.17,2.0,169.89,0.00366,Low


In [39]:
# Encoding Cluster Names
cluster_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
combined_df['Cluster'] = combined_df['Cluster'].map(cluster_mapping)

In [40]:
combined_df['Cluster']

0       2
1       0
2       0
3       0
4       0
       ..
7038    0
7039    1
7040    0
7041    0
7042    1
Name: Cluster, Length: 7043, dtype: int64

## Feature Reduction Using PCA

In [41]:
columns_to_encode = [col for col in combined_df.columns[1:-1] if col != 'Churn Reason' and combined_df[col].dtype == 'object']
LE = LabelEncoder()
for i in columns_to_encode:
    combined_df[i] = LE.fit_transform(combined_df[i])


In [42]:
reduced_df=combined_df.drop(['Customer ID','Zip Code','Latitude','Longitude','Paperless Billing','Payment Method','Churn Category','Churn Reason','Transaction'],axis=1)
combined_df.drop(['Customer ID','Zip Code','Latitude','Longitude','Paperless Billing','Payment Method','Churn Category','Transaction'],axis=1,inplace=True)

In [43]:
combined_df[['Streaming TV', 'Streaming Movies', 'Streaming Music']].corr()

Unnamed: 0,Streaming TV,Streaming Movies,Streaming Music
Streaming TV,1.0,0.533094,0.455387
Streaming Movies,0.533094,1.0,0.848528
Streaming Music,0.455387,0.848528,1.0


In [44]:
columns_for_pca = ['Streaming TV', 'Streaming Movies', 'Streaming Music']

pca = PCA(n_components=2)
scaler=StandardScaler()
pca_components = pca.fit_transform(scaler.fit_transform(reduced_df[columns_for_pca]))

explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

reduced_df['Streaming Services-1'] = pca_components[:, 0]
reduced_df['Streaming Services-2'] = pca_components[:, 1]

combined_df['Streaming Services-1'] = pca_components[:, 0]
combined_df['Streaming Services-2'] = pca_components[:, 1]

reduced_df.drop(columns=columns_for_pca, inplace=True)
combined_df.drop(columns=columns_for_pca, inplace=True)

Explained Variance Ratio: [0.74741967 0.20386504]


In [45]:
reduced_df.head()

Unnamed: 0,SeniorCitizen,Churn,Gender,Age,Married,Number of Dependents,City,Number of Referrals,Offer,Phone Service,...,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Customer Status,Tenure In Years,Total Revenue,cltv,Cluster,Streaming Services-1,Streaming Services-2
0,0,0,0,36,1,0,554,0,4,0,...,0.0,0,0.0,1,0.08,29.85,0.000339,2,-1.333973,-0.157216
1,0,0,1,46,0,0,554,0,1,1,...,0.0,0,581.06,2,2.83,2470.56,0.759234,0,-1.333973,-0.157216
2,0,1,1,37,0,0,554,0,1,1,...,0.0,0,20.94,0,0.17,129.09,0.00261,0,-1.333973,-0.157216
3,0,0,1,53,0,0,554,0,1,0,...,0.0,0,0.0,2,3.75,1840.75,0.980095,0,-1.333973,-0.157216
4,0,1,0,19,0,2,554,0,1,1,...,0.0,0,18.24,0,0.17,169.89,0.00366,0,-1.333973,-0.157216


In [46]:
combined_df[['Monthly Charge', 'Total Charges', 'Total Refunds','Total Extra Data Charges','Total Long Distance Charges','Avg Monthly Long Distance Charges']].corr()

Unnamed: 0,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Avg Monthly Long Distance Charges
Monthly Charge,1.0,0.62281,0.023963,0.121238,0.235685,0.018455
Total Charges,0.62281,1.0,0.039558,0.121859,0.610185,0.016546
Total Refunds,0.023963,0.039558,1.0,0.016755,0.028113,-0.024869
Total Extra Data Charges,0.121238,0.121859,0.016755,1.0,0.058871,0.019822
Total Long Distance Charges,0.235685,0.610185,0.028113,0.058871,1.0,0.525222
Avg Monthly Long Distance Charges,0.018455,0.016546,-0.024869,0.019822,0.525222,1.0


In [47]:
columns_for_pca = ['Monthly Charge', 'Total Charges', 'Total Refunds','Total Extra Data Charges','Total Long Distance Charges','Avg Monthly Long Distance Charges']

pca = PCA(n_components=4)
pca_components = pca.fit_transform(scaler.fit_transform(reduced_df[columns_for_pca]))

explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

reduced_df['Charges-1'] = pca_components[:, 0]
reduced_df['Charges-2'] = pca_components[:, 1]
reduced_df['Charges-3'] = pca_components[:, 2]
reduced_df['Charges-4'] = pca_components[:, 3]

combined_df['Charges-1'] = pca_components[:, 0]
combined_df['Charges-2'] = pca_components[:, 1]
combined_df['Charges-3'] = pca_components[:, 2]
combined_df['Charges-4'] = pca_components[:, 3]

reduced_df.drop(columns=columns_for_pca, inplace=True)
combined_df.drop(columns=columns_for_pca, inplace=True)

Explained Variance Ratio: [0.35304704 0.20746549 0.16577886 0.16020383]


In [48]:
reduced_df.head()

Unnamed: 0,SeniorCitizen,Churn,Gender,Age,Married,Number of Dependents,City,Number of Referrals,Offer,Phone Service,...,Tenure In Years,Total Revenue,cltv,Cluster,Streaming Services-1,Streaming Services-2,Charges-1,Charges-2,Charges-3,Charges-4
0,0,0,0,36,1,0,554,0,4,0,...,0.08,29.85,0.000339,2,-1.333973,-0.157216,-1.651737,-0.524479,-0.109299,0.098462
1,0,0,1,46,0,0,554,0,1,1,...,2.83,2470.56,0.759234,0,-1.333973,-0.157216,-0.542043,0.285755,-0.260714,-0.284047
2,0,1,1,37,0,0,554,0,1,1,...,0.17,129.09,0.00261,0,-1.333973,-0.157216,-1.569044,0.617957,-0.270518,-0.188623
3,0,0,1,53,0,0,554,0,1,0,...,3.75,1840.75,0.980095,0,-1.333973,-0.157216,-0.989208,-0.123773,-0.201134,-0.091736
4,0,1,0,19,0,2,554,0,1,1,...,0.17,169.89,0.00366,0,-1.333973,-0.157216,-1.33312,0.933175,-0.340048,-0.283804


In [49]:
reduced_df.columns

Index(['SeniorCitizen', 'Churn', 'Gender', 'Age', 'Married',
       'Number of Dependents', 'City', 'Number of Referrals', 'Offer',
       'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type',
       'Avg Monthly GB Download', 'Online Security', 'Online Backup',
       'Device Protection Plan', 'Premium Tech Support', 'Unlimited Data',
       'Contract', 'Customer Status', 'Tenure In Years', 'Total Revenue',
       'cltv', 'Cluster', 'Streaming Services-1', 'Streaming Services-2',
       'Charges-1', 'Charges-2', 'Charges-3', 'Charges-4'],
      dtype='object')

In [50]:
combined_df.columns

Index(['SeniorCitizen', 'Churn', 'Gender', 'Age', 'Married',
       'Number of Dependents', 'City', 'Number of Referrals', 'Offer',
       'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type',
       'Avg Monthly GB Download', 'Online Security', 'Online Backup',
       'Device Protection Plan', 'Premium Tech Support', 'Unlimited Data',
       'Contract', 'Customer Status', 'Churn Reason', 'Tenure In Years',
       'Total Revenue', 'cltv', 'Cluster', 'Streaming Services-1',
       'Streaming Services-2', 'Charges-1', 'Charges-2', 'Charges-3',
       'Charges-4'],
      dtype='object')

In [51]:
combined_df[['Number of Dependents', 'Online Backup', 'Online Security', 'Avg Monthly GB Download','Churn']].corr()

Unnamed: 0,Number of Dependents,Online Backup,Online Security,Avg Monthly GB Download,Churn
Number of Dependents,1.0,0.001721,0.041977,0.24841,-0.21878
Online Backup,0.001721,1.0,0.283832,0.032511,-0.082255
Online Security,0.041977,0.283832,1.0,0.068969,-0.171226
Avg Monthly GB Download,0.24841,0.032511,0.068969,1.0,-0.084292
Churn,-0.21878,-0.082255,-0.171226,-0.084292,1.0


In [52]:
model_df = reduced_df.drop(['SeniorCitizen','Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type',
       'Avg Monthly GB Download', 'Online Backup','Customer Status'],axis=1)
combined_df.drop(['SeniorCitizen','Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type',
       'Avg Monthly GB Download', 'Online Backup','Customer Status'],axis=1,inplace=True)

In [129]:
model_df.columns

Index(['Churn', 'Gender', 'Age', 'Married', 'Number of Dependents', 'City',
       'Number of Referrals', 'Offer', 'Online Security',
       'Device Protection Plan', 'Premium Tech Support', 'Unlimited Data',
       'Contract', 'Tenure In Years', 'Total Revenue', 'cltv', 'Cluster',
       'Streaming Services-1', 'Streaming Services-2', 'Charges-1',
       'Charges-2', 'Charges-3', 'Charges-4'],
      dtype='object')

## Splitting into Train and Test Data

In [54]:
x=model_df.drop('Churn',axis=1)
y=model_df['Churn']

In [55]:
# Training and testing for churn prediction
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [56]:
# Training data for churn reason prediction

# Get the indices of records in x_test
indices_to_exclude = x_test.index

# Create x_train_reason by excluding records present in x_test
x_train_reason = combined_df[(combined_df['Churn'] == 1) & (~combined_df.index.isin(indices_to_exclude))]

# Create y_train_reason corresponding to x_train_reason
y_train_reason = x_train_reason['Churn Reason']

# Drop the 'Churn Reason' column from x_train_reason
x_train_reason.drop(['Churn Reason','Churn'], axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train_reason.drop(['Churn Reason','Churn'], axis=1, inplace=True)


In [57]:
x_train_reason

Unnamed: 0,Gender,Age,Married,Number of Dependents,City,Number of Referrals,Offer,Online Security,Device Protection Plan,Premium Tech Support,...,Tenure In Years,Total Revenue,cltv,Cluster,Streaming Services-1,Streaming Services-2,Charges-1,Charges-2,Charges-3,Charges-4
2,1,37,0,0,554,0,1,1,0,0,...,0.17,129.09,0.002610,0,-1.333973,-0.157216,-1.569044,0.617957,-0.270518,-0.188623
4,0,19,0,2,554,0,1,0,0,0,...,0.17,169.89,0.003660,0,-1.333973,-0.157216,-1.333120,0.933175,-0.340048,-0.283804
5,0,31,0,2,554,0,1,0,1,0,...,0.67,917.70,0.078054,0,2.227598,0.186057,-0.603732,1.224686,-0.443543,-0.433743
13,1,38,0,1,554,0,1,0,1,0,...,4.08,7208.47,2.917522,1,2.227598,0.186057,2.625339,-0.820637,-0.281941,-0.489975
18,0,21,1,0,854,6,1,0,1,1,...,0.83,622.45,0.062265,0,-1.333973,-0.157216,-1.412004,0.714972,-0.286838,-0.241256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,1,64,1,2,695,1,1,0,0,0,...,0.08,108.33,0.000802,0,-1.333973,-0.157216,-0.741027,-0.625236,-0.197841,0.048530
7021,1,44,0,0,723,0,3,0,0,1,...,1.00,843.00,0.103337,2,-0.325106,1.622951,-1.272123,0.784960,-0.306552,-0.275683
7026,0,40,0,0,905,0,1,0,0,0,...,0.75,780.63,0.042952,0,-1.333973,-0.157216,-0.737632,-1.328836,-0.067987,0.177239
7032,1,79,0,0,306,0,1,0,0,0,...,0.08,84.10,0.000860,0,-1.333973,-0.157216,-1.299465,1.040135,-0.360949,-0.309537


## Model Building

In [58]:
accuracy_models={}
recall_models={}

## Decision Tree

In [59]:
DT = DecisionTreeClassifier()
DT.fit(x_train,y_train)

In [60]:
DT_y_pred = DT.predict(x_test)

In [61]:
print("The Accuracy of the Decision Tree Model is: {}%".format(round(accuracy_score(y_test,DT_y_pred)*100,2)))
accuracy_models['Decision Tree']=round(accuracy_score(y_test,DT_y_pred)*100,2)

The Accuracy of the Decision Tree Model is: 77.71%


In [62]:
print("The Recall of the Decision Tree Model is: {}%".format(round(recall_score(y_test,DT_y_pred)*100,2)))
recall_models['Decision Tree']=round(recall_score(y_test,DT_y_pred)*100,2)

The Recall of the Decision Tree Model is: 60.32%


In [63]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
print("Confusion Matrix of Decision Tree Model:")
cf_indi_rand= pd.DataFrame(confusion_matrix(y_test,DT_y_pred),columns=['Predicted_0','Predicted_1'])
cf_indi_rand.index=['Actual_0','Actual_1']
cf_indi_rand

Confusion Matrix of Decision Tree Model:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,870,166
Actual_1,148,225


## Logistic Regression

In [64]:
LR = LogisticRegression()
LR.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
LR_ypred = LR.predict(x_test)

In [66]:
print("The Accuracy of the Logistic Regression Model is: {}%".format(round(accuracy_score(y_test,LR_ypred)*100,2)))
accuracy_models['Logistic Regression']=round(accuracy_score(y_test,LR_ypred)*100,2)

The Accuracy of the Logistic Regression Model is: 84.17%


In [67]:
print("The Recall of the Logistic Regression Model is: {}%".format(round(recall_score(y_test,LR_ypred)*100,2)))
recall_models['Logistic Regression']=round(recall_score(y_test,LR_ypred)*100,2)

The Recall of the Logistic Regression Model is: 65.15%


In [68]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix,recall_score
print("Confusion Matrix of Logistic Regression Model:")
cf_indi_rand= pd.DataFrame(confusion_matrix(y_test,LR_ypred),columns=['Predicted_0','Predicted_1'])
cf_indi_rand.index=['Actual_0','Actual_1']
cf_indi_rand

Confusion Matrix of Logistic Regression Model:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,943,93
Actual_1,130,243


## Individual Random Forest Model

In [86]:
RF = RandomForestClassifier()
RF.fit(x_train,y_train)

In [87]:
indi_rf_ypred = RF.predict(x_test)

In [88]:
print("The Accuracy of the Random Forest Model: {} %".format(round(accuracy_score(y_test,indi_rf_ypred)*100,2)))
accuracy_models['Random Forest']=round(accuracy_score(y_test,indi_rf_ypred)*100,2)

The Accuracy of the Random Forest Model: 84.32 %


In [89]:
print("The Recall of the Random Forest Model: {} %".format(round(recall_score(y_test,indi_rf_ypred)*100,2)))
recall_models['Random Forest']=round(recall_score(y_test,indi_rf_ypred)*100,2)

The Recall of the Random Forest Model: 63.27 %


In [73]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
print("Confusion Matrix of Random Forest Model:")
cf_indi_rand= pd.DataFrame(confusion_matrix(y_test,indi_rf_ypred),columns=['Predicted_0','Predicted_1'])
cf_indi_rand.index=['Actual_0','Actual_1']
cf_indi_rand

Confusion Matrix of Random Forest Model:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,954,82
Actual_1,137,236


### Hyperparameter Tuning

In [74]:
param = {
    'n_estimators': [10,20,50,100],  
    'max_features': ['sqrt', 'log2'],  
    'max_depth': [2,4,6,8], 
    'criterion': ['gini', 'entropy'] 
}

RF1 = RandomForestClassifier(random_state=10)
 
grid = GridSearchCV(estimator=RF1, 
                    param_grid=param,
                    scoring="accuracy",
                    verbose=1, cv=5)

grid.fit(x_train,y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [75]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'n_estimators': 100}

In [76]:
rf1 = grid.best_estimator_

In [77]:
tuned_y_pred = rf1.predict(x_test)

In [78]:
from sklearn.metrics import accuracy_score
print("The Accuracy of the Tuned Random Forest Model is: {}%".format(round(accuracy_score(y_test,tuned_y_pred)*100,2)))
accuracy_models['Tuned Random Forest']=round(accuracy_score(y_test,tuned_y_pred)*100,2)

The Accuracy of the Tuned Random Forest Model is: 84.46%


In [79]:
from sklearn.metrics import accuracy_score
print("The recall of the Tuned Random Forest Model is: {}%".format(round(recall_score(y_test,tuned_y_pred)*100,2)))
recall_models['Tuned Random Forest']=round(recall_score(y_test,tuned_y_pred)*100,2)

The recall of the Tuned Random Forest Model is: 60.32%


In [80]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
print("Confusion Matrix of Tuned Random Forest Model:")
cf_tuned_rand= pd.DataFrame(confusion_matrix(y_test,tuned_y_pred),columns=['Predicted_0','Predicted_1'])
cf_tuned_rand.index=['Actual_0','Actual_1']
cf_tuned_rand

Confusion Matrix of Tuned Random Forest Model:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,965,71
Actual_1,148,225


In [81]:
accuracies_df = pd.DataFrame.from_dict(accuracy_models, orient='index', columns=['Accuracy'])
accuracies_df['Recall']=recall_models
accuracies_df.index.name = 'Model Name'  
accuracies_df.head()

Unnamed: 0_level_0,Accuracy,Recall
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Decision Tree,77.71,60.32
Logistic Regression,84.17,65.15
Random Forest,84.46,63.27
Tuned Random Forest,84.46,60.32


## Finding Target Customers

In [90]:
# adding best model predictions to test set.
x_test['Churn']=LR_ypred

In [91]:
len(x_test)

1409

In [92]:
# High Valued
print("Total Number of High-valued customers that are going to churn: ",len(x_test[(x_test['Cluster'] == 2) & (x_test['Churn'] == 1)][['Cluster','Churn']]))
x_test[(x_test['Cluster'] == 2) & (x_test['Churn'] == 1)][['Cluster','Churn']].head()

Total Number of High-valued customers that are going to churn:  135


Unnamed: 0,Cluster,Churn
5275,2,1
5194,2,1
1662,2,1
4810,2,1
742,2,1


In [93]:
# Medium Valued

print("Total Number of Medium-valued customers that are going to churn: ",len(x_test[(x_test['Cluster'] == 1) & (x_test['Churn'] == 1)][['Cluster','Churn']]))
x_test[(x_test['Cluster'] == 1) & (x_test['Churn'] == 1)][['Cluster','Churn']].head()

Total Number of Medium-valued customers that are going to churn:  14


Unnamed: 0,Cluster,Churn
4395,1,1
2206,1,1
2609,1,1
6819,1,1
2877,1,1


In [94]:
# Low Valued

len(x_test[(x_test['Cluster'] == 0) & (x_test['Churn'] == 1)][['Cluster','Churn']])

187

In [96]:
# High Valued customers retained

len(x_test[(x_test['Cluster'] == 2) & (x_test['Churn'] == 0)][['Cluster','Churn']])

159

In [97]:
original_customer_ids = combined_df_copy.iloc[x_test[(x_test['Cluster'] == 2) & (x_test['Churn'] == 1)][['Cluster','Churn']]
.index]['Customer ID']


In [98]:
print("Customer ID's of High-Valued Customers that are going to Churn: ")
original_customer_ids

Customer ID's of High-Valued Customers that are going to Churn: 


5275    9530-EHPOH
5194    1902-XBTFB
1662    2058-DCJBE
4810    1112-CUNAO
742     5138-WVKYJ
           ...    
3721    2839-RFSQE
6498    0742-NXBGR
6848    5916-QEWPT
2306    7245-NIIWQ
5872    4550-EVXNY
Name: Customer ID, Length: 135, dtype: object

## Predict Churn Reasons for the High-Valued Customers

In [99]:
# target_cust_df is the customer id's of customer who are high valued and churn is yes
target_cust_df=combined_df_copy[combined_df_copy['Customer ID'].isin(original_customer_ids)]

In [100]:
target_cust_df=target_cust_df.merge(reasons_df,on='Customer ID')

In [101]:
target_cust_df.head()

Unnamed: 0,Customer ID,SeniorCitizen,Churn,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Customer Status,Churn Category,Churn Reason_x,Churn Reason_y
0,9947-OTFQU,1,Yes,Male,65,No,0,Covina,91722,34.097345,...,Bank Withdrawal,74.4,1074.3,0.0,0,436.65,Churned,Competitor,Competitor had better devices,Competitor had better devices
1,2034-GDRCN,0,Yes,Female,47,No,0,San Diego,92122,32.85723,...,Bank Withdrawal,90.4,168.2,0.0,0,21.06,Churned,Other,Don't know,Don't know
2,0259-GBZSH,0,Yes,Male,22,No,0,San Diego,92122,32.85723,...,Bank Withdrawal,85.65,181.5,0.0,0,80.86,Churned,Competitor,Competitor made better offer,Competitor made better offer
3,6905-NIQIN,0,Yes,Male,26,No,0,San Diego,92122,32.85723,...,Credit Card,50.65,50.65,0.0,0,14.78,Churned,Competitor,Competitor made better offer,Competitor made better offer
4,4800-VHZKI,0,Yes,Female,59,No,0,San Diego,92122,32.85723,...,Credit Card,19.9,19.9,0.0,0,36.63,Churned,Competitor,Competitor made better offer,Competitor made better offer


## Processing Churn Reasons and Making Predictions for Target Customers

In [102]:
x_train_reason

le = LabelEncoder()
y_train_reason = le.fit_transform(y_train_reason)

In [103]:
x_train_reason

Unnamed: 0,Gender,Age,Married,Number of Dependents,City,Number of Referrals,Offer,Online Security,Device Protection Plan,Premium Tech Support,...,Tenure In Years,Total Revenue,cltv,Cluster,Streaming Services-1,Streaming Services-2,Charges-1,Charges-2,Charges-3,Charges-4
2,1,37,0,0,554,0,1,1,0,0,...,0.17,129.09,0.002610,0,-1.333973,-0.157216,-1.569044,0.617957,-0.270518,-0.188623
4,0,19,0,2,554,0,1,0,0,0,...,0.17,169.89,0.003660,0,-1.333973,-0.157216,-1.333120,0.933175,-0.340048,-0.283804
5,0,31,0,2,554,0,1,0,1,0,...,0.67,917.70,0.078054,0,2.227598,0.186057,-0.603732,1.224686,-0.443543,-0.433743
13,1,38,0,1,554,0,1,0,1,0,...,4.08,7208.47,2.917522,1,2.227598,0.186057,2.625339,-0.820637,-0.281941,-0.489975
18,0,21,1,0,854,6,1,0,1,1,...,0.83,622.45,0.062265,0,-1.333973,-0.157216,-1.412004,0.714972,-0.286838,-0.241256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,1,64,1,2,695,1,1,0,0,0,...,0.08,108.33,0.000802,0,-1.333973,-0.157216,-0.741027,-0.625236,-0.197841,0.048530
7021,1,44,0,0,723,0,3,0,0,1,...,1.00,843.00,0.103337,2,-0.325106,1.622951,-1.272123,0.784960,-0.306552,-0.275683
7026,0,40,0,0,905,0,1,0,0,0,...,0.75,780.63,0.042952,0,-1.333973,-0.157216,-0.737632,-1.328836,-0.067987,0.177239
7032,1,79,0,0,306,0,1,0,0,0,...,0.08,84.10,0.000860,0,-1.333973,-0.157216,-1.299465,1.040135,-0.360949,-0.309537


In [104]:
y_train_reason

array([ 3, 13, 13, ...,  4,  3,  3])

# Random Forest Model

In [105]:
TRF = RandomForestClassifier(random_state=10)
 
grid1 = GridSearchCV(estimator=TRF, 
                    param_grid=param,
                    scoring="accuracy",
                    verbose=1, cv=3)

grid1.fit(x_train_reason,y_train_reason)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


In [106]:
trf = grid1.best_estimator_

In [107]:
x_train_reason.columns

Index(['Gender', 'Age', 'Married', 'Number of Dependents', 'City',
       'Number of Referrals', 'Offer', 'Online Security',
       'Device Protection Plan', 'Premium Tech Support', 'Unlimited Data',
       'Contract', 'Tenure In Years', 'Total Revenue', 'cltv', 'Cluster',
       'Streaming Services-1', 'Streaming Services-2', 'Charges-1',
       'Charges-2', 'Charges-3', 'Charges-4'],
      dtype='object')

In [108]:
## getting records of customer that are high-valued and going to churn

x_test=x_test[(x_test['Cluster'] == 2) & (x_test['Churn'] == 1)]

In [109]:
x_test.drop("Churn",axis=1,inplace=True)

In [110]:
reasons_ypred = trf.predict(x_test)

In [111]:
pd.Series(reasons_ypred).unique()

array([3, 2, 1])

In [112]:
decoded_data = le.inverse_transform(reasons_ypred)
print("Decoded data:", decoded_data)

Decoded data: ['Competitor made better offer' 'Competitor had better devices'
 'Competitor had better devices' 'Competitor had better devices'
 'Competitor made better offer' 'Competitor had better devices'
 'Competitor made better offer' 'Competitor had better devices'
 'Attitude of support person' 'Competitor made better offer'
 'Competitor had better devices' 'Competitor had better devices'
 'Competitor made better offer' 'Competitor had better devices'
 'Competitor had better devices' 'Competitor had better devices'
 'Competitor made better offer' 'Competitor had better devices'
 'Competitor made better offer' 'Competitor had better devices'
 'Competitor had better devices' 'Competitor made better offer'
 'Competitor had better devices' 'Attitude of support person'
 'Competitor had better devices' 'Competitor had better devices'
 'Competitor had better devices' 'Competitor had better devices'
 'Competitor made better offer' 'Competitor made better offer'
 'Competitor had better dev

In [113]:
pd.Series(decoded_data).unique()

array(['Competitor made better offer', 'Competitor had better devices',
       'Attitude of support person'], dtype=object)

In [114]:
unique_values, counts = np.unique(decoded_data, return_counts=True)

for value, count in zip(unique_values, counts):
    print(f"Value: {value}, Count: {count}")

Value: Attitude of support person, Count: 5
Value: Competitor had better devices, Count: 87
Value: Competitor made better offer, Count: 43
