### Preparing sample data for the Streamlit app demo

In [53]:
import pandas as pd 
from sklearn.utils import shuffle

In [54]:
# Read the raw customer-churn test CSV that the demo sample is built from.
RAW_TEST_CSV = '../raw/customer_churn_dataset-testing-master.csv'
sample_data = pd.read_csv(RAW_TEST_CSV)

In [55]:
# Columns cast to plain integers once rows with missing values are gone.
INT_COLUMNS = ['CustomerID', 'Age', 'Tenure', 'Usage Frequency', 'Support Calls',
               'Total Spend', 'Last Interaction', 'Churn']

# Raw CSV header -> snake_case column name.
COLUMN_RENAMES = {
    'Age': 'age',
    'Gender': 'gender',
    'Tenure': 'tenure',
    'Usage Frequency': 'usage_frequency',
    'Support Calls': 'support_calls',
    'Payment Delay': 'payment_delay',
    'Contract Length': 'contract_length',
    'Total Spend': 'total_spend',
    'Last Interaction': 'last_interaction',
    'Subscription Type': 'subscription_type',
    'Churn': 'churn',
}

# Drop incomplete rows, fix dtypes, then discard the ID column.
cleaned = (
    sample_data
    .dropna(axis=0)
    .astype({name: 'int' for name in INT_COLUMNS})
    .drop(columns='CustomerID')
)

# Move the label to the last position, then normalise the column names.
columns = [col for col in cleaned.columns if col != 'Churn'] + ['Churn']
cleaned = cleaned[columns].rename(columns=COLUMN_RENAMES)

# Shuffle the rows reproducibly and renumber the index from 0.
sample_data = shuffle(cleaned, random_state=42).reset_index(drop=True)

In [56]:
# Trim stray whitespace around the text values of each categorical column.
for text_col in ('gender', 'contract_length', 'subscription_type'):
    sample_data[text_col] = sample_data[text_col].str.strip()


In [57]:
# First 2000 rows of the shuffled frame, keeping every column (the features
# plus the trailing `churn` label).
# NOTE(review): despite the name this is not a labels-only frame — confirm the
# consumer of sample_y_values.csv expects the feature columns too.
sample_y = sample_data.head(2000)

In [58]:
# Feature-only frame for the demo: the first 2000 rows without the label.
# `churn` is dropped by name instead of slicing off "the last column", so
# re-running this cell cannot silently strip a real feature column;
# errors='ignore' makes the re-run a no-op once `churn` is already gone.
sample_data = sample_data.iloc[:2000].drop(columns='churn', errors='ignore')

In [59]:
# Sanity check: per-column count of missing values, then of empty strings.
missing_counts = sample_data.isna().sum()
blank_counts = sample_data.eq("").sum()
print(missing_counts)
print(blank_counts)


age                  0
gender               0
tenure               0
usage_frequency      0
support_calls        0
payment_delay        0
subscription_type    0
contract_length      0
total_spend          0
last_interaction     0
dtype: int64
age                  0
gender               0
tenure               0
usage_frequency      0
support_calls        0
payment_delay        0
subscription_type    0
contract_length      0
total_spend          0
last_interaction     0
dtype: int64


In [60]:
# Show the dtype of every remaining column.
column_dtypes = sample_data.dtypes
print(column_dtypes)


age                   int64
gender               object
tenure                int64
usage_frequency       int64
support_calls         int64
payment_delay         int64
subscription_type    object
contract_length      object
total_spend           int64
last_interaction      int64
dtype: object


In [61]:
# Print the distinct values of each categorical column to spot unexpected labels.
for cat_col in ('gender', 'contract_length', 'subscription_type'):
    print(sample_data[cat_col].unique())


['Male' 'Female']
['Monthly' 'Quarterly' 'Annual']
['Standard' 'Premium' 'Basic']


In [62]:
# Preview the first rows of the feature frame that will be exported.
PREVIEW_ROWS = 10
sample_data.head(PREVIEW_ROWS)

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction
0,55,Male,20,24,4,6,Standard,Monthly,635,25
1,28,Male,27,30,4,5,Premium,Quarterly,631,10
2,65,Female,60,17,7,16,Premium,Quarterly,314,1
3,53,Male,47,16,8,7,Premium,Annual,527,13
4,32,Male,56,5,7,15,Premium,Annual,236,25
5,60,Female,9,6,9,5,Premium,Quarterly,638,24
6,37,Male,1,28,6,13,Premium,Annual,875,28
7,21,Male,59,7,0,7,Basic,Annual,814,15
8,26,Male,37,23,6,25,Basic,Annual,541,29
9,65,Male,35,12,3,0,Premium,Monthly,476,11


In [63]:
# Write both demo frames to CSV in the working directory, without the index.
exports = (
    (sample_data, "sampled_test_data.csv"),
    (sample_y, "sample_y_values.csv"),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)