In [10]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification


# ---------------------------------------------------------------------------
# Build a synthetic customer-churn dataset and write it to a CSV file.
# ---------------------------------------------------------------------------

# Single source of truth for the tunable values. The row count used to be
# repeated as a literal 2000 in five places, which made it easy to change one
# and silently break the others.
N_SAMPLES = 2000
SEED = 42
OUTPUT_PATH = Path("../data/churn_data.csv")

# For each numeric feature: (scale, offset) applied after shifting the raw
# column so that its minimum is 0, i.e. new = (raw - raw.min()) * scale + offset.
# The key order also fixes the column order of the feature matrix.
FEATURE_SCALING = {
    'tenure': (20, 0),
    'monthly_charges': (50, 20),
    'support_calls': (2, 0),
    'usage_score': (100, 0),
    'satisfaction': (5, 0),
}

# Feature matrix X and binary target y. The weights request roughly a 73/27
# class split; the realised share can differ slightly (27.35% in the recorded run).
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=len(FEATURE_SCALING),
    n_informative=3,
    n_redundant=0,
    random_state=SEED,
    weights=[0.73, 0.27]  # 27% churn
)

df = pd.DataFrame(X, columns=list(FEATURE_SCALING))

# Rescale every raw feature to a non-negative range.
for feature, (scale, offset) in FEATURE_SCALING.items():
    df[feature] = (df[feature] - df[feature].min()) * scale + offset

# Random categorical attributes, drawn independently of the target.
# The legacy global seed and the order of the three draws are kept on purpose
# so the generated file stays identical to previous runs.
n_rows = len(df)
np.random.seed(SEED)
df['contract_type'] = np.random.choice(['Month-to-month','One year','Two year'], size=n_rows, p=[0.55,0.25,0.20])
df['payment_method'] = np.random.choice(['Credit Card','Bank Transfer','Electronic Check','Mailed Check'], size=n_rows)
df['gender'] = np.random.choice(['Male','Female'], size=n_rows)

# Derived column, zero-padded identifier (CUST00000, CUST00001, ...) and target.
df['total_charges'] = df['tenure'] * df['monthly_charges']
df['customer_id'] = [f"CUST{i:05d}" for i in range(n_rows)]
df['churn'] = y

# Create the target folder first so the export also works on a fresh checkout
# where the data directory does not exist yet.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

print("✅ CSV file created successfully in data folder")


✅ CSV file created successfully in data folder


In [11]:
import pandas as pd

# Load the churn dataset from the CSV file into `df`, then preview its first
# five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="../data/churn_data.csv")
df.head(n=5)


Unnamed: 0,tenure,monthly_charges,support_calls,usage_score,satisfaction,contract_type,payment_method,gender,total_charges,customer_id,churn
0,105.137686,153.649211,6.034194,354.729126,19.656511,Month-to-month,Mailed Check,Male,16154.322395,CUST00000,0
1,67.473489,192.847322,5.43326,395.997511,34.553182,Two year,Electronic Check,Male,13012.0817,CUST00001,0
2,49.210735,331.51174,9.10921,398.100917,35.582162,One year,Electronic Check,Male,16313.93621,CUST00002,1
3,50.28326,219.662759,7.061933,470.260197,14.667901,One year,Electronic Check,Female,11045.359597,CUST00003,0
4,63.984182,237.064556,4.046808,349.870953,36.529053,Month-to-month,Mailed Check,Male,15168.381771,CUST00004,0


In [12]:
# Structural overview of the frame: dimensions, then per-column dtype and
# non-null count, then the number of missing values in every column.
print("Shape:", df.shape)
df.info()
# sep="\n" starts the Series on a fresh line. With print's default separator a
# space followed the label's trailing newline and pushed the first row out of
# alignment with the rest.
print("\nMissing values:", df.isna().sum(), sep="\n")


Shape: (2000, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tenure           2000 non-null   float64
 1   monthly_charges  2000 non-null   float64
 2   support_calls    2000 non-null   float64
 3   usage_score      2000 non-null   float64
 4   satisfaction     2000 non-null   float64
 5   contract_type    2000 non-null   object 
 6   payment_method   2000 non-null   object 
 7   gender           2000 non-null   object 
 8   total_charges    2000 non-null   float64
 9   customer_id      2000 non-null   object 
 10  churn            2000 non-null   int64  
dtypes: float64(6), int64(1), object(4)
memory usage: 172.0+ KB

Missing values:
 tenure             0
monthly_charges    0
support_calls      0
usage_score        0
satisfaction       0
contract_type      0
payment_method     0
gender             0
total_charges      0
customer_id        0
churn              0
dtype: int64

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles are spelled out instead of relying on the default.
df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,tenure,monthly_charges,support_calls,usage_score,satisfaction,total_charges,churn
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,73.84832,221.497327,5.981441,392.083409,31.461132,16348.625668,0.2735
std,19.812365,67.303822,1.826471,103.032736,7.436161,6685.51288,0.445867
min,0.0,20.0,0.0,0.0,0.0,0.0,0.0
25%,60.185376,173.663801,4.892433,322.481162,26.707072,11414.358913,0.0
50%,73.621869,219.719352,5.605531,391.036646,32.550449,15729.448582,0.0
75%,87.674445,279.543712,6.590894,461.061916,36.885367,20549.363976,1.0
max,144.34841,454.736545,12.341896,729.978317,50.284746,49054.090115,1.0


In [14]:
# Class balance of the target: absolute number of rows per churn label ...
churn_counts = df["churn"].value_counts()
print(churn_counts)

# ... and the same breakdown as percentages, rounded to two decimals.
churn_pct = df["churn"].value_counts(normalize=True) * 100
print(churn_pct.round(2))


churn
0    1453
1     547
Name: count, dtype: int64
churn
0    72.65
1    27.35
Name: proportion, dtype: float64


In [15]:
# Columns holding categorical (string) attributes.
categorical_cols = [
    'contract_type',
    'payment_method',
    'gender',
]

# Print the frequency of every category, one column after another, each
# preceded by a blank line and a heading.
for col in categorical_cols:
    print(f"\n{col} distribution:", df[col].value_counts(), sep="\n")



contract_type distribution:
contract_type
Month-to-month    1089
One year           508
Two year           403
Name: count, dtype: int64

payment_method distribution:
payment_method
Credit Card         526
Mailed Check        513
Electronic Check    499
Bank Transfer       462
Name: count, dtype: int64

gender distribution:
gender
Male      1003
Female     997
Name: count, dtype: int64
