In [2]:
import os
from dotenv import load_dotenv
# Pull variables from a .env file into the process environment
# (python-dotenv's documented behaviour for load_dotenv()).
load_dotenv()

# Student identifier read from the environment; it is echoed below and is
# expected to be a non-empty string by the rest of the notebook.
ML_Summer_School_ID = os.getenv('ML_Summer_School_ID')
if not ML_Summer_School_ID:
    # os.getenv returns None for a missing variable, which would otherwise
    # surface as an opaque TypeError on the string concatenation below.
    # An empty value is rejected too, since it carries no identifier.
    raise RuntimeError(
        "ML_Summer_School_ID is not set. Add it to your .env file or "
        "environment, then re-run this cell."
    )
print("Your Student ID is: " + ML_Summer_School_ID)

Your Sudent ID is: ML017_MyoThet


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import numpy as np


In [5]:
# Number of synthetic rows to generate.
n_samples = 500
# Pin NumPy's global random state so repeated runs draw the same values.
np.random.seed(seed=42)

In [6]:
# Draw every feature column independently from NumPy's global random state.
# The keyword arguments are evaluated top to bottom, so the order of the
# entries below fixes the order of the random draws and therefore the
# values produced under a given seed.
data = dict(
    # Continuous charges, uniform over fixed ranges.
    MonthlyCharges=np.random.uniform(low=20, high=120, size=n_samples),
    TotalCharges=np.random.uniform(low=50, high=5000, size=n_samples),
    # Categorical features sampled with the listed class probabilities.
    Contract=np.random.choice(
        ['Month-to-month', 'One year', 'Two year'],
        size=n_samples,
        p=[0.6, 0.25, 0.15],
    ),
    Dependents=np.random.choice(['Yes', 'No'], size=n_samples, p=[0.3, 0.7]),
    SeniorCitizen=np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2]),
    InternetService=np.random.choice(
        ['DSL', 'Fiber optic', 'No'],
        size=n_samples,
        p=[0.35, 0.45, 0.2],
    ),
    PaymentMethod=np.random.choice(
        [
            'Electronic check',
            'Mailed check',
            'Bank transfer (automatic)',
            'Credit card (automatic)',
        ],
        size=n_samples,
        p=[0.3, 0.25, 0.25, 0.2],
    ),
    # randint excludes the upper bound, so values fall in 1..71.
    Tenure=np.random.randint(low=1, high=72, size=n_samples),
)
df = pd.DataFrame(data)

In [7]:

# Derive the 'Churn' label from the features so the two are correlated.
# Every row starts as 'No'; rows matching any rule below are flipped to 'Yes'.
df['Churn'] = 'No'

high_monthly_charges = df['MonthlyCharges'] > 80
month_to_month = df['Contract'] == 'Month-to-month'
# `&` binds tighter than `|`, so these two tests form a single condition:
# short tenure AND fiber-optic service.
short_tenure_on_fiber = (df['Tenure'] < 12) & (df['InternetService'] == 'Fiber optic')

churn_mask = high_monthly_charges | month_to_month | short_tenure_on_fiber
df.loc[churn_mask, 'Churn'] = 'Yes'

In [8]:
# Rebalance the labels when one class outnumbers the other by more than 2:1.
is_yes = df['Churn'] == 'Yes'
is_no = df['Churn'] == 'No'
churn_yes_count = int(is_yes.sum())
churn_no_count = int(is_no.sum())

if churn_yes_count < churn_no_count / 2:
    # Too few 'Yes' rows: relabel a random 30% of the 'No' rows as 'Yes'.
    no_indices = df.index[is_no.to_numpy()]
    random_no_indices = np.random.choice(
        no_indices, size=int(churn_no_count * 0.3), replace=False
    )
    df.loc[random_no_indices, 'Churn'] = 'Yes'
elif churn_yes_count > churn_no_count * 2:
    # Too many 'Yes' rows: relabel a random 30% of the 'Yes' rows as 'No'.
    yes_indices = df.index[is_yes.to_numpy()]
    random_yes_indices = np.random.choice(
        yes_indices, size=int(churn_yes_count * 0.3), replace=False
    )
    df.loc[random_yes_indices, 'Churn'] = 'No'

# Show the first rows and the final class balance as plain text.
print(
    "Dataset Head:",
    df.head(),
    "\nChurn distribution:",
    df['Churn'].value_counts(),
    sep="\n",
)

Dataset Head:
   MonthlyCharges  TotalCharges        Contract Dependents  SeniorCitizen  \
0       57.454012   3505.900484  Month-to-month         No              0   
1      115.071431   2703.677013  Month-to-month         No              0   
2       93.199394   1582.161701        Two year        Yes              1   
3       79.865848   4078.285348        One year         No              0   
4       35.601864   3439.419304        One year         No              0   

  InternetService              PaymentMethod  Tenure Churn  
0              No  Bank transfer (automatic)      35    No  
1     Fiber optic  Bank transfer (automatic)      51    No  
2     Fiber optic           Electronic check      37   Yes  
3              No  Bank transfer (automatic)      57    No  
4             DSL  Bank transfer (automatic)       8    No  

Churn distribution:
Churn
Yes    269
No     231
Name: count, dtype: int64


In [9]:
# The output file name embeds the student ID read from the environment.
file_name = f"electric_db_by_{ML_Summer_School_ID}.csv"
# index=False leaves the DataFrame's row index out of the written file.
df.to_csv(path_or_buf=file_name, index=False)