In [1]:
!pip install scikit-learn
!pip install imbalanced-learn




In [2]:
import numpy as np

from datetime import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 


In [3]:
from imblearn.over_sampling import SMOTE


In [4]:
df = pd.read_csv('C:/Users/MANTHAN/Desktop/compute_tasks/dataset_task1.csv')


In [5]:
df = df.drop(columns=['PaperlessBilling'])
df = df.drop(columns=['customerID'])
df = df.drop(columns=['gender'])
df = df.drop(columns=['Partner']) 
df = df.drop(columns=['Dependents'])
df = df.drop_duplicates()

In [6]:
# Replace with "Automatic"
df['PaymentMethod'] = df['PaymentMethod'].replace(
    ['Bank transfer (automatic)', 'Credit card (automatic)'], 
    'Automatic'
)
df['PaymentMethod'] = df['PaymentMethod'].str.lower().str.replace(' ', '_')


In [7]:
print(df.isnull().sum())

SeniorCitizen       0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [8]:
num_cols = df.select_dtypes(include=['float64','int64']).columns


In [9]:
# Map Yes → 1, No → 0
df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0})


In [10]:
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')
df['MultipleLines'] = df['MultipleLines'].map({'Yes': 1, 'No': 0})


In [11]:
# Mapping categories to numbers (encoding)
df['InternetService'] = df['InternetService'].map({
    'No': 0,
    'DSL': 1,
    'Fiber optic': 2
})

service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                'TechSupport', 'StreamingTV', 'StreamingMovies']

for col in service_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0, 'No internet service': 0})



In [12]:
# Map Contract to numeric
df['Contract'] = df['Contract'].map({
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2
}).astype(int)


In [13]:
class_counts = df['Churn'].value_counts()
print(class_counts)

imbalance_ratio = class_counts.min() / class_counts.max()
print("Imbalance Ratio:", imbalance_ratio)


Churn
No     5135
Yes    1822
Name: count, dtype: int64
Imbalance Ratio: 0.3548198636806232


In [14]:
df['NoInternetNoPhoneFlag'] = ((df['InternetService'] == 0) & (df['PhoneService'] == 0)).astype(int)


median_charges = df['MonthlyCharges'].median()
df['HighChargesFlag'] = (df['MonthlyCharges'] > median_charges).astype(int)


service_cols = ['OnlineSecurity','OnlineBackup','DeviceProtection',
                'TechSupport','StreamingTV','StreamingMovies']

df['NumServices'] = df[service_cols].sum(axis=1)


In [15]:
missing_count = df['PaymentMethod'].isnull().sum()
print("Missing values in PaymentMethod:", missing_count)

Missing values in PaymentMethod: 0


In [16]:
df['PaymentMethod'] = df['PaymentMethod'].map({
    'automatic': 0,
    'mailed_check': 1,
    'electronic_check': 2
}).astype(int)

In [17]:
df["Churn"] = df["Churn"].map({'Yes': 1, 'No': 0})
print(df.head())


   SeniorCitizen  tenure  PhoneService  MultipleLines  InternetService  \
0              0       1             0              0                1   
1              0      34             1              0                1   
2              0       2             1              0                1   
3              0      45             0              0                1   
4              0       2             1              0                2   

   OnlineSecurity  OnlineBackup  DeviceProtection  TechSupport  StreamingTV  \
0               0             1                 0            0            0   
1               1             0                 1            0            0   
2               1             1                 0            0            0   
3               1             0                 1            1            0   
4               0             0                 0            0            0   

   StreamingMovies  Contract  PaymentMethod  MonthlyCharges TotalCharges  \
0   

In [18]:
df = df.drop(columns=['NoInternetNoPhoneFlag'])

In [19]:

# Verify
print(df.dtypes)


SeniorCitizen         int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
HighChargesFlag       int64
NumServices           int64
dtype: object


In [20]:
df['TotalCharges'] = df['TotalCharges'].replace(' ',np.nan)
print(df['TotalCharges'].isna().sum())

df = df.dropna(subset=['TotalCharges'])
print(df['TotalCharges'].isna().sum())



print(df['TotalCharges'].dtypes)

11
0
object


In [21]:
print(df.dtypes)

SeniorCitizen         int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
HighChargesFlag       int64
NumServices           int64
dtype: object


In [22]:
 df = df.astype(float)
 print(df.dtypes)


SeniorCitizen       float64
tenure              float64
PhoneService        float64
MultipleLines       float64
InternetService     float64
OnlineSecurity      float64
OnlineBackup        float64
DeviceProtection    float64
TechSupport         float64
StreamingTV         float64
StreamingMovies     float64
Contract            float64
PaymentMethod       float64
MonthlyCharges      float64
TotalCharges        float64
Churn               float64
HighChargesFlag     float64
NumServices         float64
dtype: object


In [23]:
df.head()


Unnamed: 0,SeniorCitizen,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn,HighChargesFlag,NumServices
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,29.85,29.85,0.0,0.0,1.0
1,0.0,34.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,56.95,1889.5,0.0,0.0,2.0
2,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,53.85,108.15,1.0,0.0,2.0
3,0.0,45.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,42.3,1840.75,0.0,0.0,3.0
4,0.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,70.7,151.65,1.0,1.0,0.0


In [24]:
X=df.drop(columns="Churn",axis=1)
y=df["Churn"]
#print(y.head())

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42)


#print(y_train.head())

In [26]:
df.head()

Unnamed: 0,SeniorCitizen,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn,HighChargesFlag,NumServices
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,29.85,29.85,0.0,0.0,1.0
1,0.0,34.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,56.95,1889.5,0.0,0.0,2.0
2,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,53.85,108.15,1.0,0.0,2.0
3,0.0,45.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,42.3,1840.75,0.0,0.0,3.0
4,0.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,70.7,151.65,1.0,1.0,0.0


In [27]:
print(X_train.dtypes)
X_train.info()



SeniorCitizen       float64
tenure              float64
PhoneService        float64
MultipleLines       float64
InternetService     float64
OnlineSecurity      float64
OnlineBackup        float64
DeviceProtection    float64
TechSupport         float64
StreamingTV         float64
StreamingMovies     float64
Contract            float64
PaymentMethod       float64
MonthlyCharges      float64
TotalCharges        float64
HighChargesFlag     float64
NumServices         float64
dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 5556 entries, 5006 to 1572
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     5556 non-null   float64
 1   tenure            5556 non-null   float64
 2   PhoneService      5556 non-null   float64
 3   MultipleLines     5556 non-null   float64
 4   InternetService   5556 non-null   float64
 5   OnlineSecurity    5556 non-null   float64
 6   OnlineBackup      5556 n

In [28]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)


X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(y_train_resampled.value_counts())

Churn
1.0    4099
0.0    4099
Name: count, dtype: int64


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

scaler = StandardScaler()


X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)  


model1 = LogisticRegression(random_state=42, max_iter=1000)  
model1.fit(X_train_scaled, y_train_resampled)


y_train_pred = model1.predict(X_train_scaled)
y_test_pred = model1.predict(X_test_scaled)

print("Train Accuracy:", accuracy_score(y_train_resampled, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))


print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


print("Classification Report:\n", classification_report(y_test, y_test_pred))


Train Accuracy: 0.7739692607953159
Test Accuracy: 0.737410071942446
Confusion Matrix:
 [[724 301]
 [ 64 301]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.71      0.80      1025
         1.0       0.50      0.82      0.62       365

    accuracy                           0.74      1390
   macro avg       0.71      0.77      0.71      1390
weighted avg       0.81      0.74      0.75      1390



In [30]:
param_grid = {
    'penalty': ['l1', 'l2',None],   # Type of regularization
    'C': [0.01, 0.1, 1, 10, 100],                  # Inverse of regularization strength
    'solver': ['saga'],      # Optimization solvers
    'l1_ratio': [0, 0.5, 1]                        # Only used if penalty='elasticnet'
}

from sklearn.model_selection import GridSearchCV

model2=GridSearchCV(model1,param_grid,cv=5,scoring='accuracy',n_jobs=-1)


model2.fit(X_train_scaled, y_train_resampled)

y_train_pred2 = model2.predict(X_train_scaled)
y_test_pred2 = model2.predict(X_test_scaled)
print("Train Accuracy:", accuracy_score(y_train_resampled, y_train_pred2))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred2))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


print("Classification Report:\n", classification_report(y_test, y_test_pred))





Train Accuracy: 0.7748231275920956
Test Accuracy: 0.737410071942446
Confusion Matrix:
 [[724 301]
 [ 64 301]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.71      0.80      1025
         1.0       0.50      0.82      0.62       365

    accuracy                           0.74      1390
   macro avg       0.71      0.77      0.71      1390
weighted avg       0.81      0.74      0.75      1390



In [31]:

from sklearn.ensemble import RandomForestClassifier

model3 = RandomForestClassifier(n_estimators=200, random_state=42)
model3.fit(X_train_scaled, y_train_resampled)

y_testpred_model3 = model3.predict(X_test_scaled)
y_trainpred_model3 = model3.predict(X_train_scaled)

print("Train Accuracy:", accuracy_score(y_train_resampled, y_trainpred_model3))
print("Test Accuracy:", accuracy_score(y_test, y_testpred_model3))


print("Confusion Matrix:\n", confusion_matrix(y_test, y_testpred_model3))


print("Classification Report:\n", classification_report(y_test, y_testpred_model3))


Train Accuracy: 0.997682361551598
Test Accuracy: 0.7848920863309352
Confusion Matrix:
 [[891 134]
 [165 200]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.87      0.86      1025
         1.0       0.60      0.55      0.57       365

    accuracy                           0.78      1390
   macro avg       0.72      0.71      0.71      1390
weighted avg       0.78      0.78      0.78      1390



In [32]:
param1_grid = [{
   
    'criterion': ['entropy', 'gini'],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_depth': [10, 20, 30],
    'class_weight': ['balanced']
}]

model4=GridSearchCV(model3,param1_grid,cv=5,scoring='accuracy',n_jobs=-1)

model4.fit(X_train_scaled, y_train_resampled)

y_testpred_model4 = model3.predict(X_test_scaled)
y_trainpred_model4 = model3.predict(X_train_scaled)

print("Train Accuracy:", accuracy_score(y_train_resampled, y_trainpred_model4))
print("Test Accuracy:", accuracy_score(y_test, y_testpred_model4))


print("Confusion Matrix:\n", confusion_matrix(y_test, y_testpred_model4))


print("Classification Report:\n", classification_report(y_test, y_testpred_model4))

Train Accuracy: 0.997682361551598
Test Accuracy: 0.7848920863309352
Confusion Matrix:
 [[891 134]
 [165 200]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.87      0.86      1025
         1.0       0.60      0.55      0.57       365

    accuracy                           0.78      1390
   macro avg       0.72      0.71      0.71      1390
weighted avg       0.78      0.78      0.78      1390



In [33]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

base = DecisionTreeClassifier(max_depth=1)

ada = AdaBoostClassifier(
    estimator=base,
    n_estimators=100,   
    learning_rate=1.0, 
    random_state=42
)

ada.fit(X_train_resampled, y_train_resampled)

y_predada_train = ada.predict(X_train_resampled)
y_predada_test = ada.predict(X_test)

train_acc = accuracy_score(y_train_resampled, y_predada_train)
test_acc = accuracy_score(y_test, y_predada_test)


print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(classification_report(y_test,y_predada_test))



Train Accuracy: 0.8542
Test Accuracy: 0.7885
              precision    recall  f1-score   support

         0.0       0.89      0.82      0.85      1025
         1.0       0.58      0.70      0.64       365

    accuracy                           0.79      1390
   macro avg       0.73      0.76      0.74      1390
weighted avg       0.81      0.79      0.79      1390



In [34]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()


lda.fit(X_train_resampled, y_train_resampled)

y_predlda_train = lda.predict(X_train_resampled)
y_predlda_test = lda.predict(X_test)

train_lda = accuracy_score(y_train_resampled, y_predlda_train)
test_lda = accuracy_score(y_test, y_predlda_test)

print(f"Train Accuracy: {train_lda:.4f}")
print(f"Test Accuracy: {test_lda:.4f}")
print(classification_report(y_test,y_predlda_test))

Train Accuracy: 0.7702
Test Accuracy: 0.7381
              precision    recall  f1-score   support

         0.0       0.92      0.71      0.80      1025
         1.0       0.50      0.83      0.62       365

    accuracy                           0.74      1390
   macro avg       0.71      0.77      0.71      1390
weighted avg       0.81      0.74      0.75      1390

