**MODEL TRAIN**

* **Import Required Packages & Data Frame**

* Import packages

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score

import warnings 
warnings.filterwarnings("ignore")

* Import Data frame

In [36]:
df = pd.read_csv("E:/users/USER/Desktop/DataAnalyticsAndgenAI/ML Project/Telecom/nootbook/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [37]:
# 3. Clean numeric columns
for col in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [38]:
df.drop("customerID",axis=1,inplace=True)

**Define X(Input column) and Y(Target column)**

In [39]:
x = df.drop("Churn",axis=1)

In [40]:
y = df["Churn"]

**Train Test Split**

In [41]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

**Define numerical and categorical Column**

In [42]:
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                        'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies',
                        'Contract', 'PaperlessBilling', 'PaymentMethod']

**Transformation**

In [43]:
num_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)
cat_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("scaler",OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [44]:
preprocessor = ColumnTransformer([
    ("num_pipeline",num_pipeline,numeric_features),
    ("cat_pipeline",cat_pipeline,categorical_features)
]
)

In [46]:
x_train_tra = preprocessor.fit_transform(x_train)
x_test_tra = preprocessor.transform(x_test)


**Model Train**

In [47]:
models = {
    "LogisticRegression" : LogisticRegression(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
    "SVC" : SVC(),
    "KNeighborsClassifier" : KNeighborsClassifier(),
    "AdaBoostClassifier" : AdaBoostClassifier(),
    "GradientBoostingClassifier" : GradientBoostingClassifier(),
    "RandomForestClassifier" : RandomForestClassifier()
}

In [54]:
def model_train(x_train,y_train,x_test,y_test):
    accuracy_scores = []
    best_model = []
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(x_train,y_train)
        pred = model.predict(x_test)
        acc = accuracy_score(y_pred=pred,y_true=y_test)
        print("Model:",list(models.values())[i])
        print("accuracy_score: ",acc)
        print("--------------------------")
    

In [55]:
model_train(x_train_tra,y_train,x_test_tra,y_test)

Model: LogisticRegression()
accuracy_score:  0.7906316536550745
--------------------------
Model: DecisionTreeClassifier()
accuracy_score:  0.7352732434350603
--------------------------
Model: SVC()
accuracy_score:  0.7920511000709723
--------------------------
Model: KNeighborsClassifier()
accuracy_score:  0.752306600425834
--------------------------
Model: AdaBoostClassifier()
accuracy_score:  0.7927608232789212
--------------------------
Model: GradientBoostingClassifier()
accuracy_score:  0.7913413768630234
--------------------------
Model: RandomForestClassifier()
accuracy_score:  0.7821149751596878
--------------------------


In [60]:
final_model = AdaBoostClassifier()
final_model.fit(x_train_tra,y_train)
pred = final_model.predict(x_test_tra)
accuracy_score(y_test,pred)

0.7927608232789212

**Create dump file of preprocessor & model**

In [63]:
import dill
import os

In [71]:
artifacts_dir = os.path.join(os.getcwd(), "artifacts")
os.makedirs(artifacts_dir, exist_ok=True) 


trans_path = os.path.join(artifacts_dir, "preprocessor.pkl")
with open(trans_path, "wb") as f:
    dill.dump(preprocessor, f)

In [72]:
model_dir = os.path.join(os.getcwd(), "artifacts")
os.makedirs(model_dir, exist_ok=True) 


model_path = os.path.join(model_dir, "model.pkl")
with open(model_path, "wb") as f:
    dill.dump(final_model, f)