In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r'D:\Data_Scientisce_Trainging\Dataset\WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# Define target column and categories
target = 'Churn'
yes_no_columns = []
cat_columns = []
num_columns = []
columns_to_drop = ['customerID', 'gender', 'TotalCharges']

# Classify columns
for column in df.columns:
    if column == target:
        continue
    elif column in columns_to_drop:
        continue
    elif df[column].nunique() == 2:  # Check if the column has exactly 2 unique values
        yes_no_columns.append(column)
    elif df[column].nunique() <= 4:  # Check if the column has 4 or fewer unique values
        cat_columns.append(column)
    else:
        num_columns.append(column)

# Print the results
print("Yes/No Columns:", yes_no_columns)
print("Categorical Columns:", cat_columns)
print("Numerical Columns:", num_columns)

Yes/No Columns: ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
Categorical Columns: ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
Numerical Columns: ['tenure', 'MonthlyCharges']


In [5]:
def drop_columns(X):
    return X.drop(columns =columns_to_drop)

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [7]:
preprocessing = ColumnTransformer([
    ('scaling', StandardScaler(), num_columns), ## basic scalling and use for number column
    ('oneHot', OneHotEncoder(), cat_columns ), ## categorical as number
    ('yes_no', OrdinalEncoder(), yes_no_columns) ## Yes No: ordinal encoder
])

In [8]:
# Define the training pipeline
Training_Pipeline = Pipeline([
    ('drop_columns', FunctionTransformer(drop_columns)),  # Custom transformer for dropping columns
    ('preprocessing', preprocessing),                    # Preprocessing step
    ('model', LogisticRegression())                      # Logistic Regression model
])

In [9]:
x, y = df.drop(columns=target), df[target]
x_train,x_test, y_train,y_test=train_test_split(x,y)

In [10]:
Training_Pipeline.fit(x_train,y_train)

In [11]:
y_prediction = Training_Pipeline.predict(x_test)

In [12]:
confusion_matrix(y_test, y_prediction)

array([[1155,  148],
       [ 195,  263]], dtype=int64)