In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
pd.set_option('display.max_columns', None)

In [24]:
print("Enter your file choice: 1/2/3")
choice=int(input())
if choice==1:
    dataframe=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
#dataframe
#print(dataframe.head(),5)

Enter your file choice: 1/2/3


In [25]:
def preprocessing(ch,df):
    #replace empty string values with NaN
    df = df.replace(" ", np.nan)
    #replace missing values with columnwise mean for numeric columns
    numeric_cols = df.select_dtypes(include='number')
    df[numeric_cols.columns] = numeric_cols.fillna(numeric_cols.mean())
    #replace missing values with columnwise mode for non-numeric columns
    non_numeric_cols = df.select_dtypes(exclude='number')
    df[non_numeric_cols.columns] = non_numeric_cols.fillna(non_numeric_cols.mode().iloc[0])
    #remove duplicate rows
    df.drop_duplicates(inplace=True)

    if ch==1:
        features=dataframe.drop('Churn',axis=1)
        features.drop('customerID',axis=1,inplace=True)
        target=dataframe['Churn']

    #label encoding target column
    encoder=LabelEncoder()
    target=encoder.fit_transform(target)

    #one hot encoding the categorical columns
    categorical_columns=[col for col in features if features[col].dtype == 'object']
    for col in categorical_columns:
        features=pd.get_dummies(features, columns=[col], drop_first=True)

    #min-max scaling the numeric columns
    scaler=MinMaxScaler()
    scale_columns=features.select_dtypes(exclude=['bool']).columns
    features[scale_columns]=scaler.fit_transform(features[scale_columns])

    features_df=pd.DataFrame(features,columns=features.columns)
    target_df=pd.DataFrame(target,columns=['Churn'])
    correlations=features_df.corrwith(target_df['Churn'])

    return features_df,target_df,correlations

In [26]:
feature_processed, target_processed, correlations=preprocessing(choice,dataframe)
norm=Normalizer()
feature_normalized=norm.fit_transform(feature_processed)
#feature_processed

In [27]:
top_20_correlations=correlations.abs().sort_values(ascending=False).head(20)

feature_processed_df=pd.DataFrame(feature_normalized,columns=feature_processed.columns)
feature_processed_df=feature_processed_df[top_20_correlations.index]


In [28]:
#Logistic regression
X_train, X_test, y_train, y_test = train_test_split(feature_processed_df, target_processed, test_size=0.2, random_state=42)

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Hypothesis function (Logistic regression)
def predict(X, weights, bias):
    return sigmoid(np.dot(X, weights) + bias)

# Cost function (Binary Cross-Entropy)
def compute_cost(X, y, weights, bias):
    m = X.shape[0]  # number of examples
    predictions = predict(X, weights, bias)
    cost = -(1/m) * np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
    return cost

# Gradient descent to update weights and bias
def gradient_descent(X, y, weights, bias, learning_rate, iterations):
    m = X.shape[0]  # number of examples
    for i in range(iterations):
        # Calculate predictions
        predictions = predict(X, weights, bias)
        
        # Compute the gradients
        dw = (1/m) * np.dot(X.T, (predictions - y))
        db = (1/m) * np.sum(predictions - y)
        
        # Update the weights and bias
        weights -= learning_rate * dw
        bias -= learning_rate * db
    
    return weights, bias

# Function to make predictions (classify as 0 or 1)
def classify(X, weights, bias, threshold=0.5):
    probabilities = predict(X, weights, bias)
    return [1 if p >= threshold else 0 for p in probabilities]

# Initialize weights and bias
weights = np.zeros(X_train.shape[1])
bias = 0

# Hyperparameters
learning_rate = 0.1
iterations = 1000

# Train the model
weights, bias = gradient_descent(X_train, np.array(y_train).flatten(), weights, bias, learning_rate, iterations)

# Make predictions on the training set
y_pred = classify(X_test, weights, bias)

# Output accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Logistic Regression classifier: {accuracy:.2f}")



Accuracy of Logistic Regression classifier: 0.78


# Bagging

In [29]:
num_bootstrap_samples=9
bootstrap_samples=[]

X_train_np = X_train.to_numpy()  # Convert X_train to a NumPy array

for i in range(num_bootstrap_samples):
    bootstrap_indices=np.random.choice(range(len(X_train)),len(X_train),replace=True)
    bootstrap_X=X_train.iloc[bootstrap_indices]
    bootstrap_y=y_train.iloc[bootstrap_indices]
    bootstrap_samples.append((bootstrap_X, bootstrap_y))

lr_models=[]
for x,y in bootstrap_samples:
    clf=LogisticRegression()
    clf.fit(x,y)
    lr_models.append(clf)

predictions=[]
for model in lr_models:
    predictions.append(model.predict(X_test))
predictions=np.array(predictions)
aggregated_predictions=np.mean(predictions,axis=0)

# Round the aggregated predictions to the nearest integer
aggregated_predictions = np.round(aggregated_predictions)

bagging_accuracy=accuracy_score(y_test,aggregated_predictions)
print(f"Accuracy of Bagging classifier: {bagging_accuracy:.2f}")

Accuracy of Bagging classifier: 0.81


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Stacking

In [30]:
X_base_train, X_meta_train_split, y_base_train, y_meta_train_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
# Create a new dataset with predictions from base models
meta_X = np.array([model.predict(X_test) for model in lr_models]).T
X_meta_train = np.array([model.predict(X_meta_train_split) for model in lr_models]).T

# Train a meta classifier (another LR model)
meta_model = LogisticRegression()
meta_model.fit(X_meta_train, y_meta_train_split)

# Make predictions using the stacking ensemble
stacking_preds = meta_model.predict(np.array([model.predict(X_test) for model in lr_models]).T)

stacking_acc = accuracy_score(y_test, np.round(stacking_preds))
print(f"Stacking Ensemble Accuracy: {stacking_acc:.2f}")

  y = column_or_1d(y, warn=True)


Stacking Ensemble Accuracy: 0.81
