<a href="https://colab.research.google.com/github/naghwalid/sales_predictions/blob/main/Project_2_Part_4_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset:

df = pd.read_csv('/content/drive/MyDrive/Dataset/telecom_customer_churn.csv')
df.head(7)

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability
5,0013-MHZWF,Female,23,No,3,Midpines,95345,37.581496,-119.972762,0,...,Credit Card,69.4,571.45,0.0,0,150.93,722.38,Stayed,,
6,0013-SMEOE,Female,67,Yes,0,Lompoc,93437,34.757477,-120.550507,1,...,Bank Withdrawal,109.7,7904.25,0.0,0,707.16,8611.41,Stayed,,


In [4]:

# check for duplicates and missing values
print('missing data:\n', df.isna().sum())
print('\n')
print('duplicates:', df.duplicated().sum())

missing data:
 Customer ID                             0
Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Number of Referrals                     0
Tenure in Months                        0
Offer                                   0
Phone Service                           0
Avg Monthly Long Distance Charges     682
Multiple Lines                        682
Internet Service                        0
Internet Type                        1526
Avg Monthly GB Download              1526
Online Security                      1526
Online Backup                        1526
Device Protection Plan               1526
Premium Tech Support                 1526
Streaming TV                         1526
Streaming Movies   

In [5]:
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
for col in numeric_cols:
    df[col].fillna(df[col].mean(), inplace=True)


In [6]:
# Impute missing values in categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col].fillna(df[col].value_counts().index[0], inplace=True)

# Check for any remaining missing values
missing_values = df.isnull().sum()
print(missing_values)

Customer ID                          0
Gender                               0
Age                                  0
Married                              0
Number of Dependents                 0
City                                 0
Zip Code                             0
Latitude                             0
Longitude                            0
Number of Referrals                  0
Tenure in Months                     0
Offer                                0
Phone Service                        0
Avg Monthly Long Distance Charges    0
Multiple Lines                       0
Internet Service                     0
Internet Type                        0
Avg Monthly GB Download              0
Online Security                      0
Online Backup                        0
Device Protection Plan               0
Premium Tech Support                 0
Streaming TV                         0
Streaming Movies                     0
Streaming Music                      0
Unlimited Data           

In [7]:
# Select categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Perform one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Select categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Perform label encoding
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [9]:
# Split the dataset into features and target variable
X = df.drop(columns=[ 'Customer ID','Churn Category'])
y = df['Churn Category']

In [10]:
#Split the data into training and testing sets:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
#Then, you can apply feature scaling to normalize the numerical features:
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [12]:
# Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)  # Keep components that explain 95% of the variance
X_pca = pca.fit_transform(X)

# Check the dimensions of the transformed dataset
print(X_pca.shape)

(7043, 2)


In [13]:
# Fit and evaluate multiple models
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier()]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']
best_model = None
best_score = 0

In [14]:
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Model:", name)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("---------------------------------------")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Model: Logistic Regression
Accuracy: 0.8537970191625266
Precision: 0.7289693499308157
Recall: 0.8537970191625266
F1-Score: 0.7864608070846244
---------------------------------------
Model: Decision Tree
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0
---------------------------------------
Model: Random Forest
Accuracy: 0.9737402413058907
Precision: 0.9815590612670178
Recall: 0.9737402413058907
F1-Score: 0.9731830227848851
---------------------------------------


In [15]:
    if f1 > best_score:
        best_score = f1
        best_model = model

# Hyperparameter tuning using GridSearchCV
param_grid = {'max_depth': [3, 5, 7],
              'min_samples_split': [2, 5, 10]}

grid_search = GridSearchCV(best_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': 7, 'min_samples_split': 10}
Best Score: 0.9629013981065435


In [16]:
# Identify the best tuned version of each model type
best_tuned_model = grid_search.best_estimator_

In [17]:
# Calculate metrics for the best tuned model
y_pred_best_tuned = best_tuned_model.predict(X_test)
accuracy_best_tuned = accuracy_score(y_test, y_pred_best_tuned)
precision_best_tuned = precision_score(y_test, y_pred_best_tuned, average='weighted')
recall_best_tuned = recall_score(y_test, y_pred_best_tuned, average='weighted')
f1_best_tuned = f1_score(y_test, y_pred_best_tuned, average='weighted')

In [18]:
print("Best Tuned Model Metrics:")
print("Accuracy:", accuracy_best_tuned)
print("Precision:", precision_best_tuned)
print("Recall:", recall_best_tuned)
print("F1-Score:", f1_best_tuned)

Best Tuned Model Metrics:
Accuracy: 0.950319375443577
Precision: 0.964304359384695
Recall: 0.950319375443577
F1-Score: 0.9457748811364833
