In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Loading the dataset
file_path = r'C:\nexford -\capstone\m4 work\teleconnect.csv'
df = pd.read_csv(file_path)

# Drop irrelevant columns
df = df.drop(columns=['customerID'], errors='ignore')

# Handling missing values in 'TotalCharges'
# Converting 'TotalCharges' column to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Removing rows where 'TotalCharges' is missing (NaN)
df = df.dropna(subset=['TotalCharges'])

# Checking for any other missing values in the dataset
print(df.isnull().sum())  

# Encoding categorical variables
df_encoded = pd.get_dummies(df, drop_first=True)

# Scaling numerical features
scaler = StandardScaler()
df_encoded[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(
    df_encoded[['tenure', 'MonthlyCharges', 'TotalCharges']]
)

# Creating a new feature 'TotalSpend' as tenure * MonthlyCharges
df_encoded['TotalSpend'] = df_encoded['tenure'] * df_encoded['MonthlyCharges']

# Saving the cleaned dataset to a valid local directory
cleaned_file_path = r'C:\nexford -\capstone\m3 work\cleaned_telco_data.csv'
df_encoded.to_csv(cleaned_file_path, index=False)

# Displaying the first few rows of the cleaned dataset
print(df_encoded.head())

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
   SeniorCitizen    tenure  MonthlyCharges  TotalCharges  gender_Male  \
0              0 -1.280248       -1.161694     -0.994194        False   
1              0  0.064303       -0.260878     -0.173740         True   
2              0 -1.239504       -0.363923     -0.959649         True   
3              0  0.512486       -0.747850     -0.195248         True   
4              0 -1.239504        0.196178     -0.940457        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False   
1   

In [2]:
# Import necessary libraries
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  

# Splitting the data into features (X) and target variable (y)
X = df_encoded.drop(columns=['Churn_Yes'])  
y = df_encoded['Churn_Yes']  

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a Logistic Regression model on the training data
logreg = LogisticRegression(max_iter=1000) 
logreg.fit(X_train, y_train) 

# Making predictions on the test data
y_pred = logreg.predict(X_test)

# Evaluating model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print classification report 
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 78.82%
Confusion Matrix:
[[916 117]
 [181 193]]
Classification Report:
              precision    recall  f1-score   support

       False       0.84      0.89      0.86      1033
        True       0.62      0.52      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



- 62% of the predicted churns were correct.- 
Only 52% of actual churn cases were identified which shows that some customers who churned were missed
- 
The dataset has more non-churners than churners which may be affecting performance.

In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Splitting the data into features (X) and target (y)
X = df_encoded.drop(columns=['Churn_Yes'])
y = df_encoded['Churn_Yes']

# Applying SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Splitting the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Training Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Making predictions and evaluate
y_pred = logreg.predict(X_test)

# Evaluating model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 79.48%
Confusion Matrix:
[[798 239]
 [185 844]]
Classification Report:
              precision    recall  f1-score   support

       False       0.81      0.77      0.79      1037
        True       0.78      0.82      0.80      1029

    accuracy                           0.79      2066
   macro avg       0.80      0.79      0.79      2066
weighted avg       0.80      0.79      0.79      2066



- Recall for churners (True): Increased from 52% to 82%, meaning the model is missing fewer churners.
- 
Balanced performance: The model now handles both classes more evenly, with a small drop in precision for non-churners but a big gain in identifying churners

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Training a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train) 

# Making predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluating the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy: {accuracy_rf * 100:.2f}%')
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

Accuracy: 83.20%
Confusion Matrix:
[[846 191]
 [156 873]]
Classification Report:
              precision    recall  f1-score   support

       False       0.84      0.82      0.83      1037
        True       0.82      0.85      0.83      1029

    accuracy                           0.83      2066
   macro avg       0.83      0.83      0.83      2066
weighted avg       0.83      0.83      0.83      2066



- Recall for churners (True): 85%, meaning the model captures most churners.
- Balanced precision: Precision for both churners and non-churners is well-balanced, with both classes achieving around 83%
- The Random Forest model had better accuracy and recall than Logistic Regression making it a more accurate and reliable choice for predicting churn.