In [1]:
# Importing necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read the churn CSV and discard the 'customerID' column in one step
file_path = 'database/TelecomCustomerChurn.csv'
data = pd.read_csv(file_path).drop(columns=['customerID'])

# Preview the first five records of the loaded frame
data.head()

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Manual,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Monthly,Yes,Manual,70.7,151.65,Yes


In [3]:
# Turn the categorical feature columns into numeric ones.
#
# Step 1: integer-encode the binary columns that are not numeric yet.
# The guard tests "not numeric" instead of `dtype == 'object'`: text columns are
# not guaranteed to be stored as `object` (pandas can infer a dedicated string
# dtype), and the old check would then silently leave them un-encoded.
label_encoder = LabelEncoder()
binary_columns = ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']  # Update as needed
for col in binary_columns:
    if not pd.api.types.is_numeric_dtype(data[col]):
        # fit_transform refits the shared encoder on each column in turn
        data[col] = label_encoder.fit_transform(data[col])

# Step 2: one-hot encode the nominal columns. drop_first=True drops one dummy
# per original column.
nominal_columns = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']  # Update as needed
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)


# Display the first few rows of the encoded dataset
data.head()


Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Manual
0,0,0,1,0,1,0,1,29.85,29.85,No,...,False,True,False,False,False,False,False,False,False,True
1,1,0,0,0,34,1,0,56.95,1889.5,No,...,True,False,True,False,False,False,True,False,False,True
2,1,0,0,0,2,1,1,53.85,108.15,Yes,...,True,True,False,False,False,False,False,False,False,True
3,1,0,0,0,45,0,0,42.3,1840.75,No,...,True,False,True,True,False,False,True,False,False,False
4,0,0,0,0,2,1,1,70.7,151.65,Yes,...,False,False,False,False,False,False,False,False,False,True


In [4]:
# Per-column dtypes first, then the overall dimensions of the frame
print(data.dtypes)

rows = data.shape[0]
columns = data.shape[1]

print(f"Number of rows: {rows}", f"Number of columns: {columns}", sep="\n")

Gender                                     int64
SeniorCitizen                              int64
Partner                                    int64
Dependents                                 int64
Tenure                                     int64
PhoneService                               int64
PaperlessBilling                           int64
MonthlyCharges                           float64
TotalCharges                              object
Churn                                     object
MultipleLines_Yes                           bool
InternetService_Fiber optic                 bool
InternetService_No                          bool
OnlineSecurity_Yes                          bool
OnlineBackup_Yes                            bool
DeviceProtection_Yes                        bool
TechSupport_Yes                             bool
StreamingTV_Yes                             bool
StreamingMovies_Yes                         bool
Contract_One year                           bool
Contract_Two year   

In [5]:
# Clean 'TotalCharges': parse it as numbers and impute the unparseable entries.
#
# Parse once; anything that cannot be read as a number becomes NaN.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Show which raw values failed to parse, for inspection
non_numeric_values = total_charges_numeric.isna()
print(data['TotalCharges'][non_numeric_values].unique())

# Fill the NaNs with the column mean and assign the result back to the frame.
# Plain assignment is used on purpose: calling fillna(..., inplace=True) on
# data['TotalCharges'] operates on a selected column object, which emits a
# chained-assignment warning and is not guaranteed to update `data` itself.
data['TotalCharges'] = total_charges_numeric.fillna(total_charges_numeric.mean())


[' ']


In [6]:
from sklearn.preprocessing import StandardScaler

# Features are every column except 'Churn'; the target is 1 where 'Churn' is 'Yes', else 0
X = data.drop(columns='Churn')
y = data['Churn'].eq('Yes').astype('int64')

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keep the column labels: scaling below returns arrays without them
feature_names = X_train.columns

# Fit the scaler on the training rows only, then apply it to both subsets
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# From here on X_train / X_test refer to the scaled versions
X_train, X_test = X_train_scaled, X_test_scaled

# Report the size of each subset
len_X_train, len_X_test = len(X_train), len(X_test)
len_y_train, len_y_test = len(y_train), len(y_test)

subset_sizes = (
    ("X_train", len_X_train),
    ("X_test", len_X_test),
    ("y_train", len_y_train),
    ("y_test", len_y_test),
)
for subset_name, subset_size in subset_sizes:
    print(f"Length of {subset_name}: {subset_size}")

Length of X_train: 5634
Length of X_test: 1409
Length of y_train: 5634
Length of y_test: 1409


In [7]:
# Logistic regression classifier; the iteration cap is raised to 10000 to help convergence
model = LogisticRegression(
    max_iter=10_000,
)


In [8]:
# Fit the classifier on the training subset (the fitted estimator is the cell's output)
model.fit(X=X_train, y=y_train)


In [9]:
# Predict on the held-out rows, then print the per-class report followed by the confusion matrix
y_pred = model.predict(X_test)

test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
for evaluation in (test_report, test_confusion):
    print(evaluation)

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.59      0.63       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.81      1409

[[934 102]
 [153 220]]


In [10]:
# Collect the model's hyperparameters into a dict and print it
current_parameters = model.get_params(deep=True)

print(current_parameters)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [11]:
# Pair each feature name with its entry in the first row of the model's coefficient array
first_row_coefficients = model.coef_[0]
coefficients = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': first_row_coefficients,
    }
)
print(coefficients)

                                  Feature  Coefficient
0                                  Gender    -0.026339
1                           SeniorCitizen     0.064119
2                                 Partner     0.032654
3                              Dependents    -0.079967
4                                  Tenure    -1.265960
5                            PhoneService    -0.088838
6                        PaperlessBilling     0.172778
7                          MonthlyCharges    -0.545398
8                            TotalCharges     0.566891
9                       MultipleLines_Yes     0.168451
10            InternetService_Fiber optic     0.621976
11                     InternetService_No    -0.536756
12                     OnlineSecurity_Yes    -0.168711
13                       OnlineBackup_Yes    -0.049697
14                   DeviceProtection_Yes     0.022417
15                        TechSupport_Yes    -0.133223
16                        StreamingTV_Yes     0.177318
17        

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Self-contained rerun of the pipeline with hyperparameter tuning:
# load -> encode -> clean -> split -> scale -> grid search -> coefficient table.

# Load the dataset and drop the 'customerID' column
file_path = 'database/TelecomCustomerChurn.csv'
data = pd.read_csv(file_path)
data = data.drop('customerID', axis=1)

# Label Encoding for binary categories.
# The guard tests "not numeric" instead of `dtype == 'object'`: text columns are
# not guaranteed to be stored as `object` (pandas can infer a dedicated string
# dtype), and the old check would then silently leave them un-encoded.
label_encoder = LabelEncoder()
binary_columns = ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']  # Update as needed
for col in binary_columns:
    if not pd.api.types.is_numeric_dtype(data[col]):
        data[col] = label_encoder.fit_transform(data[col])

# One-Hot Encoding for nominal data
nominal_columns = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']  # Update as needed
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)

# Parse 'TotalCharges' once; values that are not numbers become NaN
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')
non_numeric_values = total_charges_numeric.isna()
print(data['TotalCharges'][non_numeric_values].unique())

# Fill NaNs with the column mean via plain assignment. Calling
# fillna(..., inplace=True) on data['TotalCharges'] acts on a selected column
# object and is not guaranteed to update `data` itself.
data['TotalCharges'] = total_charges_numeric.fillna(total_charges_numeric.mean())

# Features are every column except 'Churn'; the target is 1 for 'Yes', else 0
X = data.drop('Churn', axis=1)
y = data['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Split the data into training and testing sets.
# The encoded X / y are split here; previously the raw 'Churn' strings were
# split and the encoded target computed above was never used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Store the feature names before scaling (scaling returns arrays without labels)
feature_names = X_train.columns

# Scale the features: fit on the training rows only, apply to both subsets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning using Grid Search
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced'],
    'max_iter': [5000, 10000]  # Increased max_iter values
}

# A fixed random_state keeps the search reproducible across runs
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# GridSearchCV (refit=True by default) has already refit the winning
# configuration on the full training set, so reuse it instead of training again
best_model = grid_search.best_estimator_

# Optionally evaluate the model
# ...

# Extracting the coefficients: one row per feature
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': best_model.coef_[0]})
print(coefficients)


[' ']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the 'coefficients' frame: features on the y-axis, coefficient values on the x-axis
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=coefficients, x='Coefficient', y='Feature', ax=ax)

ax.set_title('Coefficients of Logistic Regression Model')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Features')
plt.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Train a logistic regression with a fixed set of hyperparameters and evaluate
# it on the held-out rows.
best_model = LogisticRegression(C=10, class_weight=None, max_iter=5000, penalty='l2', solver='liblinear')

# Fit on the *scaled* training features. After the tuning cell, X_train / X_test
# are unscaled DataFrames, so the explicit scaled arrays are used here.
best_model.fit(X_train_scaled, y_train)

# Make predictions on the scaled test set (same scaling as used for training)
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model: per-class report, then the confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of 'MonthlyCharges' (x) against 'Churn' (y); points are half-transparent
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='MonthlyCharges', y='Churn', data=data, alpha=0.5, ax=ax)
ax.set_title('Monthly Charges vs. Churn')
ax.set_xlabel('Monthly Charges')
ax.set_ylabel('Churn')
plt.show()


In [None]:
# Box plot of 'MonthlyCharges', one box per 'Churn' value
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Churn', y='MonthlyCharges', ax=ax)
ax.set_title('Monthly Charges vs. Churn')
ax.set_xlabel('Churn')
ax.set_ylabel('Monthly Charges')
plt.show()