In [22]:
# Logistic Regression with sklearn:
# Best suited to simple problems where the classes are (approximately) linearly separable using all predictors jointly, not one at a time.
# Because the target in logistic regression is categorical (e.g. binary 0/1), a "linear relationship" here means a linear relationship between the predictors and the log-odds (the "logit") of the target variable.
# Works well for small to medium-sized datasets.

# For more complex problems, especially those involving non-linear relationships, unstructured data, or large-scale datasets, we use neural network libraries like Keras.

In [23]:
import os
from urllib.request import urlretrieve

url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/ChurnData.csv'
out_file = 'ChurnData.csv'

# Download the dataset only once; later runs reuse the local copy.
if not os.path.exists(out_file):
    # Download to a temporary name first and rename afterwards, so an interrupted
    # download can never leave a truncated file that the existence check would accept.
    tmp_file = out_file + '.part'
    urlretrieve(url, tmp_file)
    os.replace(tmp_file, out_file)

In [24]:
import pandas as pd

# Read the downloaded CSV into a DataFrame and preview its first five rows.
# Learn about the dataset in the main file.
df = pd.read_csv(out_file)
df.head(5)

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.4,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.24,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.3,...,0.0,0.0,0.0,1.0,0.0,1.841,3.24,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.8,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.1,...,0.0,0.0,1.0,1.0,0.0,1.96,3.091,4.382,3.0,0.0


In [25]:
# Features: every column except the target ('churn', the last column).
X = df.drop(columns=['churn'])
# Target as a plain NumPy array.
y = df['churn'].to_numpy()

# Standardizing X is generally recommended for logistic regression, and it is also
# better to standardize for cross-validation. A decision tree does not need it, but
# it does not change the tree's results either, so we do it anyway.
# Cases where standardizing may be unnecessary:
#   - all features are already on the same scale (e.g. all binary variables)
#   - interpretability in the original units is crucial for the use case
# NOTE: the scaler below is fitted on ALL rows, before any train/test split, so the
# mean/std it learns include rows that are later used for evaluation. For an unbiased
# evaluation, fit the scaler on the training portion only.
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)  # shorthand for StandardScaler().fit(X).transform(X)


In [26]:
# Cross-validation to see whether we should use logistic regression or a decision tree (or random forest).
# Check decisionTree_mine for cases where we have categorical variables.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The scaler is part of the pipeline, so it is re-fitted on the training folds of every
# split and the held-out fold never influences the preprocessing (no data leakage).
# This holds even if X was already standardized on the full dataset: standardization is
# an affine per-feature transform, so re-standardizing on the training folds gives the
# same features as scaling the raw data with training-fold statistics.
log_scores = cross_val_score(make_pipeline(StandardScaler(), LogisticRegression()), X, y, cv=5)

# Trees split on per-feature thresholds, so they do not need scaling.
# A fixed random_state makes the tree (and therefore its score) reproducible across runs.
tree_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=5)

print("Logistic Regression average score:", log_scores.mean())
print("Decision Tree average score:", tree_scores.mean())

# If the decision tree is better by > 0.05 on average score, go with the decision tree or random forest;
# otherwise go with logistic regression.

# If logistic regression performs similarly to or better than the decision tree, it suggests the relationships are mostly linear.
# If the decision tree performs notably better, you likely have non-linear relationships.

Logistic Regression average score: 0.735
Decision Tree average score: 0.61


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and apply those statistics to the test rows,
# so nothing about the test set leaks into the features the model is trained on.
# This is safe even if X was already standardized on the full dataset: standardization
# is affine per feature, so this re-scaling is in effect the same as scaling the raw
# features with training-set mean/std.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model_logReg = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
# Smaller C means stronger regularization (helps prevent overfitting)
# liblinear: the algorithm used to optimize the model

# Hard class predictions, and the predicted probability of the positive class (column 1).
y_pred = model_logReg.predict(X_test)
y_pred_prob = model_logReg.predict_proba(X_test)[:,1]

In [28]:
# A few complementary ways to evaluate the model on the held-out test set.
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, jaccard_score

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)   # for plotting it check with Claude or main file
conf_mat = confusion_matrix(y_test, y_pred)    # for plotting it check with Claude or main file
jaccard = jaccard_score(y_test, y_pred)

# One value per line, in the order: accuracy, ROC AUC, confusion matrix, Jaccard score.
print(accuracy, roc_auc, conf_mat, jaccard, sep='\n')


0.775
0.8530465949820789
[[25  6]
 [ 3  6]]
0.4
