# Summary
Notebook of article about clogistic: logistic regression with applied constraints on the coefficients

# Install

In [1]:
# install clogistic
!pip install clogistic

Collecting clogistic
  Downloading clogistic-0.1.0-py3-none-any.whl (11 kB)
Installing collected packages: clogistic
Successfully installed clogistic-0.1.0


# Dataset

In [2]:
import pandas as pd

# load the original/raw data
df = pd.read_csv('/content/drive/MyDrive/project/clogistic/telco_churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


`TotalCharges` is still in `object`. Need to preprocess so that we can convert it to `float`.

In [4]:
# exclude rows with TotalCharges column contains white space
df = df.loc[~df['TotalCharges'].str.contains(' ')]

# transform TotalCharges col to float
df['TotalCharges'] = df['TotalCharges'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [5]:
# separate columns by their type
categoricals = df.select_dtypes(include='object').columns.tolist()
numericals = df.select_dtypes(exclude='object').columns.tolist()

categoricals

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [6]:
# inspect the values of categorical columns
for col in categoricals:
  print('-----'*10)
  print(f'Unique values of column {col}:')
  print(df[col].value_counts())
  print('\n')

--------------------------------------------------
Unique values of column customerID:
3160-TYXLT    1
4468-KAZHE    1
6121-TNHBO    1
6599-GZWCM    1
1213-NGCUN    1
             ..
3810-DVDQQ    1
1699-UOTXU    1
2453-SAFNS    1
0567-GGCAC    1
3115-JPJDD    1
Name: customerID, Length: 7032, dtype: int64


--------------------------------------------------
Unique values of column gender:
Male      3549
Female    3483
Name: gender, dtype: int64


--------------------------------------------------
Unique values of column Partner:
No     3639
Yes    3393
Name: Partner, dtype: int64


--------------------------------------------------
Unique values of column Dependents:
No     4933
Yes    2099
Name: Dependents, dtype: int64


--------------------------------------------------
Unique values of column PhoneService:
Yes    6352
No      680
Name: PhoneService, dtype: int64


--------------------------------------------------
Unique values of column MultipleLines:
No                  3385
Yes

In [7]:
# drop columns
dropped_cols = ['customerID',
                'MultipleLines',
                'InternetService',
                'OnlineSecurity',
                'OnlineBackup',
                'DeviceProtection',
                'TechSupport',
                'StreamingTV',
                'StreamingMovies',
                'Contract',
                'PaymentMethod',
                'TotalCharges' #multicollinear with 'tenure'
                ]

df = df.drop(columns=dropped_cols)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,Churn
0,Female,0,Yes,No,1,No,Yes,29.85,No
1,Male,0,No,No,34,Yes,No,56.95,No
2,Male,0,No,No,2,Yes,Yes,53.85,Yes
3,Male,0,No,No,45,No,No,42.3,No
4,Female,0,No,No,2,Yes,Yes,70.7,Yes


In [8]:
# label-encode categorical columns
binary_categoricals = ['gender','Partner','Dependents','PhoneService','PaperlessBilling','Churn']

for col in binary_categoricals:
  if col == 'gender':
    df[col] = df[col].apply(lambda x: 1 if x == 'Male' else 0)
  else :
    df[col] = df[col].apply(lambda x: 1 if x == 'Yes' else 0)

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,Churn
0,0,0,1,0,1,0,1,29.85,0
1,1,0,0,0,34,1,0,56.95,0
2,1,0,0,0,2,1,1,53.85,1
3,1,0,0,0,45,0,0,42.3,0
4,0,0,0,0,2,1,1,70.7,1


# Modelling

In [9]:
# split data
from sklearn.model_selection import train_test_split

X = df.drop(columns='Churn').to_numpy()
y = df[['Churn']].to_numpy()
y = y.reshape(len(y),) # sklearn's y shape requirement

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [10]:
# train standard logistic regression
from sklearn.linear_model import LogisticRegression as skLogisticRegression

sk_logreg = skLogisticRegression(penalty='none',random_state=42)
sk_logreg.fit(X_train, y_train)

LogisticRegression(penalty='none', random_state=42)

In [11]:
# coefficients as dataframe
sk_coef = pd.DataFrame({
    'feature': df.drop(columns='Churn').columns.tolist() + ['intercept'],
    'coefficient': list(sk_logreg.coef_[0]) + [sk_logreg.intercept_[0]]
})

sk_coef

Unnamed: 0,feature,coefficient
0,gender,0.015848
1,SeniorCitizen,0.485217
2,Partner,0.156218
3,Dependents,-0.434155
4,tenure,-0.056694
5,PhoneService,-0.797005
6,PaperlessBilling,0.400013
7,MonthlyCharges,0.033082
8,intercept,-1.370021


Introduce constraints:
* non-positive coeffs (believed can't lead to churn): `Partner`, `Dependents`, `tenure`, `PhoneService`



In [12]:
# define constraints as dataframe
import numpy as np
constraint_df = pd.DataFrame(data=[
                                   ['gender',-np.inf,np.inf],
                                   ['SeniorCitizen',-np.inf,np.inf],
                                   ['Partner',-np.inf, 0],
                                   ['Dependents',-np.inf,0],
                                   ['tenure',-np.inf,0],
                                   ['PhoneService',-np.inf,0],
                                   ['PaperlessBilling',-np.inf,np.inf],
                                   ['MonthlyCharges',-np.inf,np.inf],
                                   ['intercept',-np.inf,np.inf]],
                             columns=['feature','lower_bound','upper_bound'])

constraint_df

Unnamed: 0,feature,lower_bound,upper_bound
0,gender,-inf,inf
1,SeniorCitizen,-inf,inf
2,Partner,-inf,0.0
3,Dependents,-inf,0.0
4,tenure,-inf,0.0
5,PhoneService,-inf,0.0
6,PaperlessBilling,-inf,inf
7,MonthlyCharges,-inf,inf
8,intercept,-inf,inf


In [13]:
# train using clogistic
from scipy.optimize import Bounds
from clogistic import LogisticRegression as clLogisticRegression

lower_bounds = constraint_df['lower_bound'].to_numpy()
upper_bounds = constraint_df['upper_bound'].to_numpy()
bounds = Bounds(lower_bounds, upper_bounds)

cl_logreg = clLogisticRegression(penalty='none')
cl_logreg.fit(X_train, y_train, bounds=bounds)

LogisticRegression(penalty='none')

In [14]:
# coefficients as dataframe
cl_coef = pd.DataFrame({
    'feature': df.drop(columns='Churn').columns.tolist() + ['intercept'],
    'coefficient': list(cl_logreg.coef_[0]) + [cl_logreg.intercept_[0]]
})

cl_coef

Unnamed: 0,feature,coefficient
0,gender,0.01841682
1,SeniorCitizen,0.5066916
2,Partner,3.856028e-09
3,Dependents,-0.3572103
4,tenure,-0.05572105
5,PhoneService,-0.7962332
6,PaperlessBilling,0.3988242
7,MonthlyCharges,0.033197
8,intercept,-1.360859


More constraints: 
1. `dependents` should be -0.2 - 0
2. `PhoneService` should be -0.5 - 0


In [15]:
# revise constraint dataframe
import numpy as np
constraint_df_rev = pd.DataFrame(data=[
                                   ['gender',-np.inf,np.inf],
                                   ['SeniorCitizen',-np.inf,np.inf],
                                   ['Partner',-np.inf, 0],
                                   ['Dependents',-0.2,0],
                                   ['tenure',-np.inf,0],
                                   ['PhoneService',-0.5,0],
                                   ['PaperlessBilling',-np.inf,np.inf],
                                   ['MonthlyCharges',-np.inf,np.inf],
                                   ['intercept',-np.inf,np.inf]],
                             columns=['feature','lower_bound','upper_bound'])

constraint_df_rev

Unnamed: 0,feature,lower_bound,upper_bound
0,gender,-inf,inf
1,SeniorCitizen,-inf,inf
2,Partner,-inf,0.0
3,Dependents,-0.2,0.0
4,tenure,-inf,0.0
5,PhoneService,-0.5,0.0
6,PaperlessBilling,-inf,inf
7,MonthlyCharges,-inf,inf
8,intercept,-inf,inf


In [16]:
# train using clogistic
from scipy.optimize import Bounds
from clogistic import LogisticRegression as clLogisticRegression

lower_bounds = constraint_df_rev['lower_bound'].to_numpy()
upper_bounds = constraint_df_rev['upper_bound'].to_numpy()
bounds = Bounds(lower_bounds, upper_bounds)

cl_logreg_rev = clLogisticRegression(penalty='none')
cl_logreg_rev.fit(X_train, y_train, bounds=bounds)

LogisticRegression(penalty='none')

In [17]:
# coefficient dataframe
cl_coef_rev = pd.DataFrame({
    'feature':['intercept'] + df.drop(columns='Churn').columns.tolist(),
    'coefficient':[cl_logreg_rev.intercept_[0]] + list(cl_logreg_rev.coef_[0])
})

cl_coef_rev

Unnamed: 0,feature,coefficient
0,intercept,-1.582653
1,gender,0.01566601
2,SeniorCitizen,0.5464141
3,Partner,3.611137e-09
4,Dependents,-0.2
5,tenure,-0.05548038
6,PhoneService,-0.5
7,PaperlessBilling,0.4172085
8,MonthlyCharges,0.03173833


# Remark

Acknowledge the side effect: decreasing F1 score as constraints to be added

In [18]:
# performance on train data
from sklearn.metrics import f1_score

y_sk_pred = sk_logreg.predict(X_train)
y_cl_pred = cl_logreg.predict(X_train)
y_cl_pred_rev = cl_logreg_rev.predict(X_train)

print(f'F1 score on train set for sk_logreg model is {f1_score(y_train, y_sk_pred):.4f}')
print(f'F1 score on train set for cl_logreg model is {f1_score(y_train, y_cl_pred):.4f}')
print(f'F1 score on train set for cl_logreg_rev model is {f1_score(y_train, y_cl_pred_rev):.4f}')

F1 score on train set for sk_logreg model is 0.5593
F1 score on train set for cl_logreg model is 0.5575
F1 score on train set for cl_logreg_rev model is 0.5513


In [19]:
# performance on test data
from sklearn.metrics import f1_score

y_sk_pred = sk_logreg.predict(X_test)
y_cl_pred = cl_logreg.predict(X_test)
y_cl_pred_rev = cl_logreg_rev.predict(X_test)

print(f'F1 score on test set for sk_logreg model is {f1_score(y_test, y_sk_pred):.4f}')
print(f'F1 score on test set for cl_logreg model is {f1_score(y_test, y_cl_pred):.4f}')
print(f'F1 score on test set for cl_logreg_rev model is {f1_score(y_test, y_cl_pred_rev):.4f}')

F1 score on test set for sk_logreg model is 0.5314
F1 score on test set for cl_logreg model is 0.5385
F1 score on test set for cl_logreg_rev model is 0.5281
