In [None]:
# Import libraries and methods/functions
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [10]:
# Load data: read the customer demographics and usage extracts from CSV
demographics_csv = 'data/telecom_demographics.csv'
usage_csv = 'data/telecom_usage.csv'
telco_demog = pd.read_csv(demographics_csv)
telco_usage = pd.read_csv(usage_csv)

In [11]:
# Summarize the demographics table: column names, non-null counts and dtypes
telco_demog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         6500 non-null   int64 
 1   telecom_partner     6500 non-null   object
 2   gender              6500 non-null   object
 3   age                 6500 non-null   int64 
 4   state               6500 non-null   object
 5   city                6500 non-null   object
 6   pincode             6500 non-null   int64 
 7   registration_event  6500 non-null   object
 8   num_dependents      6500 non-null   int64 
 9   estimated_salary    6500 non-null   int64 
dtypes: int64(5), object(5)
memory usage: 507.9+ KB


In [12]:
# Summarize the usage table: column names, non-null counts and dtypes
telco_usage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   customer_id  6500 non-null   int64
 1   calls_made   6500 non-null   int64
 2   sms_sent     6500 non-null   int64
 3   data_used    6500 non-null   int64
 4   churn        6500 non-null   int64
dtypes: int64(5)
memory usage: 254.0 KB


In [13]:
# Join data: inner-join (pandas' default) demographics with usage on customer_id
# NOTE(review): assumes customer_id is unique in both tables, otherwise rows multiply — TODO confirm
churn_df = pd.merge(telco_demog, telco_usage, on='customer_id')
churn_df.head()

Unnamed: 0,customer_id,telecom_partner,gender,age,state,city,pincode,registration_event,num_dependents,estimated_salary,calls_made,sms_sent,data_used,churn
0,15169,Airtel,F,26,Himachal Pradesh,Delhi,667173,2020-03-16,4,85979,75,21,4532,1
1,149207,Airtel,F,74,Uttarakhand,Hyderabad,313997,2022-01-16,0,69445,35,38,723,1
2,148119,Airtel,F,54,Jharkhand,Chennai,549925,2022-01-11,2,75949,70,47,4688,1
3,187288,Reliance Jio,M,29,Bihar,Hyderabad,230636,2022-07-26,3,34272,95,32,10241,1
4,14016,Vodafone,M,45,Nagaland,Bangalore,188036,2020-03-11,4,34157,66,23,5246,1


In [14]:
# Calculate churn rate: share of all rows falling in each value of 'churn'
churn_counts = churn_df['churn'].value_counts()
churn_rate = churn_counts.div(len(churn_df))
churn_rate

churn
0    0.799538
1    0.200462
Name: count, dtype: float64

In [15]:
# Identify categorical variables: list each column's dtype;
# the object-dtype columns are the candidates for encoding
churn_df.dtypes

customer_id            int64
telecom_partner       object
gender                object
age                    int64
state                 object
city                  object
pincode                int64
registration_event    object
num_dependents         int64
estimated_salary       int64
calls_made             int64
sms_sent               int64
data_used              int64
churn                  int64
dtype: object

In [16]:
# One Hot Encoding for categorical variables (the object-dtype columns)
categorical_cols = ['telecom_partner', 'gender', 'state', 'city', 'registration_event']
# NOTE(review): registration_event holds date strings (e.g. 2020-03-16), so it gets
# one indicator column per distinct date — confirm this is intended.
churn_df = pd.get_dummies(churn_df, columns=categorical_cols)

In [17]:
# Target variable: the churn label
target = churn_df['churn']

# Model inputs: drop 'customer_id' (an identifier, not a feature)
# and 'churn' (the target variable)
features = churn_df.drop(columns=['customer_id', 'churn'])

# Feature Scaling: standardize every feature column
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [18]:
# Splitting the dataset: hold out 20% of the rows for testing (fixed seed for reproducibility).
# The *unscaled* features are split so that the test rows take no part in preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply those statistics to the test rows.
# Fitting it on the full dataset would leak test-set means/variances into training.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [19]:
# Logistic Regression: create the classifier with a fixed seed, then train it
logreg = LogisticRegression(random_state=42)
logreg.fit(X=X_train, y=y_train)

In [20]:
# Logistic Regression predictions: predicted class label for each held-out test row
logreg_pred = logreg.predict(X_test)

In [21]:
# Logistic Regression evaluation: confusion matrix
# (rows are true classes, columns are predicted classes)
logreg_cm = confusion_matrix(y_true=y_test, y_pred=logreg_pred)
print(logreg_cm)

[[911 116]
 [243  30]]


In [22]:
# Per-class precision, recall and F1 for the Logistic Regression predictions
logreg_report = classification_report(y_true=y_test, y_pred=logreg_pred)
print(logreg_report)

              precision    recall  f1-score   support

           0       0.79      0.89      0.84      1027
           1       0.21      0.11      0.14       273

    accuracy                           0.72      1300
   macro avg       0.50      0.50      0.49      1300
weighted avg       0.67      0.72      0.69      1300



In [23]:
# Random Forest: create the classifier with a fixed seed, then train it
rf = RandomForestClassifier(random_state=42)
rf.fit(X=X_train, y=y_train)

In [24]:
# Random Forest predictions: predicted class label for each held-out test row
rf_pred = rf.predict(X_test)

In [25]:
# Random Forest evaluation: confusion matrix
# (rows are true classes, columns are predicted classes)
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_pred)
print(rf_cm)

[[1026    1]
 [ 273    0]]


In [26]:
# Per-class precision, recall and F1 for the Random Forest predictions
rf_report = classification_report(y_true=y_test, y_pred=rf_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.39      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300



In [27]:
# Which accuracy score is higher "LogisticRegression" or "RandomForest" ?
# Derive the answer from the fitted models instead of hard-coding it, so it
# stays correct if the data, the split or the models change.
logreg_accuracy = logreg.score(X_test, y_test)  # mean accuracy on the test set
rf_accuracy = rf.score(X_test, y_test)

# On an exact tie the result falls back to "LogisticRegression".
higher_accuracy = "RandomForest" if rf_accuracy > logreg_accuracy else "LogisticRegression"