In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [14]:
telecom_demo = pd.read_csv("telecom_demographics.csv")
telecom_usage = pd.read_csv("telecom_usage.csv")

print(telecom_demo.info())
print(telecom_usage.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         6500 non-null   int64 
 1   telecom_partner     6500 non-null   object
 2   gender              6500 non-null   object
 3   age                 6500 non-null   int64 
 4   state               6500 non-null   object
 5   city                6500 non-null   object
 6   pincode             6500 non-null   int64 
 7   registration_event  6500 non-null   object
 8   num_dependents      6500 non-null   int64 
 9   estimated_salary    6500 non-null   int64 
dtypes: int64(5), object(5)
memory usage: 507.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   customer_id  6500 non-null   int64
 1   calls_made   6500 non-null   

In [30]:
telecom_demo.head()

Unnamed: 0,customer_id,telecom_partner,gender,age,state,city,pincode,registration_event,num_dependents,estimated_salary
0,15169,Airtel,F,26,Himachal Pradesh,Delhi,667173,2020-03-16,4,85979
1,149207,Airtel,F,74,Uttarakhand,Hyderabad,313997,2022-01-16,0,69445
2,148119,Airtel,F,54,Jharkhand,Chennai,549925,2022-01-11,2,75949
3,187288,Reliance Jio,M,29,Bihar,Hyderabad,230636,2022-07-26,3,34272
4,14016,Vodafone,M,45,Nagaland,Bangalore,188036,2020-03-11,4,34157


In [31]:
telecom_usage.head()

Unnamed: 0,customer_id,calls_made,sms_sent,data_used,churn
0,15169,75,21,4532,1
1,149207,35,38,723,1
2,148119,70,47,4688,1
3,187288,95,32,10241,1
4,14016,66,23,5246,1


#### Merge the datasets on customer_id

In [15]:
churn_df = pd.merge(telecom_demo, telecom_usage, on='customer_id')

#### churn rate

In [16]:
churn_rate = churn_df['churn'].mean()
print(f"Churn Rate: {churn_rate:.2%}")

Churn Rate: 20.05%


In [19]:
categorical_vars = churn_df.select_dtypes(include=['object']).columns
print("Categorical Variables:", categorical_vars)

Categorical Variables: Index(['telecom_partner', 'gender', 'state', 'city', 'registration_event'], dtype='object')


In [20]:
churn_df = pd.get_dummies(churn_df, columns=categorical_vars, drop_first=True)

In [21]:
X = churn_df.drop(columns=['churn', 'customer_id'])
y = churn_df['churn']

#### Ensure all data is numeric before scaling

In [22]:
assert X.select_dtypes(include=['object']).empty, "There are still non-numeric columns in the dataset."

#### Feature Scaling

In [23]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Splitting the data

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### Train Logistic Regression model

In [25]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

### Train Random Forest Classifier model

In [26]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

### Assess models on test data

In [27]:
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

## Determine which model has higher accuracy

In [28]:
if logreg_accuracy > rf_accuracy:
    higher_accuracy = "LogisticRegression"
else:
    higher_accuracy = "RandomForest"

In [29]:
print(f"Logistic Regression Accuracy: {logreg_accuracy:.2%}")
print(f"Random Forest Accuracy: {rf_accuracy:.2%}")
print(f"Higher Accuracy Model: {higher_accuracy}")

Logistic Regression Accuracy: 72.46%
Random Forest Accuracy: 79.08%
Higher Accuracy Model: RandomForest
