***Importing the necessary modules***

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import MiniGroup3



**Loading in the data set**

This dataset was randomly collected from an Iranian telecom company's database
over a period of 12 months. It contains 3150 rows, each representing a
customer, with 14 columns. The attributes in this dataset are call failures,
frequency of SMS, number of complaints, number of distinct called numbers,
subscription length, age group, age, charge amount, type of service (tariff
plan), seconds of use, status, frequency of use, Customer Value, and the
Churn label.

In [None]:
# Location of the customer-churn CSV; it is downloaded on every run of this cell
DATA_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_3/datasets/customer-churn.csv'

# Read the CSV into a DataFrame and preview the first rows
df = pd.read_csv(DATA_URL)
df.head()


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


***Checking for data type***


* The dataset contains 3150 non-null entries in every column;
* Only one column ('Customer Value') is non-integer (float).


In [None]:
# Checking data types: print each column's dtype and non-null count
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Call  Failure            3150 non-null   int64  
 1   Complains                3150 non-null   int64  
 2   Subscription  Length     3150 non-null   int64  
 3   Charge  Amount           3150 non-null   int64  
 4   Seconds of Use           3150 non-null   int64  
 5   Frequency of use         3150 non-null   int64  
 6   Frequency of SMS         3150 non-null   int64  
 7   Distinct Called Numbers  3150 non-null   int64  
 8   Age Group                3150 non-null   int64  
 9   Tariff Plan              3150 non-null   int64  
 10  Status                   3150 non-null   int64  
 11  Age                      3150 non-null   int64  
 12  Customer Value           3150 non-null   float64
 13  Churn                    3150 non-null   int64  
dtypes: float64(1), int64(13)

***Cleaning and preparing the data***

1. There are no nulls, so no rows need to be dropped;
2. 'Customer Value' is the only non-integer column, so it is converted from float to integer;
3. The target is 'Churn';
4. Splitting the data into train and test sets with train_test_split;
5. Scaling the features with StandardScaler (fit on the training set, then applied to both sets).



In [None]:
# Cast 'Customer Value' (the only float column) to int so every column is an integer
# NOTE(review): astype(int) drops the fractional part (e.g. 46.035 -> 46) — confirm this loss of precision is intended
df = df.astype({'Customer Value': int})

# Re-check the dtypes to confirm the conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Call  Failure            3150 non-null   int64
 1   Complains                3150 non-null   int64
 2   Subscription  Length     3150 non-null   int64
 3   Charge  Amount           3150 non-null   int64
 4   Seconds of Use           3150 non-null   int64
 5   Frequency of use         3150 non-null   int64
 6   Frequency of SMS         3150 non-null   int64
 7   Distinct Called Numbers  3150 non-null   int64
 8   Age Group                3150 non-null   int64
 9   Tariff Plan              3150 non-null   int64
 10  Status                   3150 non-null   int64
 11  Age                      3150 non-null   int64
 12  Customer Value           3150 non-null   int64
 13  Churn                    3150 non-null   int64
dtypes: int64(14)
memory usage: 344.7 KB


In [None]:
# Target: the 'Churn' column
y = df['Churn']

# Features: every other column (drop returns a new frame, so df itself is left untouched)
X = df.drop(columns='Churn')

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)


In [None]:
# Learn the scaling parameters from the training features only,
# then apply that same transformation to the test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


***Model Training and Evaluation***

1.   Logistic regression
2.   K nearest-neighbors
3.   Random forest


In [None]:
# Fit a logistic regression model on the scaled training features
lr = LogisticRegression(random_state=1)
lr.fit(X_train_scaled, y_train)

# Predict on both splits so train and test accuracy can be compared
lr_train = lr.predict(X_train_scaled)
lr_test = lr.predict(X_test_scaled)

# Score each set of predictions against its true labels
lr_train_acc = accuracy_score(y_train, lr_train)
lr_test_acc = accuracy_score(y_test, lr_test)

# Report both accuracies
print(f'Logistic Regression Train Accuracy: {lr_train_acc}')
print(f'Logistic Regression Test Accuracy: {lr_test_acc}')


Logistic Regression Train Accuracy: 0.8966977138018628
Logistic Regression Test Accuracy: 0.8934010152284264


In [None]:
# Fit a k-nearest-neighbors classifier (k=5) on the scaled training features
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Predict on both splits so train and test accuracy can be compared
knn_train = knn.predict(X_train_scaled)
knn_test = knn.predict(X_test_scaled)

# Score each set of predictions against its true labels
knn_train_acc = accuracy_score(y_train, knn_train)
knn_test_acc = accuracy_score(y_test, knn_test)

# Report both accuracies
print(f'KNN Train Accuracy: {knn_train_acc}')
print(f'KNN Test Accuracy: {knn_test_acc}')

KNN Train Accuracy: 0.9657070279424217
KNN Test Accuracy: 0.949238578680203


In [None]:
# Fit a random forest of 128 trees on the scaled training features (seeded for reproducibility)
rf = RandomForestClassifier(n_estimators=128, random_state=1)
rf.fit(X_train_scaled, y_train)

# Predict on both splits so train and test accuracy can be compared
rf_train = rf.predict(X_train_scaled)
rf_test = rf.predict(X_test_scaled)

# Score each set of predictions against its true labels
rf_train_acc = accuracy_score(y_train, rf_train)
rf_test_acc = accuracy_score(y_test, rf_test)

# Report both accuracies
print(f'Random Forest Train Accuracy: {rf_train_acc}')
print(f'Random Forest Test Accuracy: {rf_test_acc}')

Random Forest Train Accuracy: 0.9949195596951735
Random Forest Test Accuracy: 0.9479695431472082
