Importing libraries

In [1]:
import sklearn
import joblib
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 1. Reading data

In [2]:
df = pd.read_csv('/datasets/users_behavior.csv')

# 2. Spliting data
Splitting to train_train, train_validation & test. The train is 75%, tes 25%. At the train, we splited to train 75%, and validation 25%.

In [3]:
df_train, df_test = train_test_split(df, test_size=0.25, random_state=12345)
df_train_train, df_train_valid = train_test_split(df_train, test_size=0.25, random_state=12345)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
calls       3214 non-null float64
minutes     3214 non-null float64
messages    3214 non-null float64
mb_used     3214 non-null float64
is_ultra    3214 non-null int64
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2410 entries, 101 to 482
Data columns (total 5 columns):
calls       2410 non-null float64
minutes     2410 non-null float64
messages    2410 non-null float64
mb_used     2410 non-null float64
is_ultra    2410 non-null int64
dtypes: float64(4), int64(1)
memory usage: 113.0 KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 804 entries, 1415 to 2851
Data columns (total 5 columns):
calls       804 non-null float64
minutes     804 non-null float64
messages    804 non-null float64
mb_used     804 non-null float64
is_ultra    804 non-null int64
dtypes: float64(4), int64(1)
memory usage: 37.7 KB


In [7]:
df_train_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1807 entries, 2649 to 3027
Data columns (total 5 columns):
calls       1807 non-null float64
minutes     1807 non-null float64
messages    1807 non-null float64
mb_used     1807 non-null float64
is_ultra    1807 non-null int64
dtypes: float64(4), int64(1)
memory usage: 84.7 KB


In [8]:
df_train_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603 entries, 430 to 2673
Data columns (total 5 columns):
calls       603 non-null float64
minutes     603 non-null float64
messages    603 non-null float64
mb_used     603 non-null float64
is_ultra    603 non-null int64
dtypes: float64(4), int64(1)
memory usage: 28.3 KB


# 3. Investigating models

Investigating the quality of different models by changing hyperparameters. Briefly describing the findings of the study. Sead 12345.
Our problem is a configuration of category problem, if "is" or "not is" 'Ultra'.

Speed: Decision tree - High; Random forest - Low; Logistic regression - High

In [17]:
last_train_features = df_train.drop(['is_ultra'], axis=1)
last_train_target = df_train['is_ultra']

train_features = df_train_train.drop(['is_ultra'], axis=1)
train_target = df_train_train['is_ultra']

valid_features = df_train_valid.drop(['is_ultra'], axis=1)
valid_target = df_train_valid['is_ultra']

test_features = df_train_valid.drop(['is_ultra'], axis=1)
test_target = df_train_valid['is_ultra']

### Decision Tree Classifier
We will check the depth for 1,2,3,4,5.

In [18]:
for depth in range(1,6):
    dtc_model = DecisionTreeClassifier(max_depth=depth, random_state=12345)
    dtc_model.fit(train_features, train_target)
    dtc_predicted_valid = dtc_model.predict(valid_features)
    print("max_depth =", depth, ": ", end='')
    print(accuracy_score(valid_target, dtc_predicted_valid))

max_depth = 1 : 0.7495854063018242
max_depth = 2 : 0.7761194029850746
max_depth = 3 : 0.7943615257048093
max_depth = 4 : 0.7893864013266998
max_depth = 5 : 0.7877280265339967


Best depth: 3. Accuracy 0.7943615257048093.

### Random Forest Classifier
We will check the estimators for 1-10.

In [19]:
for estimators in range(1,10):
    rfc_model = RandomForestClassifier(n_estimators=estimators, random_state=12345)
    rfc_model.fit(train_features, train_target)
    rfc_predicted_valid = rfc_model.predict(valid_features)
    print("estimators =", estimators, ": ", end='')
    print(accuracy_score(valid_target, rfc_predicted_valid))

estimators = 1 : 0.736318407960199
estimators = 2 : 0.7711442786069652
estimators = 3 : 0.7661691542288557
estimators = 4 : 0.7827529021558872
estimators = 5 : 0.7810945273631841
estimators = 6 : 0.7993366500829188
estimators = 7 : 0.7976782752902156
estimators = 8 : 0.7943615257048093
estimators = 9 : 0.7943615257048093


Best estimator: 6. Accuracy 0.7993366500829188.

###  Logistic Regression Classifier
We will check the max iteration for 50-150, by jumps of 25.

In [20]:
for iterations in range(50,151,25):
    lrc_model = LogisticRegression(max_iter=iterations, random_state=12345)
    lrc_model.fit(train_features, train_target)
    lrc_predicted_valid = lrc_model.predict(valid_features)
    print("iterations =", iterations, ": ", end='')
    #     model.score(features_valid, target_valid)
    print(accuracy_score(valid_target, lrc_predicted_valid))

iterations = 50 : 0.7330016583747927
iterations = 75 : 0.7330016583747927
iterations = 100 : 0.7330016583747927
iterations = 125 : 0.7330016583747927
iterations = 150 : 0.7330016583747927




All iterations had same accuracy of 0.7330016583747927.
We will choose the default of 100.

# 4. Checking the model on the test set
Checking the quality of the model using the test set.

In [13]:
final_dtc_model = DecisionTreeClassifier(max_depth=3, random_state=12345)
final_rfc_model = RandomForestClassifier(n_estimators=6, random_state=12345)
final_lrc_model = LogisticRegression(max_iter=100, random_state=12345)

In [14]:
final_dtc_model.fit(last_train_features,last_train_target)
final_rfc_model.fit(last_train_features,last_train_target)
final_lrc_model.fit(last_train_features,last_train_target)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=12345, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
dtc_predictions = final_dtc_model.predict(test_features)
rfc_predictions = final_rfc_model.predict(test_features)
lrc_predictions = final_lrc_model.predict(test_features)

In [16]:
print("Decision Tree Classifier Accuracy: ", end='')
print(accuracy_score(test_target, dtc_predictions))

print("Random Forest Classifier Accuracy: ", end='')
print(accuracy_score(test_target, rfc_predictions))

print("Logistic Regression Classifier Accuracy: ", end='')
print(accuracy_score(test_target, lrc_predictions))

Decision Tree Classifier Accuracy: 0.802653399668325
Random Forest Classifier Accuracy: 0.9734660033167496
Logistic Regression Classifier Accuracy: 0.6965174129353234


The best classifier is the Random Forest classifier with an accuracy of 0.9734660033167496. The model is with a number of estimators = 6.

# 5. Sanity check of the model

My conclusion is that the Random Forest and the Decision Tree classifiers were underfit, and the Logistic Regression classifier was overfitted.
Yet, the results for the Decision Tree and the Logistic Regression are low. It could be that the categories are not well differed, at least not by a line that can separate both groups. Or either, the users have pretty the same numbers.
Something else it could be that the two bad classifiers do not bring the connection between two features, that maybe the Random Forest classifier can do.