# Рекомендация тарифов

В вашем распоряжении данные о поведении клиентов, которые уже перешли на эти тарифы (из проекта курса «Статистический анализ данных»). Нужно построить модель для задачи классификации, которая выберет подходящий тариф. Предобработка данных не понадобится — вы её уже сделали.

Постройте модель с максимально большим значением *accuracy*. Чтобы сдать проект успешно, нужно довести долю правильных ответов по крайней мере до 0.75. Проверьте *accuracy* на тестовой выборке самостоятельно.

## Откройте и изучите файл

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix

In [2]:
df = pd.read_csv('/datasets/users_behavior.csv')

In [3]:
df.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


<b>Поправила тип данных у <code>calls</code> и <code>messages</code>

In [5]:
df.calls = df.calls.astype('int') 
df.messages = df.messages.astype('int') 

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   int64  
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   int64  
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(2), int64(3)
memory usage: 125.7 KB


## Разбейте данные на выборки

In [7]:
df_train, df_valid = train_test_split(df, test_size=0.20, random_state=12345) 

In [8]:
df_features_train = df_train.drop(['is_ultra'], axis=1)
df_target_train = df_train['is_ultra']

In [9]:
features_valid = df_valid.drop(['is_ultra'], axis=1)
target_valid = df_valid['is_ultra']

In [10]:
features_train, features_test,target_train, target_test  = train_test_split(df_features_train, df_target_train, 
                                                                            test_size = 0.25, random_state=12345, 
                                                                            stratify=df_target_train)

In [11]:
print(f"Количество строк в target_train по классам: {np.bincount(target_train)}")
print(f"Количество строк в target_test по классам: {np.bincount(target_test)}")

Количество строк в target_train по классам: [1336  592]
Количество строк в target_test по классам: [446 197]


In [12]:
print('Размер датасета',df.shape,'\n', 'Признаки тренеровочной выборки',features_train.shape,'\n','Таргет тренировочной выборки', target_train.shape,'\n','Признаки валид. выборки', features_valid.shape,'\n','Таргет валид. выборки', target_valid.shape,'\n','Признаки тестовой', features_test.shape,'\n','Таргет тестовой', features_test.shape)

Размер датасета (3214, 5) 
 Признаки тренеровочной выборки (1928, 4) 
 Таргет тренировочной выборки (1928,) 
 Признаки валид. выборки (643, 4) 
 Таргет валид. выборки (643,) 
 Признаки тестовой (643, 4) 
 Таргет тестовой (643, 4)


<b>Вывод: Сперва разделила выборку на тренировочную и валидационную. Потом уже от тренировочной выделила тестовую.

## Исследуйте модели

### RandomForestClassifier

In [13]:
best_model = None
best_result = 0
best_est = 0
best_depth = 0

for est in range(1, 11):
    for depth in range (1, 50):
        model = RandomForestClassifier(random_state=12345, n_estimators=est, max_depth=depth)
        model.fit(features_train, target_train)
        result = model.score(features_valid, target_valid)
        if result > best_result:
            best_model = model
            best_result = result
            best_est = est
            best_depth = depth

print("Accuracy наилучшей модели на валидационной выборке:", best_result.round(3), '\n',"Количество деревьев:", best_est, '\n',"Максимальная глубина:", depth)

Accuracy наилучшей модели на валидационной выборке: 0.802 
 Количество деревьев: 9 
 Максимальная глубина: 49


In [14]:
X = df.drop(['is_ultra'], axis=1)
Y = df['is_ultra']

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state=12345)

In [16]:
model_rf = RandomForestClassifier()

In [17]:
params_rf = {"n_estimators" : [100, 200, 500], 
          "criterion" : ['gini', 'entropy', 'log_loss'], 
          'max_features': ['auto', 'sqrt', 'log2'],
          'max_depth' : [4,5,6,7,8]
         }

In [18]:
gs_rf = GridSearchCV(estimator = model_rf,
                  param_grid = params_rf,
                  scoring = ['accuracy'],
                  refit = 'accuracy',
                  cv = 3,
                  verbose = 4)

In [19]:
%%time
gs_rf.fit(X_train, Y_train)

Fitting 3 folds for each of 135 candidates, totalling 405 fits
[CV 1/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; accuracy: (test=0.784) total time=   0.2s
[CV 2/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; accuracy: (test=0.816) total time=   0.2s
[CV 3/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; accuracy: (test=0.807) total time=   0.2s
[CV 1/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; accuracy: (test=0.787) total time=   0.4s
[CV 2/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; accuracy: (test=0.812) total time=   0.4s
[CV 3/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; accuracy: (test=0.806) total time=   0.4s
[CV 1/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=500; accuracy: (test=0.781) total time=   0.9s
[CV 2/3] END criterion=gini, max_depth=4, max_features=auto, n_estimators=500; accuracy: (

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 500]},
             refit='accuracy', scoring=['accuracy'], verbose=4)

In [20]:
best_est_rf = gs_rf.best_estimator_
best_score_rf = gs_rf.best_score_

In [21]:
print(best_est_rf)
print(best_score_rf)

RandomForestClassifier(criterion='entropy', max_depth=8)
0.8182623619139667


### Logistic Regression

In [22]:
best_model = None
best_result = 0
best_iteration = 0

for iteration in range(100, 1000, 2500):
    model = LogisticRegression(random_state=12345, max_iter=iteration)
    model.fit(features_train, target_train)
    result = model.score(features_valid, target_valid)
    if result > best_result:
        best_model = model
        best_result = result
        best_iteration = iteration

print("Accuracy наилучшей модели на валидационной выборке:", best_result.round(3), '\n', "Количество итераций:", best_iteration)

Accuracy наилучшей модели на валидационной выборке: 0.7 
 Количество итераций: 100


<div class="alert alert-info">
GridSearch для Logistic Regression

In [23]:
model_log = LogisticRegression()

In [24]:
params_log = {"C":np.logspace(-3,3,7), 
          "penalty":["l1","l2"],
          "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
          "max_iter": [100, 1000]
         }

In [25]:
gs_log = GridSearchCV(estimator = model_log,
                  param_grid = params_log,
                  scoring = ['accuracy'],
                  refit = 'accuracy',
                  cv = 5,
                  verbose = 4)

In [26]:
%%time
gs_log.fit(X_train, Y_train)

Fitting 5 folds for each of 140 candidates, totalling 700 fits
[CV 1/5] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; accuracy: (test=nan) total time=   0.0s
[CV 2/5] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; accuracy: (test=nan) total time=   0.0s
[CV 3/5] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; accuracy: (test=nan) total time=   0.0s
[CV 4/5] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; accuracy: (test=nan) total time=   0.0s
[CV 5/5] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; accuracy: (test=nan) total time=   0.0s
[CV 1/5] END C=0.001, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=nan) total time=   0.0s
[CV 2/5] END C=0.001, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=nan) total time=   0.0s
[CV 3/5] END C=0.001, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=nan) total time=   0.0s
[CV 4/5] END C=0.001, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=nan) total time=   0.0s

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'max_iter': [100, 1000], 'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             refit='accuracy', scoring=['accuracy'], verbose=4)

In [27]:
best_est_log = gs_log.best_estimator_
best_score_log =gs_log.best_score_

In [28]:
print(best_est_log)
print(best_score_log)

LogisticRegression(C=1000.0, penalty='l1', solver='liblinear')
0.7419087136929461


### Decision Tree Classifier

In [29]:
best_model = None
best_result = 0
best_depth = 0
best_criterion = 0
best_min_leaf = 0
best_split = 0
best_weight = 0


for depth in range(1, 50):
    for criterion_name in ('gini', 'entropy'):
        for min_leaf in range(1, 11):
            for split in range(2, 11):
                for weight in range(0,1):
                    model = DecisionTreeClassifier(random_state=12345, max_depth=depth, 
                                           criterion=criterion_name, min_samples_leaf=min_leaf, 
                                            min_samples_split = split, min_weight_fraction_leaf = weight)
                
                    model.fit(features_train, target_train)
                    result = model.score(features_valid, target_valid)
                    if result > best_result:
                        best_model = model
                        best_result = result
                        best_depth = depth
                        best_criterion = criterion_name
                        best_split = split
                        best_min_leaf = min_leaf
                        best_weight = weight
print("Accuracy наилучшей модели на валидационной выборке:", best_result.round(3), '\n', "Максимальная глубина:",depth, '\n','Критерий:', best_criterion, '\n','Минимальное число объектов в листьях', best_min_leaf, '\n','Минимальное количество выборок:', best_split, '\n', 'Доля входных выборок:', best_weight)

Accuracy наилучшей модели на валидационной выборке: 0.793 
 Максимальная глубина: 49 
 Критерий: gini 
 Минимальное число объектов в листьях 1 
 Минимальное количество выборок: 4 
 Доля входных выборок: 0


<div class="alert alert-info">
GridSearch для Decision Tree Classifier

In [30]:
model_tree = DecisionTreeClassifier()

In [31]:
params_tree = {"max_depth": [1, 2, 3, 4],
          "criterion": ['gini', 'entropy'],
         }

In [32]:
gs_tree = GridSearchCV(estimator = model_tree,
                  param_grid = params_tree,
                  scoring = ['accuracy'],
                  refit = 'accuracy',
                  cv = 5,
                  verbose = 4)

In [33]:
%%time
gs_tree.fit(X_train, Y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END criterion=gini, max_depth=1; accuracy: (test=0.743) total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1; accuracy: (test=0.755) total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1; accuracy: (test=0.749) total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1; accuracy: (test=0.747) total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1; accuracy: (test=0.770) total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=2; accuracy: (test=0.770) total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=2; accuracy: (test=0.770) total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=2; accuracy: (test=0.790) total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=2; accuracy: (test=0.766) total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=2; accuracy: (test=0.797) total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=3; accuracy: (test=0.774) total time=   0.0s
[CV 2/5] END cri

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4]},
             refit='accuracy', scoring=['accuracy'], verbose=4)

In [34]:
best_est_tree = gs_tree.best_estimator_
best_score_tree = gs_tree.best_score_

In [35]:
print(best_est_tree)
print(best_score_tree)

DecisionTreeClassifier(max_depth=3)
0.7950207468879669


<div class="alert alert-info">
<BR><b>Вывод:</b>
По результатам выбрала модель для дальнейшей работы -  RF

## Проверьте модель на тестовой выборке

Объединила train и valid, для проверки модели на тестовой выборке.

In [36]:
features = pd.concat([features_train, features_valid])
target = pd.concat([target_train, target_valid])

In [37]:
print(best_est_rf)

RandomForestClassifier(criterion='entropy', max_depth=8)


In [38]:
model = RandomForestClassifier(criterion='entropy', max_depth=8, max_features='sqrt')

In [39]:
model.fit(features, target)
predictions_test = model.predict(features_test)
result_test = accuracy_score(target_test, predictions_test)
print("Accuracy:", result_test.round(3))

Accuracy: 0.818


<b>Вывод:</b> Accuracy 0.818,поставленной цели добились.

## (бонус) Проверьте модели на адекватность

In [40]:
model = DecisionTreeClassifier(random_state=54321)

model.fit(features, target)
predictions_test = model.predict(features_test)
result_test = accuracy_score(target_test, predictions_test)
print("Accuracy:", result_test.round(3))

Accuracy: 0.742


In [41]:
clf_dummy = DummyClassifier(random_state=54321) 

clf_dummy.fit(features, target)
predictions_test = clf_dummy.predict(features_test)
result_test = accuracy_score(target_test, predictions_test)
print("Accuracy:", result_test.round(3))

Accuracy: 0.694
