### Библиотеки, открытие данных

In [246]:

import pandas as pd
import plotly.express as ps
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, precision_score, recall_score
import random

In [230]:
df = pd.read_csv('users_behavior.csv')
df.head(5)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


Надо бы перевести в int и поокруглять числа

In [231]:
for column in list(df.columns[:-1]):
    if column == 'mb_used':
        df[column] = df[column] / 1024
    df[column] = np.ceil(df[column])
    df[column] = df[column].astype(int)
df.rename(columns={'mb_used':'gb_used'}, inplace=True)
df.head(5)


Unnamed: 0,calls,minutes,messages,gb_used,is_ultra
0,40,312,83,20,0
1,85,517,56,23,0
2,77,468,86,21,0
3,106,746,81,9,1
4,66,419,1,15,0


In [232]:
fig = ps.box(df[['calls','minutes','messages','gb_used']]) # Просто поглядеть распределение
fig.show()

In [233]:
df.describe()

Unnamed: 0,calls,minutes,messages,gb_used,is_ultra
count,3214.0,3214.0,3214.0,3214.0,3214.0
mean,63.038892,438.699129,38.281269,17.299938,0.306472
std,33.236368,234.588415,36.148326,7.399306,0.4611
min,0.0,0.0,0.0,0.0,0.0
25%,40.0,275.0,9.0,13.0,0.0
50%,62.0,431.0,30.0,17.0,0.0
75%,82.0,572.75,57.0,21.0,1.0
max,244.0,1633.0,224.0,49.0,1.0


### Делим на обучающая / валидационную / тестовую / случайную выборки

In [234]:
df_train, df_valid = train_test_split(df, test_size=.4, random_state=123, stratify=df['is_ultra'])
df_valid, df_test = train_test_split(df_valid, test_size=.5, random_state=123, stratify=df_valid['is_ultra'])

In [235]:
df_valid_target = df_valid['is_ultra']
df_test_target = df_test['is_ultra']
df_train_target = df_train['is_ultra']
df_valid_features = df_valid.drop(['is_ultra'], axis=1)
df_test_features = df_test.drop(['is_ultra'], axis=1)
df_train_features = df_train.drop(['is_ultra'], axis=1)


In [236]:
# Выборка для сравнения со случайным значением
df_random_target = pd.Series(0, index=df_valid_target).apply(
    lambda row: 1 if random.random() < df_valid_target.mean() else 0)

accuracy_score(df_valid_target, df_random_target)

0.5738724727838258

### Подбираем гиперпараметры для обучения модели

In [237]:
best_estimate = best_result = best_depth = best_leaf = 0
results = []
for estimate in range(10, 121, 10):
    for depth in range(1, 11):
        for leaf in range(2, 27, 4):
            model = RandomForestClassifier(
                n_estimators=estimate, random_state=123, max_depth=depth, min_samples_leaf=leaf)
            model.fit(df_train_features, df_train_target)
            predict = model.predict(df_valid_features)
            result = accuracy_score(df_valid_target, predict)
            if result > best_result:
                best_result, best_estimate, best_depth, best_leaf = result, estimate, depth, leaf
            results.append((estimate, depth, leaf, result))

print(f'Лучший лес с точностью предсказаний {best_result:.2%}, имеет параметры: \n \
      n_estimate={best_estimate}, max_depth={best_depth}, min_samples_leaf={best_leaf}')


Лучший лес с точностью предсказаний 80.56%, имеет параметры: 
       n_estimate=30, max_depth=10, min_samples_leaf=2


In [238]:
results = pd.DataFrame(columns=['n_estimate', 'max_depth', 'min_sample_leaf', 'accuracy'], data=results)
results['accuracy'].describe()

count    840.000000
mean       0.791476
std        0.010938
min        0.748056
25%        0.791602
50%        0.794712
75%        0.796267
max        0.805599
Name: accuracy, dtype: float64

In [239]:
best_result = 0
results_tree = []
for depth in range(1,15):
    model = DecisionTreeClassifier(random_state=123, max_depth=depth)
    model.fit(df_train_features, df_train_target)
    predict = model.predict(df_valid_features)
    result = accuracy_score(df_valid_target, predict)
    if result > best_result:
        best_result = result
        best_depth = depth
print(f'Лучшее дерево с глубиной {best_depth} | точность предсказаний: {best_result:.2%}')

Лучшее дерево с глубиной 8 | точность предсказаний: 79.78%


In [240]:
model = LogisticRegression(random_state=123)
model.fit(df_train_features, df_train_target)
predict = model.predict(df_valid_features)
result = accuracy_score(df_valid_target, predict)
print(f'При использовании логистической регрессии, точность модели: {result:.2%}')

При использовании логистической регрессии, точность модели: 76.21%


### Проверка на тестовых выборках (accuracy, precision, recall)

In [266]:
# в комментарии хорошей подбор (80%) при очень малой глубине и большом "листе"
# model = RandomForestClassifier(n_estimators=30, max_depth=4, min_samples_leaf=26, random_state=123)
model = RandomForestClassifier(n_estimators=30, max_depth=10, min_samples_leaf=2, random_state=123)
model.fit(df_train_features, df_train_target)
predict = model.predict(df_test_features)
print('Правильность(accuracy)', accuracy_score(df_test_target, predict))
print('Точность(precision)', precision_score(df_test_target, predict))
print('Точность(recall)', recall_score(df_test_target, predict))
print(multilabel_confusion_matrix(df_test_target, predict)[0])

Правильность(accuracy) 0.8258164852255054
Точность(precision) 0.8102189781021898
Точность(recall) 0.5634517766497462
[[111  86]
 [ 26 420]]


In [268]:
model = LogisticRegression(random_state=123)
model.fit(df_train_features, df_train_target)
predict = model.predict(df_test_features)
print('Правильность(accuracy)', accuracy_score(df_test_target, predict))
print('Точность(precision)', precision_score(df_test_target, predict))
print('Точность(recall)', recall_score(df_test_target, predict))
print(multilabel_confusion_matrix(df_test_target, predict)[0])

Правильность(accuracy) 0.7558320373250389
Точность(precision) 0.8225806451612904
Точность(recall) 0.25888324873096447
[[ 51 146]
 [ 11 435]]


In [271]:
model = DecisionTreeClassifier(random_state=123, max_depth=8)
model.fit(df_train_features, df_train_target)
predict = model.predict(df_test_features)
print('Правильность(accuracy)', accuracy_score(df_test_target, predict))
print('Точность(precision)', precision_score(df_test_target, predict))
print('Точность(recall)', recall_score(df_test_target, predict))
print(multilabel_confusion_matrix(df_test_target, predict)[0])

Правильность(accuracy) 0.7916018662519441
Точность(precision) 0.7368421052631579
Точность(recall) 0.49746192893401014
[[ 98  99]
 [ 35 411]]


Общий вывод:
- добиться точности предсказаний свыше 75% с помощью случайного леса не составляет большого труда.
- Основная проблема в предсказании положительных значений(тариф "Ультра") во всех алгоритмах. ("в нашем случае лучшее предсказание выдает логистическая регрессия")