### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("bank/bank-full.csv", sep=';')
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [2]:
text_features = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
features_to_boolean = ['default', 'housing', 'loan']
cat_features = text_features + features_to_boolean + ['campaign']

for f in features_to_boolean + ['y']:
    data.loc[data[f]=='yes', f] = 1
    data.loc[data[f]=='no', f] = 0
    data[f] = data[f].astype(int).fillna(0)
    print('\nраспределние для ' + f)
    print(data[f].value_counts())



распределние для default
0    44396
1      815
Name: default, dtype: int64

распределние для housing
1    25130
0    20081
Name: housing, dtype: int64

распределние для loan
0    37967
1     7244
Name: loan, dtype: int64

распределние для y
0    39922
1     5289
Name: y, dtype: int64


In [3]:
for f in data.columns:
    print(f+': ' + str(data[data[f].isnull()].shape[0]))


age: 0
job: 0
marital: 0
education: 0
default: 0
balance: 0
housing: 0
loan: 0
contact: 0
day: 0
month: 0
duration: 0
campaign: 0
pdays: 0
previous: 0
poutcome: 0
y: 0


In [4]:
data.dtypes


age           int64
job          object
marital      object
education    object
default       int32
balance       int64
housing       int32
loan          int32
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int32
dtype: object

In [5]:
from sklearn.model_selection import train_test_split
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
display(X_test)
display(y_test)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
3776,40,blue-collar,married,secondary,0,580,1,0,unknown,16,may,192,1,-1,0,unknown
9928,47,services,single,secondary,0,3644,0,0,unknown,9,jun,83,2,-1,0,unknown
33409,25,student,single,tertiary,0,538,1,0,cellular,20,apr,226,1,-1,0,unknown
31885,42,management,married,tertiary,0,1773,0,0,cellular,9,apr,311,1,336,1,failure
15738,56,management,married,tertiary,0,217,0,1,cellular,21,jul,121,2,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13353,47,management,married,tertiary,0,1890,0,0,cellular,8,jul,161,1,-1,0,unknown
38732,32,blue-collar,single,secondary,0,217,1,0,cellular,15,may,692,3,-1,0,unknown
5654,52,admin.,divorced,secondary,0,0,1,0,unknown,26,may,206,1,-1,0,unknown
3779,40,admin.,divorced,secondary,0,783,1,1,unknown,16,may,171,2,-1,0,unknown


3776     0
9928     0
33409    0
31885    0
15738    0
        ..
13353    0
38732    1
5654     0
3779     0
11677    0
Name: y, Length: 9043, dtype: int32

In [6]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

def run(name, classifier):
    classifier.fit(X_train, y_train)
    preds = classifier.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    
    cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])
    
    metrics = (thresholds[ix], fscore[ix], precision[ix], recall[ix], cnf_matrix)
    metrics_string = 'Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % metrics[0:4]
    results[name] = (metrics_string,) + metrics


In [7]:
from catboost import CatBoostClassifier
results = {}
run('baseline', CatBoostClassifier(random_state=42, cat_features=cat_features, silent=True))

In [8]:
for name in results:
    print(results[name])

('Best Threshold=0.304096, F-Score=0.652, Precision=0.580, Recall=0.744', 0.3040955325430608, 0.6519470092332396, 0.58, 0.7442713107241063, array([[7364,  588],
       [ 280,  811]], dtype=int64))


In [9]:
def prepare_RNS_samples(positive_ratio):
    mod_data = data.copy()
    pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
    np.random.shuffle(pos_ind)
    pos_sample_len = int(np.ceil(positive_ratio * len(pos_ind)))
    print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    print('mod_data - target variable:\n', mod_data.iloc[:,-1].value_counts())
    mod_data = mod_data.sample(frac=1, random_state=42)
    neg_sample = mod_data[mod_data['class_test']==-1][:pos_sample_len]
    test_sample = mod_data[mod_data['class_test']==-1][pos_sample_len:]
    neg_sample.loc[:,'class_test'] = 0 # этого нет в примере, но кажется принципиальным - мы должны пометить эту выборку как негативную для обучения
    pos_sample = mod_data[mod_data['class_test']==1]
    print(neg_sample.shape, pos_sample.shape)
    train_sample = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    print('train_sample - target variable:\n', train_sample.iloc[:,-1].value_counts())
    return (train_sample.iloc[:,:-2], test_sample.iloc[:,:-2],
            train_sample.iloc[:,-1], # новый класс, полностью размеченный в train_sample 
            test_sample.iloc[:,-2] # оригинальный класс для проверки
           )

In [10]:
for positive_ratio in [0.1, 0.2, 0.4, 0.6, 0.9]:
    X_train, X_test, y_train, y_test = prepare_RNS_samples(positive_ratio)
    run(f'RNS-{positive_ratio}', CatBoostClassifier(random_state=42, cat_features=cat_features, silent=True))

Using 529/5289 as positives and unlabeling the rest
mod_data - target variable:
 -1    44682
 1      529
Name: class_test, dtype: int64
(529, 18) (529, 18)
train_sample - target variable:
 1    529
0    529
Name: class_test, dtype: int64
Using 1058/5289 as positives and unlabeling the rest
mod_data - target variable:
 -1    44153
 1     1058
Name: class_test, dtype: int64
(1058, 18) (1058, 18)
train_sample - target variable:
 1    1058
0    1058
Name: class_test, dtype: int64
Using 2116/5289 as positives and unlabeling the rest
mod_data - target variable:
 -1    43095
 1     2116
Name: class_test, dtype: int64
(2116, 18) (2116, 18)
train_sample - target variable:
 1    2116
0    2116
Name: class_test, dtype: int64
Using 3174/5289 as positives and unlabeling the rest
mod_data - target variable:
 -1    42037
 1     3174
Name: class_test, dtype: int64
(3174, 18) (3174, 18)
train_sample - target variable:
 1    3174
0    3174
Name: class_test, dtype: int64
Using 4761/5289 as positives and 

In [11]:
for n in results:
    print(f'{n:<10}: {results[n][0]}')
    print(results[n][5])

baseline  : Best Threshold=0.304096, F-Score=0.652, Precision=0.580, Recall=0.744
[[7364  588]
 [ 280  811]]
RNS-0.1   : Best Threshold=0.658989, F-Score=0.555, Precision=0.456, Recall=0.709
[[35483  3975]
 [ 1369  3326]]
RNS-0.2   : Best Threshold=0.704536, F-Score=0.552, Precision=0.465, Recall=0.678
[[35777  3204]
 [ 1327  2787]]
RNS-0.4   : Best Threshold=0.754901, F-Score=0.519, Precision=0.430, Recall=0.655
[[35353  2615]
 [ 1040  1971]]
RNS-0.6   : Best Threshold=0.786703, F-Score=0.459, Precision=0.357, Recall=0.641
[[34671  2246]
 [  699  1247]]
RNS-0.9   : Best Threshold=0.896773, F-Score=0.261, Precision=0.183, Recall=0.455
[[34279   944]
 [  255   211]]


При повышении доли P (positive_ratio) метрики в этом эксперименте ухудшались. Это связано скорее со способом организации эксперимента и тестового набора: вся доля P отбиралась в тренировочный набор, и в тестовом наборе оставалось все меньше positive, в пределе 0 при positive_ratio = 1.  