In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

#### 1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

##### Загрузим Electrical Grid Stability Simulated Data Data Set (https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+)

In [2]:
data = pd.read_csv('Data_for_UCI_named.csv', sep=',')
data.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable


In [3]:
data.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


#### 2. Сделать feature engineering

#### Заменим у целевого признака значения на 0 и 1

In [5]:
stabf = {'unstable': 0, 'stable': 1}
data.replace({'stabf': stabf}, inplace=True)
data.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1


In [6]:
data.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731,0.362
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919,0.480603
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076,0.0
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557,0.0
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142,0.0
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878,1.0
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403,1.0


##### Разделим датасет, выделив целевой признак stabf.

In [7]:
target = data.stabf
df = data.drop(columns='stabf')
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957


##### Посмотрим, насколько признаки линейно зависимы между собой. Построим матрицу корреляций.

In [8]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
tau1,1.0,0.015586,-0.00597,-0.017265,0.027183,-0.015485,-0.015924,-0.015807,0.010521,0.01535,-0.001279,0.005494,0.275761
tau2,0.015586,1.0,0.014273,-0.001965,-0.004769,0.006573,0.007673,-0.005963,-0.001742,0.015383,0.016508,-0.011764,0.290975
tau3,-0.00597,0.014273,1.0,0.004354,0.016953,-0.003134,-0.00878,-0.017531,-0.011605,0.007671,0.014702,-0.011497,0.2807
tau4,-0.017265,-0.001965,0.004354,1.0,-0.003173,0.010553,0.006169,-0.011211,-0.004149,0.008431,0.00326,-0.000491,0.278576
p1,0.027183,-0.004769,0.016953,-0.003173,1.0,-0.573157,-0.584554,-0.579239,0.000721,0.015405,0.001069,-0.015451,0.010278
p2,-0.015485,0.006573,-0.003134,0.010553,-0.573157,1.0,0.002388,-0.006844,0.015603,-0.018032,0.007555,0.019817,0.006255
p3,-0.015924,0.007673,-0.00878,0.006169,-0.584554,0.002388,1.0,0.012953,-0.003219,-0.011575,-0.005897,-0.010485,-0.003321
p4,-0.015807,-0.005963,-0.017531,-0.011211,-0.579239,-0.006844,0.012953,1.0,-0.013636,0.00285,-0.003515,0.017505,-0.020786
g1,0.010521,-0.001742,-0.011605,-0.004149,0.000721,0.015603,-0.003219,-0.013636,1.0,0.007559,-0.005836,0.012431,0.282774
g2,0.01535,0.015383,0.007671,0.008431,0.015405,-0.018032,-0.011575,0.00285,0.007559,1.0,-0.012809,-0.014909,0.293601


##### Данные достаточно хорошо подготовлены, параметры линейно независимы. Коэффициент корреляции больше 0,5 только у параметров p1, p2, p3, p4, но это незначительно.

#### 3. Обучить любой классификатор (какой вам нравится)

##### Применим классификатор XGBoost

In [9]:
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)

In [10]:
model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

##### Посчитаем основные показатели

In [11]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    return [f1, roc, rec, prc]

In [12]:
classification = evaluate_results(y_test, y_predict)

Classification results:
f1: 99.93%
roc: 99.96%
recall: 100.00%
precision: 99.86%


##### Значения шикарные :D

#### 4. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [13]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 905/3620 as positives and unlabeling the rest


##### Теперь размеченных как класс 1 - всего 905 значений. Остальныепомечаем -1, как неразмеченные

In [14]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    9095
 1     905
Name: class_test, dtype: int64


In [15]:
mod_data.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,class_test
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0,-1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1,-1


In [16]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

#### 5. Применить random negative sampling для построения классификатора в новых условиях

In [17]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(905, 15) (905, 15)


##### Обучим модель

In [18]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)

In [19]:
pu_20 = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 99.98%
roc: 99.99%
recall: 100.00%
precision: 99.96%


##### Результаты тоже впечатляющие :D Попахивает переобучением, либо сущесствует четкая зависимость результата от измерений (что более вероятно для физических опытов)

#### 6. Сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [20]:
result = [classification, pu_20]
index = ['Classification', 'PU 20%']

metrics = pd.DataFrame(result, index=index, columns=['F-score', 'Roc_auc', 'Recall', 'Precision'])

In [21]:
metrics

Unnamed: 0,F-score,Roc_auc,Recall,Precision
Classification,0.999279,0.999617,1.0,0.998559
PU 20%,0.999796,0.999913,1.0,0.999593


#### 7. Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

##### Возьмем не 25, а 30 % для P

In [22]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 30% of the positives marked
pos_sample_len = int(np.ceil(0.3 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1086/3620 as positives and unlabeling the rest


##### Теперь размеченных как класс 1 - всего 905 значений. Остальныепомечаем -1, как неразмеченные

In [23]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    8914
 1    1086
Name: class_test, dtype: int64


In [24]:
mod_data.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,class_test
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0,-1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1,1


In [25]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [26]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1086, 15) (1086, 15)


##### Обучим модель

In [27]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)

In [28]:
pu_30 = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 99.96%
roc: 99.96%
recall: 99.91%
precision: 100.00%


In [29]:
result = [classification, pu_20, pu_30]
index = ['Classification', 'PU 20%', 'PU 30%']

metrics = pd.DataFrame(result, index=index, columns=['F-score', 'Roc_auc', 'Recall', 'Precision'])

In [30]:
metrics

Unnamed: 0,F-score,Roc_auc,Recall,Precision
Classification,0.999279,0.999617,1.0,0.998559
PU 20%,0.999796,0.999913,1.0,0.999593
PU 30%,0.999553,0.999553,0.999106,1.0


#### С увеличением размера выборки P метрики f1 и roc_auc падают.