# look-alike

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [11]:
data = pd.read_csv("ad.data", header=None, low_memory=False)
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


У нас есть 1558 признаков и 1 целевая переменная (бинарная) - нужно определить является изображение рекламой или нет

In [12]:
print(data.shape)

(3279, 1559)


Всего 3279 изображений. Столбец с целевой переменной делаем бинарной

In [13]:
data[1558].value_counts()

nonad.    2820
ad.        459
Name: 1558, dtype: int64

In [14]:
data[1558].replace(to_replace='ad.', value=1, inplace=True)
data.loc[data[1558] != 1, 1558] = 0
data.rename(columns={1558: 'target'}, inplace=True)
data['target'] = pd.to_numeric(data['target'])

In [16]:
data.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,target
3259,10,600,60.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3260,11,64,5.8181,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3261,?,?,?,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3262,150,200,1.3333,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3263,16,16,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3264,134,184,1.3731,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3265,23,26,1.1304,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3266,40,130,3.25,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3267,158,192,1.2151,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3268,25,100,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
for i in range(1557):
    data[i] = pd.to_numeric(data[i], errors='coerce')

data = data.fillna(0)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,target
0,125.0,125.0,1.0000,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,57.0,468.0,8.2105,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,33.0,230.0,6.9696,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,60.0,468.0,7.8000,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60.0,468.0,7.8000,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,170.0,94.0,0.5529,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3275,101.0,140.0,1.3861,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3276,23.0,120.0,5.2173,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3277,0.0,0.0,0.0000,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Посмотрим на соотношение классов

In [19]:
data['target'].value_counts()

0    2820
1     459
Name: target, dtype: int64

Разбиваем выборку на тренировочную и тестовую части и обучаем модель

In [20]:
x_train, x_test, y_train, y_test = train_test_split(data.drop('target', 1), data['target'], test_size=0.3, random_state=42)

In [21]:
model = GradientBoostingClassifier(random_state=42)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

Проверяем качество

In [22]:
results = []

def evaluate_results(name, y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    
    result = [name, prc, rec, roc, f1]
    results.append(result)

In [23]:
evaluate_results('full_model', y_test, y_predict)

Classification results:
f1: 87.11%
roc: 89.70%
recall: 80.13%
precision: 95.42%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [24]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 115/459 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [25]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    3164
 1     115
Name: class_test, dtype: int64


* We now have just 153 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that col 4 still holds the actual label

In [26]:
mod_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1550,1551,1552,1553,1554,1555,1556,1557,target,class_test
0,125.0,125.0,1.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
1,57.0,468.0,8.2105,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
2,33.0,230.0,6.9696,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
3,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
4,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
5,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
6,59.0,460.0,7.7966,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
7,60.0,234.0,3.9,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
8,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
9,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1


Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [27]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 1. random negative sampling

In [28]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(115, 1560) (115, 1560)


In [29]:
model = GradientBoostingClassifier(random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results('rn_sampl', sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 65.53%
roc: 91.11%
recall: 93.33%
precision: 50.49%


In [30]:
df_results = pd.DataFrame(np.array(results),
                    columns=["model", "precision", "recall", "roc_auc", "f_score"])

df_results

Unnamed: 0,model,precision,recall,roc_auc,f_score
0,full_model,0.9541984732824428,0.8012820512820513,0.8970178372352285,0.8710801393728224
1,rn_sampl,0.5049180327868853,0.9333333333333332,0.9111315434596052,0.6553191489361703
