In [28]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [2]:
def evaluate_model(estimator, train_X, train_y, test_X, test_y):
    estimator.fit(train_X, train_y)
    
    pred_train = estimator.predict(train_X)
    pred_test = estimator.predict(test_X)
    
    f1_1 = f1_score(train_y, pred_train)
    precision_1 = precision_score(train_y, pred_train)
    recall_1 = recall_score(train_y, pred_train)
    
    f1_2 = f1_score(test_y, pred_test)
    precision_2 = precision_score(test_y, pred_test)
    recall_2 = recall_score(test_y, pred_test)
    
    return pd.DataFrame([[f1_1, precision_1, recall_1], 
                         [f1_2, precision_2, recall_2]], index=['train', 'test'], 
                        columns=['f1', 'precision', 'recall'])

In [52]:
train = pd.read_csv('data/fr_learn.tsv', sep='\t')
test = pd.read_csv('data/fr_test.tsv', sep='\t')

In [53]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (1000000, 9)
Test shape: (200000, 8)


In [54]:
train.head(3)

Unnamed: 0,query,timestamp,requests_per_prev_1_hour,requests_per_prev_2_hour,requests_per_prev_6_hour,requests_per_prev_12_hour,requests_per_prev_24_hour,requests_per_prev_72_hour,fresh_click
0,0 1 2 3 4,1517044380,19,38,67,117,292,292,0
1,5 6,1517055711,120,244,598,735,2352,12021,0
2,7 3,1516903943,3,6,13,31,46,124,0


In [55]:
TARGET = 'fresh_click'

train[TARGET].value_counts()

0    927916
1     72084
Name: fresh_click, dtype: int64

Классы сильно несбалансированы. Отсортируем данные по времени:

In [56]:
train.sort_values(by='timestamp', inplace=True)
train.reset_index(drop=True, inplace=True)

In [57]:
train.head(2)

Unnamed: 0,query,timestamp,requests_per_prev_1_hour,requests_per_prev_2_hour,requests_per_prev_6_hour,requests_per_prev_12_hour,requests_per_prev_24_hour,requests_per_prev_72_hour,fresh_click
0,1025 1026 39,1516741200,18,32,152,350,547,1763,0
1,66 894 419 284 69 3764 27202 569 344 345 131,1516741202,0,0,0,1,2,5,0


Посмотрим за какие периоды данные в трейне и тесте:

In [58]:
pd.to_datetime(train.timestamp, unit='s').agg([min, max])

min   2018-01-23 21:00:00
max   2018-01-29 20:59:59
Name: timestamp, dtype: datetime64[ns]

In [59]:
pd.to_datetime(test.timestamp, unit='s').agg([min, max])

min   2018-01-29 21:00:01
max   2018-01-30 20:59:59
Name: timestamp, dtype: datetime64[ns]

In [60]:
test.drop('timestamp', axis=1, inplace=True)

In [61]:
transformer = ColumnTransformer(
    [
        ('tfidf', TfidfVectorizer(max_features=1000), 'query')
    ],
    remainder='passthrough'
)

In [62]:
X = transformer.fit_transform(train.drop(['timestamp', TARGET], axis=1))

In [63]:
y = train[TARGET]

In [64]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=False)

In [65]:
ros = RandomOverSampler(random_state=0)
train_X_os, train_y_os = ros.fit_resample(train_X, train_y)

# Models

## LogisticRegression

In [66]:
lr = LogisticRegression(max_iter=200, solver='lbfgs', class_weight='balanced')

In [67]:
evaluate_model(lr, train_X, train_y, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.114478,0.069648,0.321272
test,0.1108,0.066387,0.33476


In [68]:
evaluate_model(lr, train_X_os, train_y_os, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.690712,0.665629,0.717759
test,0.223438,0.13257,0.710303


## BernoulliNB

In [69]:
bnb = BernoulliNB()

In [70]:
evaluate_model(bnb, train_X, train_y, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.293908,0.265458,0.329189
test,0.29865,0.274462,0.327514


In [71]:
evaluate_model(bnb, train_X_os, train_y_os, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.687233,0.684709,0.689777
test,0.242169,0.14742,0.6778


In [72]:
bnb.fit(X, y)

test = transformer.transform(test)
preds = bnb.predict(test)

In [74]:
pd.Series(preds).to_csv('predictions.tsv', index=False, header=False)