In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
def evaluate_model(estimator, train_X, train_y, test_X, test_y):
    """Fit ``estimator`` on the training split and score it on both splits.

    The estimator is fitted in place (it remains fitted after the call). Its
    predictions on the training data and on the held-out data are then scored
    with ``f1_score``, ``precision_score`` and ``recall_score``, each called
    with default arguments.

    Parameters
    ----------
    estimator : object exposing ``fit(X, y)`` and ``predict(X)``.
    train_X, train_y : features and labels used for fitting and for the
        'train' row of the report.
    test_X, test_y : features and labels used only for the 'test' row.

    Returns
    -------
    pandas.DataFrame
        Rows 'train' and 'test'; columns 'f1', 'precision' and 'recall'.
    """
    estimator.fit(train_X, train_y)

    # One entry per report row: (row label, true labels, predicted labels).
    splits = (
        ('train', train_y, estimator.predict(train_X)),
        ('test', test_y, estimator.predict(test_X)),
    )
    # One entry per report column: (column label, metric function).
    scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )

    table = [
        [scorer(y_true, y_pred) for _, scorer in scorers]
        for _, y_true, y_pred in splits
    ]
    return pd.DataFrame(
        table,
        index=[label for label, _, _ in splits],
        columns=[label for label, _ in scorers],
    )

In [4]:
# Both inputs are tab-separated files under the relative `data/` directory.
train, test = (
    pd.read_csv(path, sep='\t')
    for path in ('data/fr_learn.tsv', 'data/fr_test.tsv')
)

In [5]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (1000000, 9)
Test shape: (200000, 8)


In [6]:
train.head(3)

Unnamed: 0,query,timestamp,requests_per_prev_1_hour,requests_per_prev_2_hour,requests_per_prev_6_hour,requests_per_prev_12_hour,requests_per_prev_24_hour,requests_per_prev_72_hour,fresh_click
0,0 1 2 3 4,1517044380,19,38,67,117,292,292,0
1,5 6,1517055711,120,244,598,735,2352,12021,0
2,7 3,1516903943,3,6,13,31,46,124,0


In [7]:
# Name of the label column in `train`.
TARGET = 'fresh_click'

# Number of rows per label value, to check the class balance.
train[TARGET].value_counts()

0    927916
1     72084
Name: fresh_click, dtype: int64

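Классы сильно несбалансированы: положительный класс (`fresh_click = 1`) составляет около 7% обучающей выборки (72 084 из 1 000 000 строк).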

In [8]:
# Order the rows chronologically and renumber the index. The result is rebound
# through a method chain instead of mutating the frame with inplace=True.
train = train.sort_values(by='timestamp').reset_index(drop=True)

In [9]:
train.head(3)

Unnamed: 0,query,timestamp,requests_per_prev_1_hour,requests_per_prev_2_hour,requests_per_prev_6_hour,requests_per_prev_12_hour,requests_per_prev_24_hour,requests_per_prev_72_hour,fresh_click
0,1025 1026 39,1516741200,18,32,152,350,547,1763,0
1,66 894 419 284 69 3764 27202 569 344 345 131,1516741202,0,0,0,1,2,5,0
2,43201 11 12935 126 87 108 151,1516741203,0,0,0,0,0,0,0


In [10]:
# Time span covered by the training data. The aggregations are named by string
# so pandas uses its own implementations; passing the builtins `min`/`max`
# triggers a FutureWarning on newer pandas releases.
pd.to_datetime(train.timestamp, unit='s').agg(['min', 'max'])

min   2018-01-23 21:00:00
max   2018-01-29 20:59:59
Name: timestamp, dtype: datetime64[ns]

In [11]:
# Time span covered by the test data. The aggregations are named by string so
# pandas uses its own implementations; passing the builtins `min`/`max`
# triggers a FutureWarning on newer pandas releases.
pd.to_datetime(test.timestamp, unit='s').agg(['min', 'max'])

min   2018-01-29 21:00:01
max   2018-01-30 20:59:59
Name: timestamp, dtype: datetime64[ns]

In [12]:
# `timestamp` is not used as a model feature, so remove it from the test frame.
# Rebinding with errors='ignore' (instead of an in-place drop) means running
# this cell a second time does not raise KeyError.
test = test.drop(columns=['timestamp'], errors='ignore')

In [13]:
# TF-IDF over the `query` column with the vocabulary capped at 1000 terms;
# every other column is forwarded unchanged.
transformer = ColumnTransformer(
    transformers=[('tfidf', TfidfVectorizer(max_features=1000), 'query')],
    remainder='passthrough',
)

In [23]:
# TF-IDF over the `query` column with no cap on the vocabulary size; every
# other column is forwarded unchanged.
transformer2 = ColumnTransformer(
    transformers=[('tfidf', TfidfVectorizer(), 'query')],
    remainder='passthrough',
)

In [14]:
# Feature matrix for all training rows (label and timestamp excluded).
# NOTE(review): the TF-IDF step is fitted on every training row here, i.e.
# before the later train/hold-out split, so hold-out rows contribute to the
# fitted vocabulary/IDF statistics — confirm this is acceptable.
X = transformer.fit_transform(train.drop(['timestamp', TARGET], axis=1))

In [24]:
X2 = transformer2.fit_transform(train.drop(['timestamp', TARGET], axis=1))

In [15]:
y = train[TARGET]

In [20]:
from imblearn.over_sampling import RandomOverSampler

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [20]:
# shuffle=False keeps the row order, so the last 20% of rows become the
# hold-out part (the rows were sorted by timestamp in an earlier cell).
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=False)

In [24]:
# NOTE(review): `y_os` is not assigned in any visible cell (the oversampled
# labels created in this notebook are named `train_y_os`), so this cell
# presumably relies on leftover kernel state — confirm or remove.
len(y_os)

1855832

In [25]:
# Random over-sampling of the training part only (fixed seed for
# repeatability); the hold-out part is left untouched.
ros = RandomOverSampler(random_state=0)
train_X_os, train_y_os = ros.fit_resample(train_X, train_y)

In [57]:
# NOTE(review): `lr` is only created in a later cell of this notebook, so this
# call depends on out-of-order execution and would fail under
# Restart Kernel -> Run All.
evaluate_model(lr, train_X, train_y, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.114478,0.069648,0.321272
test,0.1108,0.066387,0.33476


In [13]:
from sklearn.linear_model import LogisticRegression

In [16]:
lr = LogisticRegression(max_iter=200, solver='lbfgs', class_weight='balanced')

In [58]:
evaluate_model(lr, train_X_os, train_y_os, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.690712,0.665629,0.717759
test,0.223438,0.13257,0.710303


In [38]:
# Apply the already-fitted preprocessing to the test frame.
# NOTE(review): this rebinds `test` from the raw DataFrame to the transformed
# matrix, so the cell is not repeatable — a second run would feed the matrix
# back into the transformer.
test = transformer.transform(test)

In [17]:
from sklearn.naive_bayes import BernoulliNB

In [18]:
bnb = BernoulliNB()

In [21]:
evaluate_model(bnb, train_X, train_y, test_X, test_y)

Unnamed: 0,f1,precision,recall
train,0.293908,0.265458,0.329189
test,0.29865,0.274462,0.327514


In [22]:
# NOTE(review): the recorded output of this cell is a NameError —
# `train_X_os` did not exist in the kernel when it ran, so no oversampled
# BernoulliNB result is recorded here.
evaluate_model(bnb, train_X_os, train_y_os, test_X, test_y)

NameError: name 'train_X_os' is not defined

In [66]:
bnb.fit(X, y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [68]:
preds = bnb.predict(test)

In [70]:
# Write one prediction per line. `header` is passed explicitly because the
# default of Series.to_csv differs between pandas versions (older releases
# warn about the upcoming change); False keeps the file free of a header row.
pd.Series(preds).to_csv('pred_nb.tsv', index=False, header=False)

  """Entry point for launching an IPython kernel.


In [27]:
# Same ordered 80/20 split, applied to the features built by `transformer2`.
# `train_y`/`test_y` are reassigned here; with the same `y`, test_size and
# shuffle=False they cover the same rows as in the earlier split.
train_X2, test_X2, train_y, test_y = train_test_split(X2, y, test_size=0.2, shuffle=False)

In [28]:
evaluate_model(bnb, train_X2, train_y, test_X2, test_y)

Unnamed: 0,f1,precision,recall
train,0.203023,0.347901,0.143333
test,0.169751,0.244234,0.130081


In [41]:
# Candidate TF-IDF vocabulary sizes; the key addresses `max_features` of the
# 'tfidf' transformer inside the pipeline's 'preprocess' step.
grid = {'preprocess__tfidf__max_features': [5, 10, 50, 100, 300, 1000]}

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [34]:
# Preprocessing and classifier chained into a single estimator, so the
# preprocessing parameters can be tuned together with the model.
pipeline = Pipeline(steps=[('preprocess', transformer2), ('model', bnb)])

In [43]:
# The training rows are sorted by timestamp and the hold-out evaluation in this
# notebook is chronological, so cross-validate the same way: every fold is
# validated on rows that come after its training rows. An integer `cv` would
# use unshuffled K-fold splitting, which also trains on later rows to score
# earlier ones.
gs = GridSearchCV(pipeline, grid, cv=TimeSeriesSplit(n_splits=5), scoring='f1')

In [45]:
gs.fit(train.drop(['timestamp', TARGET], axis=1), y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 