In [5]:
import os

import pandas as pd
import seaborn as sns

# Locations of the competition data and of derived files.
# The defaults are the original shared-mount paths; set TALKING_DATA_DIR /
# TALKING_DATA_FILES_DIR in the environment to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get('TALKING_DATA_DIR', '/mnt/ml-team/minerva/talking_data/data')
FILES_DIR = os.environ.get('TALKING_DATA_FILES_DIR', '/mnt/ml-team/minerva/talking_data/files')

In [11]:
# Resolve both CSV locations first, then load them.
train_path = os.path.join(FILES_DIR, 'train_day7_hour10.csv')
test_path = os.path.join(DATA_DIR, 'test.csv')

# The train file is read in full; only the first 100000 rows of the test
# file are read.
train_sample = pd.read_csv(train_path)
test_sample = pd.read_csv(test_path, nrows=100000)

In [12]:
# Show the (rows, columns) shape of each sample, train first.
display(*(frame.shape for frame in (train_sample, test_sample)))

(3300746, 8)

(100000, 7)

In [13]:
# Preview the first rows of both samples (train first, then test).
display(train_sample.head(), test_sample.head())

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,128682,12,1,47,328,2017-11-07 10:00:00,,0
1,34958,3,1,18,280,2017-11-07 10:00:00,,0
2,84896,13,1,10,477,2017-11-07 10:00:00,,0
3,93641,15,1,19,130,2017-11-07 10:00:00,,0
4,107922,12,1,25,178,2017-11-07 10:00:00,,0


Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [4]:
# Convert the click_time column of both samples to pandas datetimes,
# using one explicit format shared by train and test.
CLICK_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
for sample in (train_sample, test_sample):
    sample['click_time'] = pd.to_datetime(sample['click_time'], format=CLICK_TIME_FORMAT)

# Feature distributions

In [14]:
# Number of distinct non-null values in every column of the train sample
# (pandas defaults spelled out: per column, NaN not counted).
train_sample.nunique(axis=0, dropna=True)

ip                 57478
app                  294
device               675
os                   229
channel              163
click_time          3600
attributed_time     5301
is_attributed          2
dtype: int64

Categorical features

```python
CATEGORICAL_FEATURES = ['ip','app','device','os','channel']
```

Timestamp
```python
TIMESTAMPS =['click_time']
```
Target
```python
TARGET = ['is_attributed']
```
Let's look at the distributions of categorical features

In [7]:
CATEGORICAL_FEATURES = ['ip', 'app', 'device', 'os', 'channel']
# A value counts as "frequent" when it occurs strictly more than this many times.
frequency_threshold = 20

# For each categorical feature, print how many distinct values are frequent.
# The printed shape is a 1-tuple whose only entry is that count.
for feature in CATEGORICAL_FEATURES:
    counts = train_sample[feature].value_counts()
    is_frequent = counts > frequency_threshold
    print(feature, counts[is_frequent].shape)

ip (281,)
app (47,)
device (6,)
os (65,)
channel (122,)


The distributions have very long tails (many values occur 20 times or fewer), so clipping rare values before feature extraction will be important.

In [8]:
# Earliest and latest click_time in the train sample, shown as a (min, max) tuple.
# NOTE(review): the saved output of this cell appears to predate the current
# data load (execution counts are out of order) — re-run to refresh it.
train_click_time = train_sample['click_time']
train_click_time.min(), train_click_time.max()

(Timestamp('2017-11-06 16:00:00'), Timestamp('2017-11-09 15:59:51'))

In [9]:
# Earliest and latest click_time in the test sample, shown as a (min, max) tuple.
test_click_time = test_sample['click_time']
test_click_time.min(), test_click_time.max()

(Timestamp('2017-11-10 04:00:00'), Timestamp('2017-11-10 04:01:46'))

The test set starts after the train set ends, so the validation set should be created in the same manner: a time-based split in which the validation clicks come after the training clicks.

# Target

In [10]:
# Mean of the target column; for a 0/1 target this is the share of positive rows.
target = train_sample['is_attributed']
target.mean()

0.0022699999999999999

The target is highly imbalanced (only about 0.2% of the clicks are attributed), so we should make sure to handle that in our solution.