In [None]:
import pandas as pd
import numpy as  np
from matplotlib import pyplot as plt
import seaborn as sns
import json
import datetime
from sklearn import preprocessing
sns.set()

import joblib
import gc

# Summary
## Feature Engineering:
1. 构造了所有categorical variable的**count 特征**（count for every unique value)
2. 用前一天的数据构造了**点击率**与**曝光数**特征
3. 基于部分特征构造了**二阶统计特征**

## Memory Reduction:
1. 将数据转化成不同的格式
2. 注意删除不再使用的变量

Results: Logloss 0.3907; Ranking 158th; Public board Top10% (late submission)

# 1. Load the data

In [None]:
num_of_chunk = 0
chunksize = 10 ** 6
sample_frac = .01   # fraction of every chunk that is kept
sample_seed = 123   # fixed seed so the per-chunk sample is reproducible

train_path = "../input/avazu-ctr-prediction/train.gz"
test_path = "../input/avazu-ctr-prediction/test.gz"

# Stream the training file chunk by chunk, keep a random sample of each chunk,
# and concatenate the pieces once at the end. Concatenating inside the loop
# would re-copy the ever-growing frame on every iteration.
sampled_chunks = []
for num_of_chunk, chunk in enumerate(pd.read_csv(train_path, sep=",", chunksize=chunksize), start=1):
    sampled_chunks.append(chunk.sample(frac=sample_frac, replace=False, random_state=sample_seed))
    print('Processing Chunk No. ' + str(num_of_chunk))

# An empty frame is the fallback when the reader yields no chunk at all.
train = pd.concat(sampled_chunks, axis=0) if sampled_chunks else pd.DataFrame()
del sampled_chunks

# reset_index() without drop=True keeps the original row labels in an 'index'
# column; a later cell deletes that column together with 'id'.
train = train.reset_index()

# Remember how many sampled training rows there are
train_len = len(train)
train_len

In [None]:
# `delimiter` is only an alias of `sep`; current pandas raises a ValueError
# when both are supplied, so only `sep` is passed.
# 'id' is read as a string rather than parsed as a number.
test = pd.read_csv(test_path, sep=",", dtype={'id': str})
# Stack train and test into one frame so the feature engineering below is
# applied to both; sort=False keeps the column order of `train`.
data = pd.concat([train, test], sort=False)

In [None]:
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns (signed or unsigned) are converted to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Float columns are converted to float16 or float32 when
    their range fits, otherwise to float64. Columns that are not plain NumPy
    integer/float columns (object, bool, datetime, extension dtypes, ...) are
    left untouched.

    Note that float16 only has an 11-bit significand, so integers above 2048
    and fine-grained ratios stored in a float column are rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Side effects: prints the memory usage before/after (in Mb) with the
    percentage saved, and triggers a garbage collection.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    for col in df.columns:
        col_type = df[col].dtype
        # Skip everything that is not a NumPy integer ('i'/'u') or float ('f')
        # column; min/max range checks are meaningless or unsafe for the rest.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        if col_type.kind in 'iu':
            if len(df[col]) == 0:
                continue  # no values -> no range to inspect
            # Plain Python ints make the range comparisons exact for every
            # integer width, including unsigned values beyond the int64 range.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No signed type fits (very large unsigned values): keep as is.
        else:
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too large, infinite, or undefined (all-NaN / empty).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-sized starting frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

In [None]:
# Display the memory usage of the combined frame (one entry per column plus the index).
data.memory_usage()

In [None]:
# Downcast the numeric columns of the combined frame; reduce_mem prints the before/after size.
data = reduce_mem(data)

In [None]:
# Preview the first rows of the combined frame after downcasting.
data.head()

# 2. Data Processing
Keep only rows with day >= 27 (the 27th onward) for feature construction and model training.

In [None]:
# Extract the date (day) part of the 'hour' column
def get_date(x):
    """Return characters 4-5 of ``str(x)`` as a two-character string.

    Presumably 'hour' is formatted YYMMDDHH, which makes this slice the
    day of the month -- TODO confirm against the data dictionary.
    """
    text = str(x)
    return text[4:6]


# Apply element-wise and convert the extracted text to an integer day.
data['day'] = data['hour'].map(get_date).astype(int)

In [None]:
# Keep the rows whose day is 27 or later; earlier days are discarded.
data = data.loc[data['day'].ge(27)]

In [None]:
# Remove the raw 'id' column and the 'index' column left over from
# resetting the index of the sampled training frame.
for column in ('id', 'index'):
    del data[column]

# 3. Feature Engineering

### 3-1. Count features for every categorical variable

In [None]:
# Build a user identifier by joining device id, device model and device ip
# (each cast to string) with '-' separators.
id_part, model_part, ip_part = (
    data[column].astype(str) for column in ('device_id', 'device_model', 'device_ip')
)
data['user'] = id_part + '-' + model_part + '-' + ip_part
# Release the intermediate string columns right away.
del id_part, model_part, ip_part

In [None]:
# Categorical columns to label-encode and count. Each column is listed once:
# encoding an already-encoded column again would only repeat the same work.
cate_fea = ['user', 'device_id', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C18', 'C19', 'C20', 'C21']
for f in cate_fea:
    # Label-encode the column: codes are 0, 1, 2, ... in order of first
    # appearance, and missing values get the sentinel -1.
    codes, _ = pd.factorize(data[f])
    data[f] = codes.astype('int32')
    # Count feature: how many rows share this row's value.
    data[f + '_count'] = data[f].map(data[f].value_counts())
    # Downcast after every column to keep the peak memory low.
    data = reduce_mem(data)

Delete unused columns

In [None]:
# 'hour' has been reduced to 'day' and 'device_ip' is part of the 'user' key;
# 'C17' is dropped as well.
del data['hour'], data['device_ip'], data['C17']

In [None]:
# Renumber the rows 0..n-1 after the day filter.
data = data.reset_index(drop=True)
# 1-based row identifier; used below as the column that gets counted in the
# group-by aggregations.
data['id'] = data.index + 1
# Rows that were clicked (rows with a missing 'click' value never match).
click_df = data.loc[data['click'].eq(1)]

In [None]:
# Preview the frame after label encoding and the count features.
data.head()

### 3-2. Click frequency & Impression
Calculate click and impression counts at different levels of aggregation

In [None]:
# First day kept in `data`; it has no previous day to draw statistics from.
first_day = 27

for f in [
    ['user'],
    ['banner_pos', 'user'],
    ['C21', 'user'],
    # ...
]:
    prefix = '_'.join(f)
    print('------------------ {} ------------------'.format(prefix))

    group_keys = f + ['day']
    click_col = prefix + '_prev_day_click_count'
    count_col = prefix + '_prev_day_count'
    ctr_col = prefix + '_prev_day_ctr'

    # Clicks per (keys, day). The dict-renaming form `.agg({name: 'count'})`
    # raises SpecificationError on pandas >= 1.0, so the counts are computed
    # with count() and named via reset_index(name=...).
    tmp = click_df[group_keys + ['id']].groupby(group_keys)['id'].count().reset_index(name=click_col)
    tmp['day'] += 1  # shift by one day so each row is joined with the previous day's count
    data = data.merge(tmp, on=group_keys, how='left')
    data[click_col] = data[click_col].fillna(0)  # no match = no clicks on the previous day
    data.loc[data['day'] == first_day, click_col] = np.nan  # unknown for the first day, not zero

    # Impressions (row counts) per (keys, day), shifted the same way.
    tmp = data[group_keys + ['id']].groupby(group_keys)['id'].count().reset_index(name=count_col)
    tmp['day'] += 1
    data = data.merge(tmp, on=group_keys, how='left')
    data[count_col] = data[count_col].fillna(0)
    data.loc[data['day'] == first_day, count_col] = np.nan

    # Previous-day click-through rate; the mean impression count is added to
    # the denominator as a smoothing term.
    data[ctr_col] = data[click_col] / (data[count_col] + data[count_col].mean())

    del tmp

del click_df

data = reduce_mem(data)

In [None]:
# Names of the previous-day features: three statistics for each aggregation level.
cols = [
    '{}_prev_day_{}'.format(level, stat)
    for level in ('user', 'banner_pos_user', 'C21_user')
    for stat in ('click_count', 'count', 'ctr')
]
# Summary statistics of those features.
data[cols].describe()

In [None]:
from scipy.stats import entropy

data = data.reset_index(drop=True)

print('*************************** cross feat (second order) ***************************')
# Second-order cross features; higher-order crosses could be built the same way.
# NOTE(review): device_type is not part of the crossed columns -- confirm whether that is intentional.
cross_cols = ['device_model', 'app_id', 'site_id', 'site_domain', 'banner_pos']
for f in cross_cols:
    for col in cross_cols:
        if col == f:
            # A column is never crossed with itself.
            continue
        print('------------------ {} {} ------------------'.format(f, col))

        nunique_col = 'cross_{}_{}_nunique'.format(f, col)
        ent_col = 'cross_{}_{}_ent'.format(f, col)
        # For every value of `f`: number of distinct `col` values and the
        # entropy of the `col` distribution. Keyword (named) aggregation is
        # used because the dict-renaming form of .agg() raises
        # SpecificationError on pandas >= 1.0.
        data = data.merge(
            data[[f, col]].groupby(f)[col].agg(**{
                nunique_col: 'nunique',
                ent_col: lambda x: entropy(x.value_counts() / x.shape[0]),  # entropy
            }).reset_index(),
            on=f, how='left')

        # Co-occurrence count of the (f, col) pair; computed only once per
        # unordered pair, under whichever name order came first.
        count_col = 'cross_{}_{}_count'.format(f, col)
        reverse_count_col = 'cross_{}_{}_count'.format(col, f)
        if count_col not in data.columns and reverse_count_col not in data.columns:
            data = data.merge(
                data[[f, col]].groupby([f, col]).size().reset_index(name=count_col),
                on=[f, col], how='left')
        pair_count_col = count_col if count_col in data.columns else reverse_count_col

        # Share of the pair within all rows of `f` / of `col` (preference ratios).
        ratio_within_f = 'cross_{}_{}_count_ratio'.format(col, f)
        if ratio_within_f not in data.columns:
            data[ratio_within_f] = data[pair_count_col] / data[f + '_count']
        ratio_within_col = 'cross_{}_{}_count_ratio'.format(f, col)
        if ratio_within_col not in data.columns:
            data[ratio_within_col] = data[pair_count_col] / data[col + '_count']

        # Distinct `col` values per `f` value, relative to how often `f` occurs.
        data['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = data[nunique_col] / data[f + '_count']
    # Downcast after finishing each `f` to limit peak memory.
    data = reduce_mem(data)
gc.collect()

# 4. Modeling

In [None]:
# Rows with a 'click' value form the training frame; rows where it is
# missing form the test frame.
has_label = data['click'].notna()
train_df = data[has_label].reset_index(drop=True)
test_df = data[~has_label].reset_index(drop=True)
del has_label

In [None]:
# Time-based split: days before the 30th are used for fitting, the 30th is
# held out for validation.
fit_days = train_df["day"].lt(30)
holdout_day = train_df["day"].eq(30)

X_train = train_df.loc[fit_days].copy()
y_train = X_train["click"].astype('int8')
X_valid = train_df.loc[holdout_day]
y_valid = X_valid["click"].astype('int8')

del fit_days, holdout_day
gc.collect()

In [None]:
# Every column except the split key ('day') and the target ('click') is used
# as a model feature, in the frame's column order.
# NOTE(review): the synthetic 'id' row counter created earlier is still among
# the columns and therefore ends up in `feature` -- confirm this is intended.
drop_fea = ['day', 'click']
feature = []
for column in train_df.columns:
    if column in drop_fea:
        continue
    feature.append(column)
print(len(feature))
print(feature)

# The combined frame is no longer needed once train_df / test_df exist.
del data
gc.collect()

### 4-1. Build the CatBoostClassifier
CatBoost is a high-performance open source library for gradient boosting on decision trees.

In [None]:
# Install CatBoost
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Model configuration, one setting per line. `iterations` is only an upper
# bound: fitting is stopped early through `early_stopping_rounds` below.
model_params = dict(
    iterations=100000,
    depth=6,
    learning_rate=0.01,
    loss_function='Logloss',
    cat_features=[],
    verbose=True,
    eval_metric='Logloss',
    counter_calc_method='Full',
    task_type='GPU',
    metric_period=50,
)
clf = CatBoostClassifier(**model_params)

# Fit on the earlier days and evaluate on the held-out day.
clf.fit(
    X_train[feature],
    y_train.astype('int32'),
    eval_set=[(X_valid[feature], y_valid.astype('int32'))],
    early_stopping_rounds=200,
    verbose=True,
    use_best_model=True,
)


In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm

# Feature importances of the fitted model, largest first.
score = pd.DataFrame({
    'fea_name': clf.feature_names_,
    'fea': clf.feature_importances_,
})
score = score.sort_values(['fea'], ascending=False)

# Plot the 60 most important features; bar colour scales with the
# importance relative to the strongest feature.
temp = score.head(60)
color = cm.jet(temp['fea'] / temp['fea'].max())
plt.figure(figsize=(10, 15))
plt.barh(temp['fea_name'], temp['fea'], height=0.8, color=color, alpha=0.8)
plt.show()