### Feature preparation

In [1]:
import pandas as pd
import os
from sklearn.utils import shuffle
import warnings
from sklearn.preprocessing import OneHotEncoder # libFM part
from sklearn.datasets import dump_svmlight_file # libFM part
import numpy as np

warnings.filterwarnings('ignore')

In [2]:
def df_shuffle(df):
    df = shuffle(df)
    df.reset_index(inplace = True)
    df.drop('index', inplace = True, axis = 1)
    return df

In [13]:
def libffm(ffm_p, tr_p, v_p, latent, cores, autostop = True, lreg = 0.00002, nrounds = 200, print_s = True, on_disc = False):
    
    if autostop:
        fm_cmd = r"%s -p %s -s %s -k %s -l %s -t %s -r 0.2 --auto-stop %s" % (ffm_p, v_p, cores, latent, str(lreg), 
                                                                                 str(nrounds), tr_p)
    else:
        fm_cmd = r"%s -p %s -s %s -k %s -l %s -t %s -r 0.2 %s" % (ffm_p, v_p, cores, latent, str(lreg), 
                                                                                 str(nrounds), tr_p)
        
    if on_disc:
        fm_cmd = r"%s  -p %s -s 4 -k 10 -t 200  --no-rand --on-disk --auto-stop %s" % (ffm_p, v_p, tr_p)
        print(os.popen(fm_cmd).read())
        
    else:
        if print_s:
            print(fm_cmd)
        log = os.popen(fm_cmd).read()
        print(log)
        try:
            result = log.split('\n')
            loss = float(result[len(result) - 4].split('      ')[2])
        except:
            loss = -1000
        print('Val logloss: ', loss)

In [4]:
def libffm_predict(ffm_p, model_p, test_p,  out_p):
    fm_cmd = r"%s %s %s %s" % (ffm_p, test_p, model_p, out_p)
    print(fm_cmd)
    try:
        os.system(fm_cmd)
        print('Predicted: 1')
    except Exception as e:
        print('Predicted: 0')
        print(e)

In [5]:
def convert_to_ffm(df,type,numerics,categories,features, bpath, model_type = ''):
        currentcode = len(numerics)
        catdict = {}
        catcodes = {}
        # Flagging categorical and numerical fields
        for x in numerics:
             catdict[x] = 0
        for x in categories:
             catdict[x] = 1

        nrows = df.shape[0]
        ncolumns = len(features)
        counter = 0
        with open(bpath + str(type) + "_" + model_type + "_ffm.txt", "w") as text_file:

            # Looping over rows to convert each row to libffm format
            for n, r in enumerate(range(nrows)):
                datastring = ""
                datarow = df.iloc[r].to_dict()
                datastring += str(int(datarow['click']))
                # For numerical fields, we are creating a dummy field here
                for i, x in enumerate(catdict.keys()):
                    if(catdict[x]==0):
                        datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[x])
                    else:
                 # For a new field appearing in a training example
                        if(x not in catcodes):
                            catcodes[x] = {}
                            currentcode +=1
                            catcodes[x][datarow[x]] = currentcode #encoding the feature
                 # For already encoded fields
                        elif(datarow[x] not in catcodes[x]):
                            currentcode +=1
                            catcodes[x][datarow[x]] = currentcode #encoding the feature
                        code = catcodes[x][datarow[x]]
                        datastring = datastring + " "+str(i)+":"+ str(int(code))+":1"

                datastring += '\n'
                text_file.write(datastring)
                if counter % 10**5 == 0:
                    print('Completed: ', round(counter / nrows, 2))
                counter += 1

In [6]:
def get_hour(x):
    return int(str(x)[6:])

def get_date(x):
    return int(str(x)[:6])

dict_day = {'21' : 1, '22' : 2, '23' : 3, '24' : 4, '25' : 5, '26' : 6, '27' : 0, '28' : 1, '29' : 2, '30' : 3, '31' : 4}
def get_day(x):
    return dict_day[str(x)[4:6]]

In [7]:
base_path = 'D:/Downloads/avazu_feedzai/'


libffm_path = r'D:\Downloads\avazu_feedzai\libffm\ffm-train.exe'
libffm_ftlr_path = r'D:\Downloads\avazu_feedzai\libffm-ftrl-master\libffm-ftrl-master\ffm-train.exe'
libffm_path_predict = r'D:\Downloads\avazu_feedzai\libffm\ffm-predict.exe'
libffm_ftlr_predict = r'D:\Downloads\avazu_feedzai\libffm-ftrl-master\libffm-ftrl-master\ffm-predict.exe'

train_libffm = r'D:\Downloads\avazu_feedzai\train_ffm.txt'
val_libffm = r'D:\Downloads\avazu_feedzai\val_ffm.txt'
test_libffm = r'D:\Downloads\avazu_feedzai\val_ffm.txt'

train_libffm_m = r'D:\Downloads\avazu_feedzai\train_mob_ffm.txt'
val_libffm_m = r'D:\Downloads\avazu_feedzai\val_mob_ffm.txt'
test_libffm_m = r'D:\Downloads\avazu_feedzai\test_mob_ffm.txt'

train_libffm_a = r'D:\Downloads\avazu_feedzai\train_app_ffm.txt'
val_libffm_a = r'D:\Downloads\avazu_feedzai\val_app_ffm.txt'
test_libffm_a = r'D:\Downloads\avazu_feedzai\test_app_ffm.txt'

out_a = r'D:\Downloads\avazu_feedzai\app_ffm.txt'
out_m = r'D:\Downloads\avazu_feedzai\mob_ffm.txt'

model_a = r'D:\Downloads\avazu_feedzai\train_app_ffm.txt.model'
model_m = r'D:\Downloads\avazu_feedzai\train_mob_ffm.txt.model'

In [8]:
def get_features(base_path, model_type, nrows):

    train = pd.read_csv(base_path + 'train.gz', nrows = nrows)
    # train['web'] = 1 - (train['site_id'] == '85f751fd')*1
    if model_type == 'app':
        train = train[train['site_id'] == '85f751fd']
    else:
        train = train[train['site_id'] != '85f751fd']
        
    print("Sample set shape: ",  train.shape[0])

    train_index = int(0.8*train.shape[0])

    train = df_shuffle(train)

    # process hour

    train['day'] = train['hour'].map(get_day)
    train['time'] = train['hour'].map(get_hour)
    train['date'] = train['hour'].map(get_date)

    # generate features

    # define user as device_id + device_model + device_ip

    train['user'] = train['device_id'] + train['device_model'] + train['device_ip']

    # for each user we calculate his characteristics per hour, per day, total

    for c in ['hour', 'date']:
        train['temp'] = train['user'] + train[c].astype(str)
        dicty = train.groupby('temp')['user'].count()
        train['user' + '_count_' + c] = train['temp'].map(dicty)
        print(c)
    train.drop('temp', axis = 1, inplace = True)

    for c in ['hour', 'date']:
        train['temp'] = train['user'] + train[c].astype(str)
        for cc in ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain',  'app_category', 'C14',
           'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']:
            dicty = train.groupby('temp')[cc].nunique()
            train['user' + '_nunique_' + c + '_' + cc] = train['temp'].map(dicty)
            if abs(train[['user' + '_nunique_' + c + '_' + cc, 'click']].corr().values[0][1]) < 0.05:
                train.drop('user' + '_nunique_' + c + '_' + cc, axis = 1, inplace = True)
                print('drop user' + '_nunique_' + c + '_' + cc)
        print(c)
    train.drop('temp', axis = 1, inplace = True)

    train.head(5)

    # define an interaction of ad as site_id + app_id
    
    train['place_id'] = train['site_id'] + train['app_id']
    train['place_genre_id'] = train['site_id'] + train['app_id'] + train['site_category'] + train['app_category']
    train['tech_position'] = train['banner_pos'].astype(str) + train['device_conn_type'].astype(str)
    train['add_position'] = train['place_id'].astype(str) + train['banner_pos'].astype(str)
    train['union_category'] = train['site_category'] + train['app_category']

    train['ultra_C_type'] = train['C1'].astype(str) + train['C14'].astype(str) + train['C15'].astype(str) + train['C16'].astype(str) \
     + train['C17'].astype(str) + train['C18'].astype(str) + train['C19'].astype(str) + train['C20'].astype(str)+ train['C21'].astype(str)

    # for features we calculate how oftern they are met per hour, per day and in total

    train['user_date'] = train['user'].astype(str) + train['date'].astype(str)
    train['place_date'] = train['place_id'].astype(str) + train['date'].astype(str)

    counter = 0

    history_user = []
    history_place = []

    dict_user = {}
    dict_place = {}

    for row in train.itertuples():

        user = row[list(train.columns).index('user_date') + 1]
        place = row[list(train.columns).index('place_date') + 1]


        try:
            history_user.append(dict_user[user])
        except KeyError:
            history_user.append(0)
        try:
            history_place.append(dict_place[place])
        except KeyError:
            history_place.append(0)


        try:
            dict_user[user] += 1
        except KeyError:
            dict_user[user] = 0
        try:
            dict_place[place] += 1
        except KeyError:
            dict_place[place] = 0

        if counter % 10**6 == 0:
            print(counter / train.shape[0])
        counter += 1

    train['user_history'] = history_user
    train.ix[train['user_history'] > 10*3, 'user_history'] = 1000
    train['place_history'] = history_place
    train.ix[train['place_history'] > 10*3, 'place_history'] = 1000
    
    train.drop(['user_date', 'place_date'], axis = 1, inplace = True)

    # check how many unique values we have got

    for c in train.columns:
        print('Column %s contains %s unique values. It is %s' % (c, train[c].nunique(), round((train[c].nunique() / train.shape[0]) * 100, 2))  + ' %')

    avoid = ['click', 'hour', 'id', 'date']
    features = [t for t in train.columns if t not in avoid]

    # delete columns with 1 unique values
    drop_cols = []
    for c in features:
        if train[c].nunique() == 1:
            features.remove(c)

    # replace values with low frequency
    for c in features:
        dicty = train.groupby(c)['click'].count()
        train['temp'] = train[c].map(dicty)
        train.ix[train['temp'] < 10, c] = -999
    train.drop('temp', axis = 1, inplace = True)
    return features, train

# Save features

def save_ffm(train, bpath, name, features):
    convert_to_ffm(train,name,[],features,features, base_path)

In [9]:
# Features for app model

In [12]:
features, train = get_features(base_path, 'app', 5*10**5)
train_index = int(0.8*train.shape[0])
save_ffm(train.ix[:train_index,:], base_path, 'train', features)
save_ffm(train.ix[train_index:,:], base_path, 'val', features)

Sample set shape:  153042
hour
date
drop user_nunique_hour_app_id
drop user_nunique_hour_app_domain
drop user_nunique_hour_app_category
drop user_nunique_hour_C14
drop user_nunique_hour_C15
drop user_nunique_hour_C16
drop user_nunique_hour_C17
drop user_nunique_hour_C18
drop user_nunique_hour_C19
drop user_nunique_hour_C21
hour
drop user_nunique_date_app_id
drop user_nunique_date_app_domain
drop user_nunique_date_app_category
drop user_nunique_date_C14
drop user_nunique_date_C15
drop user_nunique_date_C16
drop user_nunique_date_C17
drop user_nunique_date_C18
drop user_nunique_date_C19
drop user_nunique_date_C21
date
0.0
Column id contains 153042 unique values. It is 100.0 %
Column click contains 2 unique values. It is 0.0 %
Column hour contains 4 unique values. It is 0.0 %
Column C1 contains 7 unique values. It is 0.0 %
Column banner_pos contains 4 unique values. It is 0.0 %
Column site_id contains 1 unique values. It is 0.0 %
Column site_domain contains 1 unique values. It is 0.0 %
Co

In [14]:
libffm(libffm_path,train_libffm, val_libffm, 6, 6, lreg = 0.00002)

D:\Downloads\avazu_feedzai\libffm\ffm-train.exe -p D:\Downloads\avazu_feedzai\val_ffm.txt -s 6 -k 6 -l 2e-05 -t 200 -r 0.2 --auto-stop D:\Downloads\avazu_feedzai\train_ffm.txt
First check if the text file has already converted to binary format (0.2 seconds)
Binary file found. Skip converting text to binary
First check if the text file has already converted to binary format (0.2 seconds)
Binary file found. Skip converting text to binary
iter   tr_logloss   va_logloss      tr_time
   1      0.44175      0.43207          2.4
   2      0.43137      0.42884          4.8
   3      0.42874      0.42670          7.2

Val logloss:  0.43207


In [14]:
# Features for mob model

In [15]:
features, train = get_features(base_path, 'mob', 10**3)
train_index = int(0.8*train.shape[0])
save_ffm(train.ix[:train_index,:], base_path, 'train', features)
save_ffm(train.ix[train_index:,:], base_path, 'val', features)

Sample set shape:  790
hour
date
drop user_nunique_hour_site_id
drop user_nunique_hour_site_domain
drop user_nunique_hour_site_category
drop user_nunique_hour_C14
hour
drop user_nunique_date_site_id
drop user_nunique_date_site_domain
drop user_nunique_date_site_category
drop user_nunique_date_C14
date
0.0
Column id contains 790 unique values. It is 100.0 %
Column click contains 2 unique values. It is 0.25 %
Column hour contains 1 unique values. It is 0.13 %
Column C1 contains 3 unique values. It is 0.38 %
Column banner_pos contains 2 unique values. It is 0.25 %
Column site_id contains 121 unique values. It is 15.32 %
Column site_domain contains 109 unique values. It is 13.8 %
Column site_category contains 10 unique values. It is 1.27 %
Column app_id contains 1 unique values. It is 0.13 %
Column app_domain contains 1 unique values. It is 0.13 %
Column app_category contains 1 unique values. It is 0.13 %
Column device_id contains 33 unique values. It is 4.18 %
Column device_ip contains 70

In [16]:
libffm(libffm_path,train_libffm, val_libffm, 10, 6, lreg = 0.00002)

D:\Downloads\avazu_feedzai\libffm\ffm-train.exe -p D:\Downloads\avazu_feedzai\val_ffm.txt -s 6 -k 10 -l 2e-05 -t 200 -r 0.2 --auto-stop D:\Downloads\avazu_feedzai\train_ffm.txt
First check if the text file has already converted to binary format (0.2 seconds)
Binary file found. Skip converting text to binary
First check if the text file has already converted to binary format (0.2 seconds)
Binary file found. Skip converting text to binary
iter   tr_logloss   va_logloss      tr_time
   1      0.44192      0.43186          3.1
   2      0.43141      0.42890          6.2
   3      0.42866      0.42672          9.2
   4      0.42663      0.42461         12.2

Val logloss:  0.4289


In [17]:
# save features

In [18]:
sub = pd.DataFrame()
sub['features'] = features
sub.to_csv(base_path + 'features.csv')

### Training part for the whole data

In [52]:
# app model

In [10]:
def get_ffm_txt_full(train, base_path, model_type, features_list, sample_size = 1.0):

    if model_type == 'app':
        train = train[train['site_id'] == '85f751fd']
    else:
        train = train[train['site_id'] != '85f751fd']
        
        
    # process hour

    train['day'] = train['hour'].map(get_day)
    train['time'] = train['hour'].map(get_hour)
    train['date'] = train['hour'].map(get_date)

    # generate features

    # define user as device_id + device_model + device_ip

    train['user'] = train['device_id'] + train['device_model'] + train['device_ip']

    # for each user we calculate his characteristics per hour, per day, total

    for c in ['hour', 'date']:
        train['temp'] = train['user'] + train[c].astype(str)
        dicty = train.groupby('temp')['user'].count()
        train['user' + '_count_' + c] = train['temp'].map(dicty)
        print(c)
    train.drop('temp', axis = 1, inplace = True)

    for c in ['hour', 'date']:
        train['temp'] = train['user'] + train[c].astype(str)
        for cc in ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain',  'app_category', 'C14',
           'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']:
            dicty = train.groupby('temp')[cc].nunique()
            train['user' + '_nunique_' + c + '_' + cc] = train['temp'].map(dicty)
        print(c)
    train.drop('temp', axis = 1, inplace = True)

    # define an interaction of ad as site_id + app_id

    train['place_id'] = train['site_id'] + train['app_id']
    train['place_genre_id'] = train['site_id'] + train['app_id'] + train['site_category'] + train['app_category']
    train['tech_position'] = train['banner_pos'].astype(str) + train['device_conn_type'].astype(str)
    train['add_position'] = train['place_id'].astype(str) + train['banner_pos'].astype(str)
    train['union_category'] = train['site_category'] + train['app_category']

    train['ultra_C_type'] = train['C1'].astype(str) + train['C14'].astype(str) + train['C15'].astype(str) + train['C16'].astype(str) \
     + train['C17'].astype(str) + train['C18'].astype(str) + train['C19'].astype(str) + train['C20'].astype(str)+ train['C21'].astype(str)

    

    train['user_date'] = train['user'].astype(str) + train['date'].astype(str)
    train['place_date'] = train['place_id'].astype(str) + train['date'].astype(str)

    counter = 0

    history_user = []
    history_place = []

    dict_user = {}
    dict_place = {}
    
    # for features user and place we calculate how oftern they are met per date in cummulative way

    for row in train.itertuples():

        user = row[list(train.columns).index('user_date') + 1]
        place = row[list(train.columns).index('place_date') + 1]


        try:
            history_user.append(dict_user[user])
        except KeyError:
            history_user.append(0)
        try:
            history_place.append(dict_place[place])
        except KeyError:
            history_place.append(0)


        try:
            dict_user[user] += 1
        except KeyError:
            dict_user[user] = 0
        try:
            dict_place[place] += 1
        except KeyError:
            dict_place[place] = 0

        if counter % 10**6 == 0:
            print(counter / train.shape[0])
        counter += 1

    train['user_history'] = history_user
    train.ix[train['user_history'] > 10*3, 'user_history'] = 1000
    train['place_history'] = history_place
    train.ix[train['place_history'] > 10*3, 'place_history'] = 1000
    
    train.drop(['user_date', 'place_date'], axis = 1, inplace = True)
    
    train = pd.concat([train[train['type'] == 1].sample(frac=sample_size, replace=False, random_state = 1),
                       train[train['type'] == 0]])
    
    train.reset_index(inplace = True)
    train.drop('index', inplace = True, axis = 1)
        
    train_shape = train[train['type'] == 1].shape[0]
    train.drop('type', axis = 1, inplace = True)
    
    train_index = int(0.8*train_shape)
    convert_to_ffm(train.ix[:train_index, :],'train',[],features_list,features_list, base_path, model_type = model_type)
    convert_to_ffm(train.ix[train_index:train_shape - 1, :],'val',[],features_list,features_list, base_path, model_type = model_type)
    convert_to_ffm(train.ix[train_shape:, :],'test',[],features_list,features_list, base_path, model_type = model_type)

In [11]:
train = pd.read_csv(base_path + 'train.gz')
test = pd.read_csv(base_path + 'test.gz')
test['click'] = 0

train['type'] = 1
test['type'] = 0

cols = [t for t in train.columns]
data = pd.concat([train[cols], test[cols]])


features = list(pd.read_csv(base_path + 'features.csv')['features'].values)

del train
del test

In [1]:
get_ffm_txt_full(data, base_path, 'app', features)

In [None]:
# mob model

In [None]:
train = pd.read_csv(base_path + 'train.gz')
test = pd.read_csv(base_path + 'test.gz')
test['click'] = 0

train['type'] = 1
test['type'] = 0

cols = [t for t in train.columns]
data = pd.concat([train[cols], test[cols]])


features = list(pd.read_csv(base_path + 'features.csv')['features'].values)

del train
del test

get_ffm_txt_full(data, base_path, 'mob', features)

In [25]:
# Training and prediction for app

In [17]:
df = pd.read_csv(r'D:\Downloads\avazu_feedzai\ffm_txt\train_app_ffm.txt', nrows = 10**6, header = None)

In [18]:
df.to_csv(r'D:\Downloads\avazu_feedzai\ffm_txt\train_app_ffmm.txt', index = False)

In [None]:
libffm(libffm_path, train_libffm_a, val_libffm_a, 10, 6, autostop = True, lreg = 0.00002, print_s = True)

In [37]:
libffm_predict(libffm_path_predict, model_a, train_libffm_a,  out_a)

D:\Downloads\avazu_feedzai\libffm\ffm-predict.exe D:\Downloads\avazu_feedzai\train_app_ffm.txt D:\Downloads\avazu_feedzai\train_app_ffm.txt.model D:\Downloads\avazu_feedzai\app_ffm.txt
Predicted: 1


In [None]:
# Training and prediction for mob

In [9]:
libffm(libffm_path,train_libffm_m, val_libffm_m, 10, 6, lreg = 0.00002)

First check if the text file has already converted to binary format (0.0 seconds)
Binary file NOT found. Convert text file to binary file (273.6 seconds)
First check if the text file has already converted to binary format (0.0 seconds)
Binary file NOT found. Convert text file to binary file (69.3 seconds)
iter   tr_logloss   va_logloss      tr_time

Val logloss:  -1000


-1000

In [None]:
libffm_predict(libffm_path_predict, model_m, test_libffm_m,  out_m)

### Combine libFFM predictions

In [None]:
sub = pd.DataFrame()
sub['site_id'] = test['site_id']
sub['click'] = 0

mob_pred = pd.read_csv(base_path + 'out_mob.txt', header = None)
app_pred = pd.read_csv(base_path + 'out_app.txt', header = None)

sub.ix[sub['site_id'] != '85f751fd', 'click'] = mob_pred[0].values
sub.ix[sub['site_id'] == '85f751fd', 'click'] = app_pred[0].values

sub[['id', 'click']].to_csv(base_path + 'submission.csv', index = False)