In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split
import random
import os
from tqdm import tqdm
from multiprocessing import Pool

import pyarrow.parquet as parquet
from sklearn.linear_model import LogisticRegression

In [2]:
zero_features = ['first_image_id']
duplicates = [
        'userOwnerCounters_MOVIE_COMMENT_CREATE',
        'userOwnerCounters_PHOTO_COMMENT_CREATE',
        'userOwnerCounters_PHOTO_MARK_CREATE',
        'userOwnerCounters_PHOTO_PIN_BATCH_CREATE',
        'userOwnerCounters_PHOTO_PIN_UPDATE',
        'userOwnerCounters_PHOTO_VIEW',
        'userOwnerCounters_USER_DELETE_MESSAGE', 
        'userOwnerCounters_USER_FORUM_MESSAGE_CREATE',
        'userOwnerCounters_USER_INTERNAL_LIKE',
        'userOwnerCounters_USER_INTERNAL_UNLIKE', 
        'userOwnerCounters_USER_PHOTO_ALBUM_COMMENT_CREATE', 
        'userOwnerCounters_USER_PRESENT_SEND', 
        'userOwnerCounters_USER_PROFILE_VIEW', 
        'userOwnerCounters_USER_SEND_MESSAGE', 
        'userOwnerCounters_USER_STATUS_COMMENT_CREATE', 
        'userOwnerCounters_VOTE_POLL'
        ]
zero_features2 = [
    'HAS_TEXT',
    'user_is_deleted',
    'user_is_semiactivated',
    'auditweights_userOwner_USER_PROFILE_VIEW',
    'auditweights_hasDetectedText',
    'auditweights_userOwner_PHOTO_VIEW',
    'auditweights_relationMasks',
    'auditweights_userOwner_CREATE_IMAGE',
    'user_is_activated',
    'user_is_active',
    'auditweights_source_MOVIE_TOP',
    'auditweights_closed',
    'user_is_abused',
    'IS_PROMO',
    'auditweights_userOwner_UNKNOWN',
    'IS_CENSORED',
    'IS_DISGUSTING']

In [3]:
cat_columns = ['instanceId_objectType',
           'audit_clientType',
           'metadata_ownerType',
           'metadata_platform',
           'membership_status']
# catboost_cat_columns=[
#         'audit_clientType',
#         'audit_experiment',
#         'audit_resourceType',
#         'instanceId_objectType',
#         'membership_status',
#         'metadata_ownerType',
#         'metadata_platform',
#         'user_ID_country',
#         'user_region',
#         'user_status'
# ]

In [4]:
additional_cat_cols = ['instanceId_userId', 'user_region', 'user_status', 'metadata_authorId', 'metadata_ownerId',
                       'audit_resourceType', 'audit_timestamp_dow', 'metadata_createdAt_dow', 'user_ID_Location', 'user_ID_country']

In [5]:
md_columns = ['HAS_COMPANIONS',
    'HAS_DETECTED_TEXT',
    'HAS_MUSIC',
    'HAS_PHOTOS',
    'HAS_PINS',
    'HAS_POLLS',
    'HAS_TEXT',
    'HAS_URLS',
    'HAS_VIDEOS',
    'IS_CENSORED',
    'IS_DISGUSTING',
    'IS_EXTERNAL_SHARE',
    'IS_GIF',
    'IS_INTERNAL_GROUP_SHARE',
    'IS_INTERNAL_SHARE',
    'IS_PART_OF_ALBUM',
    'IS_PART_OF_TOPIC',
    'IS_PROMO']
feedback_columns = ['Clicked',
    'Commented',
    'Complaint',
    'Disliked',
    'Ignored',
    'Liked',
    'ReShared',
    'Unliked',
    'Viewed']

In [6]:
def convert_int_float(df):
    df_float = df.select_dtypes(include=['float'])
    converted_float = df_float.apply(pd.to_numeric,downcast='float')
    
    df_int = df.select_dtypes(include=['int'])
    converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')
    
    optimized_df = df.copy()
    optimized_df[converted_int.columns] = converted_int
    optimized_df[converted_float.columns] = converted_float
    return optimized_df

In [7]:
def get_first_image_id(ids):
    if ids is not None:
        return ids[0]

def get_df(f):
    print(f)
    df = parquet.read_table(f).to_pandas()
    md = pd.get_dummies(df.metadata_options.apply(pd.Series).stack()).sum(level=0)
    df.drop('metadata_options', inplace=True, axis=1)
    feedback_df = pd.get_dummies(df.feedback.apply(pd.Series).stack()).sum(level=0)
    df.drop('feedback', axis=1, inplace=True)
    df['first_image_id'] = df.ImageId.apply(get_first_image_id)
    df.drop('ImageId', axis=1, inplace=True)
    for c in cat_columns:
        df[c] = df[c].fillna('NULL').astype('category')
    return pd.concat([convert_int_float(df), md, feedback_df], axis=1)

In [8]:
def read_parquet(folder):
    folders = [os.path.join(folder,d) for d in os.listdir(folder) if 'date' in d]
    pool = Pool(14)
    dfs = pool.map(get_df, folders)
    return pd.concat(dfs).reset_index(drop=True)

In [9]:
%%time
df = read_parquet('../../dataset/train/')

../../dataset/train/date=2018-02-01
../../dataset/train/date=2018-02-05
../../dataset/train/date=2018-02-02
../../dataset/train/date=2018-02-04
../../dataset/train/date=2018-02-03
../../dataset/train/date=2018-02-06
../../dataset/train/date=2018-02-07
../../dataset/train/date=2018-02-08
../../dataset/train/date=2018-02-09
../../dataset/train/date=2018-02-14
../../dataset/train/date=2018-02-10
../../dataset/train/date=2018-02-13
../../dataset/train/date=2018-02-12
../../dataset/train/date=2018-02-15
../../dataset/train/date=2018-02-16
../../dataset/train/date=2018-02-17
../../dataset/train/date=2018-02-18
../../dataset/train/date=2018-02-19
../../dataset/train/date=2018-02-20
../../dataset/train/date=2018-02-21
../../dataset/train/date=2018-02-22
../../dataset/train/date=2018-02-23
../../dataset/train/date=2018-02-24
../../dataset/train/date=2018-02-25
../../dataset/train/date=2018-02-26
../../dataset/train/date=2018-02-27
../../dataset/train/date=2018-02-28
../../dataset/train/date=201

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


CPU times: user 8min 6s, sys: 1min 56s, total: 10min 2s
Wall time: 28min 29s


In [10]:
for c in md_columns+feedback_columns:
    df[c] = df[c].fillna(0).astype('uint8')

In [11]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30787847 entries, 0 to 30787846
Columns: 133 entries, Clicked to user_status
dtypes: category(4), float32(72), float64(11), int32(11), int64(1), object(2), uint16(1), uint64(2), uint8(29)
memory usage: 17.3 GB


In [12]:
df = convert_int_float(df)
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30787847 entries, 0 to 30787846
Columns: 133 entries, Clicked to user_status
dtypes: category(4), float32(83), int32(11), int64(1), object(2), uint16(1), uint64(2), uint8(29)
memory usage: 16.0 GB


In [13]:
for c in df.columns:
    un = df[c].nunique()
    if un==0:
        df.drop(c, axis=1, inplace=True)
        print(c,un,'REMOVED')
    if (un==1):
        print(c,un,df[df[c].isnull()].shape[0]/df.shape[0],df[c].unique())
    if len(df[c].unique()) == 1:
        df.drop(c, axis=1, inplace=True)
        print(c,'REMOVED2')

auditweights_closed 1 0.999999935039303 [nan  1.]
auditweights_hasDetectedText 1 0.9999974665328173 [nan  1.]
auditweights_hasText 1 0.9889632100614245 [nan  1.]
auditweights_isRandom 1 0.0 [1.]
auditweights_isRandom REMOVED2
auditweights_notOriginalPhoto 1 0.9708656470847085 [nan  1.]
auditweights_onlineVideo 1 0.9993546154753855 [nan  1.]
auditweights_processedVideo 1 0.9757074276743027 [nan  1.]
auditweights_relationMasks 1 0.999999935039303 [nan  1.]
auditweights_source_MOVIE_TOP 1 0.9999937962534373 [nan  1.]
auditweights_userOwner_PHOTO_VIEW 1 0.9999999675196515 [nan  1.]
auditweights_userOwner_USER_PROFILE_VIEW 1 0.9999999675196515 [       nan 0.97587299]
metadata_applicationId 1 0.0 [0]
metadata_applicationId REMOVED2
userOwnerCounters_COMMENT_INTERNAL_LIKE 1 0.40763038740578383 [nan  0.]
userOwnerCounters_MOVIE_COMMENT_CREATE 1 0.40763038740578383 [nan  0.]
userOwnerCounters_PHOTO_COMMENT_CREATE 1 0.40763038740578383 [nan  0.]
userOwnerCounters_PHOTO_MARK_CREATE 1 0.4076303874

In [14]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30787847 entries, 0 to 30787846
Columns: 131 entries, Clicked to user_status
dtypes: category(4), float32(82), int32(11), int64(1), object(2), uint16(1), uint64(2), uint8(28)
memory usage: 15.9 GB


In [22]:
df.drop('IS_PARTNER_CONTENT', axis=1, inplace=True)

In [48]:
# df = pd.read_hdf('../data/df_md_fb_2.h5', key='c')

In [50]:
df.drop(duplicates, axis=1, inplace=True)

In [51]:
df.to_hdf('../data/df_md_fb_2.h5', key='c', mode='w', format='table')

In [52]:
# df = pd.read_hdf('../data/df_md_fb.h5', key='c')[good_columns + feedback_columns + ['instanceId_userId', 'instanceId_objectId']].sort_values('instanceId_userId')

In [53]:
df1 = pd.read_hdf('../data/df_md_fb_2.h5', key='c').sort_values('instanceId_userId')
df1.shape

(30787847, 114)

In [54]:
df_test = parquet.read_table('../../dataset/test').to_pandas()
md = pd.get_dummies(df_test.metadata_options.apply(pd.Series).stack()).sum(level=0)
df_test.drop('metadata_options', inplace=True, axis=1)
df_test = pd.concat([df_test,md], axis=1)
df_test['first_image_id'] = df_test.ImageId.apply(get_first_image_id)
df_test.drop('ImageId', axis=1, inplace=True)
for c in cat_columns:
    df_test[c] = df_test[c].fillna('NULL').astype('category')
# for i, col in enumerate(df_test.columns):
#     if col in cat_columns:
#         df_test[col] = df_test[col].fillna('NULL').astype('category')
#         print (col, df_test[col].unique())
df_test.shape

(2671862, 124)

In [55]:
df = pd.concat([df1, df_test[df1[:1].drop(feedback_columns, axis=1).columns]])
df.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


(33459709, 114)

In [56]:
import gc
del(df1)
gc.collect()

18703083

In [57]:
for c in cat_columns:
    df[c] = df[c].astype('str').fillna('NULL').astype('category')
    print (c, df[c].unique())

instanceId_objectType [Post, Photo, Video]
Categories (3, object): [Post, Photo, Video]
audit_clientType [API, WEB, MOB]
Categories (3, object): [API, WEB, MOB]
metadata_ownerType [GROUP_OPEN_OFFICIAL, GROUP_OPEN]
Categories (2, object): [GROUP_OPEN_OFFICIAL, GROUP_OPEN]
metadata_platform [OTHER, WEB, ANDROID, MOB, IOS, WINPHONE]
Categories (6, object): [OTHER, WEB, ANDROID, MOB, IOS, WINPHONE]
membership_status [A, NULL, P, Y, I, M, !, B, R]
Categories (9, object): [A, NULL, P, Y, ..., M, !, B, R]


In [58]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33459709 entries, 26684558 to 2671861
Columns: 114 entries, Clicked to user_status
dtypes: category(5), float32(1), float64(75), int32(11), int64(3), object(1), uint8(18)
memory usage: 23.9 GB


In [59]:
df = convert_int_float(df)
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33459709 entries, 26684558 to 2671861
Columns: 114 entries, Clicked to user_status
dtypes: category(5), float32(76), int32(11), int64(1), object(1), uint16(1), uint8(19)
memory usage: 14.1 GB


In [60]:
# df_test[['instanceId_userId','metadata_ownerId', 'instanceId_objectId','auditweights_numDislikes','auditweights_numLikes','auditweights_numShows','audit_timestamp']]\
#     .sort_values(['instanceId_objectId', 'audit_timestamp'])

In [61]:
# df[['instanceId_userId','metadata_ownerId', 'instanceId_objectId','auditweights_numDislikes','auditweights_numLikes','Liked','auditweights_numShows','audit_timestamp']]\
#     .sort_values(['instanceId_objectId', 'audit_timestamp'])

In [62]:
# sorted(df.columns)

In [63]:
def feature_preprocessing(df):
    df['created_to_showed_days'] = ((df.audit_timestamp-df.metadata_createdAt)/86400000).astype(np.float32)
    df['membership_updated_to_showed_days'] = ((df.audit_timestamp-df.membership_statusUpdateDate)/86400000).astype(np.float32)
    for c in ['audit_timestamp','metadata_createdAt']:
        x = pd.to_datetime(df[c],unit='ms')
        df[c+'_sin_time'] = np.sin(2*np.pi*(x.dt.minute + x.dt.hour*60)/1440.0).astype(np.float32)
        df[c+'_cos_time'] = np.cos(2*np.pi*(x.dt.minute + x.dt.hour*60)/1440.0).astype(np.float32)
        df[c+'_dow'] = x.dt.dayofweek
    
    df['likes_to_shows'] = df.auditweights_numLikes/df.auditweights_numShows.astype(np.float32)
    df['dislikes_to_likes'] = df.auditweights_numDislikes/df.auditweights_numLikes.astype(np.float32)
    
    for c in additional_cat_cols:
        df[c] = df[c].fillna(df[c].min()-1).astype(int).astype('str')
    return df 


In [64]:
%%time
df = feature_preprocessing(df)

CPU times: user 12min 25s, sys: 48.6 s, total: 13min 14s
Wall time: 9min 36s


In [65]:
agg_features = ['likes_to_shows','dislikes_to_likes', 'auditweights_svd',
                'userOwnerCounters_USER_FEED_REMOVE', 'userOwnerCounters_CREATE_LIKE',
                'auditweights_ctr_gender', 'metadata_numSymbols', 'audit_pos']
agg_groups = [('User', ['instanceId_userId']), ('Object',['instanceId_objectId']), ('Owner',['metadata_ownerId']),('UserOwner',['instanceId_userId', 'metadata_ownerId'])]
agg_functions = [('Max',max), ('Min',min), ('Mean','mean')]

In [66]:
%%time
for feature in agg_features:
    if df[feature].nunique() < 3:
        af = [('Mean','mean')]
    else:
        af = agg_functions
    for group in agg_groups:
        for agg in af:
            print (agg[0]+'_'+group[0]+'_'+feature)
            df[agg[0]+'_'+group[0]+'_'+feature] = df.groupby(group[1])[feature].transform(agg[1]).astype(np.float32)

Max_User_likes_to_shows
Min_User_likes_to_shows
Mean_User_likes_to_shows
Max_Object_likes_to_shows
Min_Object_likes_to_shows
Mean_Object_likes_to_shows
Max_Owner_likes_to_shows
Min_Owner_likes_to_shows
Mean_Owner_likes_to_shows
Max_UserOwner_likes_to_shows
Min_UserOwner_likes_to_shows
Mean_UserOwner_likes_to_shows
Max_User_dislikes_to_likes
Min_User_dislikes_to_likes
Mean_User_dislikes_to_likes
Max_Object_dislikes_to_likes
Min_Object_dislikes_to_likes
Mean_Object_dislikes_to_likes
Max_Owner_dislikes_to_likes
Min_Owner_dislikes_to_likes
Mean_Owner_dislikes_to_likes
Max_UserOwner_dislikes_to_likes
Min_UserOwner_dislikes_to_likes
Mean_UserOwner_dislikes_to_likes
Max_User_auditweights_svd
Min_User_auditweights_svd
Mean_User_auditweights_svd
Max_Object_auditweights_svd
Min_Object_auditweights_svd
Mean_Object_auditweights_svd
Max_Owner_auditweights_svd
Min_Owner_auditweights_svd
Mean_Owner_auditweights_svd
Max_UserOwner_auditweights_svd
Min_UserOwner_auditweights_svd
Mean_UserOwner_auditweig

In [67]:
%%time
for group in agg_groups:
    df['Last_'+group[0]+'_likes_to_shows'] = (df.groupby(group[1])['auditweights_numLikes'].transform(max)/
                                              df.groupby(group[1])['auditweights_numShows'].transform(max)).astype(np.float32)
    df['Last_'+group[0]+'_dislikes_to_likes'] = (df.groupby(group[1])['auditweights_numDislikes'].transform(max)/
                                  df.groupby(group[1])['auditweights_numLikes'].transform(max)).astype(np.float32)

CPU times: user 16min 42s, sys: 5min 10s, total: 21min 52s
Wall time: 4min 32s


In [68]:
num_columns = df[:1].drop(['instanceId_objectId'] + feedback_columns + cat_columns + additional_cat_cols, axis=1).columns

In [69]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33459709 entries, 26684558 to 2671861
Columns: 228 entries, Clicked to Last_UserOwner_dislikes_to_likes
dtypes: category(5), float32(184), int32(8), int64(1), object(11), uint16(1), uint8(18)
memory usage: 46.3 GB


In [70]:
df = convert_int_float(df)

In [71]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33459709 entries, 26684558 to 2671861
Columns: 228 entries, Clicked to Last_UserOwner_dislikes_to_likes
dtypes: category(5), float32(184), int32(8), int64(1), object(11), uint16(1), uint8(18)
memory usage: 46.3 GB


In [72]:
df.to_hdf('../data/df_train_test_preprocessed.h5', key='c', mode='w', format='table')

# READ

In [12]:
df = pd.read_hdf('../data/df_train_test_preprocessed.h5', key='c').reset_index(drop=True).sort_values('audit_timestamp')

In [13]:
# df[-100:][cat_columns+additional_cat_cols]

In [14]:
df.drop(zero_features2, axis=1, inplace=True)

In [15]:
df['Next_auditweights_numLikes'] = df.groupby('instanceId_objectId').auditweights_numLikes.shift(-1).astype('float32')
df['Next_auditweights_numDislikes'] = df.groupby('instanceId_objectId').auditweights_numDislikes.shift(-1).astype('float32')
df['Next_auditweights_numShows'] = df.groupby('instanceId_objectId').auditweights_numShows.shift(-1).astype('float32')

df['Diff_Next_auditweights_numLikes'] = (df.Next_auditweights_numLikes - df.auditweights_numLikes).astype('float32')
df['Diff_Next_auditweights_numDislikes'] = (df.Next_auditweights_numDislikes - df.auditweights_numDislikes).astype('float32')
df['Diff_Next_auditweights_numShows'] = (df.Next_auditweights_numShows - df.auditweights_numShows).astype('float32')

df['Diff_Next_likes_to_shows'] = (df.Diff_Next_auditweights_numLikes/df.Diff_Next_auditweights_numShows)
df['Diff_Next_dislikes_to_shows'] = (df.Diff_Next_auditweights_numDislikes/df.Diff_Next_auditweights_numShows)
df['Diff_Next_dislikes_to_likes'] = (df.Diff_Next_auditweights_numDislikes/df.Diff_Next_auditweights_numLikes)

df['Next_auditweights_ctr_gender'] = df.groupby(['instanceId_objectId','user_gender']).auditweights_ctr_gender.shift(-1).astype('float32')
df['Diff_Next_auditweights_ctr_gender'] = (df.Next_auditweights_ctr_gender/df.auditweights_ctr_gender - 1).astype('float32')

df['Next_userOwnerCounters_CREATE_LIKE'] = df.groupby(['instanceId_userId','metadata_ownerId']).userOwnerCounters_CREATE_LIKE.shift(-1).astype('float32')
df['Diff_Next_userOwnerCounters_CREATE_LIKE'] = (df.Next_userOwnerCounters_CREATE_LIKE - df.userOwnerCounters_CREATE_LIKE).astype('float32')

df['Next_userOwnerCounters_USER_FEED_REMOVE'] = df.groupby(['instanceId_userId','metadata_ownerId']).userOwnerCounters_USER_FEED_REMOVE.shift(-1).astype('float32')
df['Diff_Next_userOwnerCounters_USER_FEED_REMOVE'] = (df.Next_userOwnerCounters_USER_FEED_REMOVE - df.userOwnerCounters_USER_FEED_REMOVE).astype('float32')

df['Next_auditweights_ctr_high'] = df.groupby('instanceId_objectId').auditweights_ctr_high.shift(-1).astype('float32')
df['Diff_Next_auditweights_ctr_high'] = (df.Next_auditweights_ctr_high/df.auditweights_ctr_high - 1).astype('float32')

df['dislikes_to_shows'] = df.auditweights_numDislikes/df.auditweights_numShows.astype(np.float32)

In [16]:
# df[['instanceId_userId','metadata_ownerId', 'instanceId_objectId', 'auditweights_ctr_high','Next_auditweights_ctr_high','Diff_Next_auditweights_ctr_high','Liked','audit_timestamp']]\
#     .sort_values(['instanceId_objectId', 'audit_timestamp'])

In [17]:
# df[['instanceId_userId','metadata_ownerId', 'instanceId_objectId','userOwnerCounters_CREATE_LIKE','Next_userOwnerCounters_CREATE_LIKE', 'Diff_Next_userOwnerCounters_CREATE_LIKE',
#     'userOwnerCounters_USER_FEED_REMOVE', 'auditweights_ctr_high', 'Liked','audit_timestamp']]\
#     .sort_values(['instanceId_userId','metadata_ownerId', 'audit_timestamp'])

In [18]:
df.sort_index(inplace=True)

In [19]:
# for c in additional_cat_cols:
#     df[c] = df[c].astype(np.float32)
#     df[c] = df[c].fillna(df[c].min()-1).astype(int).astype(str)
# df[-100:][cat_columns+additional_cat_cols].values

In [20]:
test_len = 2671862
df_test = df.iloc[-test_len:].drop(feedback_columns, axis=1)
df_test.shape

(2671862, 220)

In [21]:
(2671862, 236)

(2671862, 236)

In [22]:
df = df.iloc[:-test_len]
df.shape

(30787847, 229)

In [23]:
(30787847, 245)

(30787847, 245)

In [24]:
df.to_hdf('../data/df_train_preprocessed.h5', key='c', mode='w', format='table')

In [25]:
df_test.to_hdf('../data/df_test_preprocessed.h5', key='c', mode='w', format='table')

# Read2

In [8]:
df = pd.read_hdf('../data/df_train_preprocessed.h5', key='c')
df_test = pd.read_hdf('../data/df_test_preprocessed.h5', key='c')

In [9]:
img_features = pd.read_csv('../data/image_classes.csv')

In [10]:
# df = df.sort_values('audit_timestamp')
# df = df[-20000000:]
# df.sort_index(inplace=True)

In [11]:
df = df.merge(img_features, left_on='first_image_id', right_on='hash', how='left')\
    .drop(['first_image_id','hash'], axis=1).rename(columns={'score':'image_class_score'})
df_test = df_test.merge(img_features, left_on='first_image_id', right_on='hash', how='left')\
    .drop(['first_image_id','hash'], axis=1).rename(columns={'score':'image_class_score'})

In [12]:
df['clazz'] = df['clazz'].astype(str)
df_test['clazz'] = df_test['clazz'].astype(str)
additional_cat_cols.append('clazz')

In [13]:
txt_class70 = pd.read_hdf('../data/text_cluster_70.h5', key='c')\
    .rename(columns={'instanceId_userId':'instanceId_objectId'})
txt_class120 = pd.read_hdf('../data/text_cluster_120.h5', key='c')\
    .rename(columns={'instanceId_userId':'instanceId_objectId'})
txt_class70['instanceId_objectId']=txt_class70['instanceId_objectId'].astype(int)
txt_class120['instanceId_objectId']=txt_class120['instanceId_objectId'].astype(int)

In [14]:
df = df.merge(txt_class70, on='instanceId_objectId', how='left').merge(txt_class120, on='instanceId_objectId', how='left')
df_test = df_test.merge(txt_class70, on='instanceId_objectId', how='left').merge(txt_class120, on='instanceId_objectId', how='left')

In [17]:
additional_cat_cols.append('text_cluster_70')
additional_cat_cols.append('text_cluster_120')

In [18]:
# df.to_hdf('../data/df_train_img_txt_preprocessed.h5', key='c', mode='w', format='table')
# df_test.to_hdf('../data/df_test_img_txt_preprocessed.h5', key='c', mode='w', format='table')

In [19]:
train_columns = df[:1].drop(['instanceId_objectId'] + feedback_columns, axis=1).columns
num_columns = df[:1].drop(['instanceId_objectId'] + feedback_columns + cat_columns + additional_cat_cols, axis=1).columns

In [20]:
for c in num_columns:
    if df[c].dtype != 'float32':
        print(c)
        df[c] = df[c].astype('float32')

HAS_COMPANIONS
HAS_DETECTED_TEXT
HAS_MUSIC
HAS_PHOTOS
HAS_PINS
HAS_POLLS
HAS_URLS
HAS_VIDEOS
IS_EXTERNAL_SHARE
IS_GIF
IS_INTERNAL_GROUP_SHARE
IS_INTERNAL_SHARE
IS_PART_OF_ALBUM
IS_PART_OF_TOPIC
audit_pos
metadata_createdAt
metadata_numCompanions
metadata_numPhotos
metadata_numPolls
metadata_numSymbols
metadata_numTokens
metadata_numVideos
metadata_totalVideoLength
image_class_score


In [21]:
# cat_cols = []
# for i, col in enumerate(train_columns):   
#     if col in cat_columns+additional_cat_cols:
#         print (col, df[col].nunique())
#         cat_cols.append(i)

In [22]:
train_objects, val_objects = train_test_split(df.instanceId_objectId.unique(), test_size=0.1, random_state=42)
df_train = df[df.instanceId_objectId.isin(train_objects)]
df_val = df[df.instanceId_objectId.isin(val_objects)]
# X_train = df_train.drop('target', axis=1)
# y_train = df_train.target
# X_val = df_val.drop('target', axis=1)
# y_val = df_val.target
# X_train.shape, X_val.shape

In [23]:
import gc
gc.collect()

679

In [24]:
df_train.shape, df_val.shape

((27718040, 232), (3069807, 232))

In [25]:
%%time
df_train['target'] = (df_train.Liked - df_train.Unliked).clip(0,1).astype(np.float32)
df_val['target'] = df_val.Liked.astype(np.float32)
# for c in feedback_columns:
#     print(c)
#     df_train.drop(c, axis=1, inplace=True)
#     df_val.drop(c, axis=1, inplace=True)
df_train.drop(feedback_columns, axis=1, inplace=True)
df_val.drop(feedback_columns, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


CPU times: user 1min 30s, sys: 27 s, total: 1min 57s
Wall time: 23.5 s


In [26]:
x = df_train.groupby(df_train.instanceId_userId).target.nunique()
df_train = df_train[df_train.instanceId_userId.isin(x[x==2].index)]
y = df_val.groupby(df_val.instanceId_userId).target.nunique()
df_val = df_val[df_val.instanceId_userId.isin(y[y==2].index)]
df_train.shape, df_val.shape

((20625630, 224), (1076397, 224))

In [27]:
1/df_train.target.mean()

4.06487802948628

In [28]:
df_train.shape

(20625630, 224)

In [29]:
from catboost import Pool, CatBoostClassifier, CatBoost, FeaturesData

In [30]:
train_pool = Pool(
    FeaturesData(num_feature_data=df_train[num_columns].values.astype(np.float32),
                 cat_feature_data=df_train[cat_columns+additional_cat_cols].values),
    df_train.target.values)
# train_pool = Pool(df_train[train_columns].values, df_train.target.values, cat_features=cat_cols)

In [31]:
val_pool = Pool(
    FeaturesData(num_feature_data=df_val[num_columns].values.astype(np.float32),
                 cat_feature_data=df_val[cat_columns+additional_cat_cols].values),
    df_val.target.values)
# val_pool = Pool(df_val[train_columns].values, df_val.target.values, cat_features=cat_cols) 

In [None]:
model = CatBoostClassifier(iterations=3000, depth=8, loss_function='Logloss', learning_rate=0.1,
                           eval_metric='AUC', random_seed=42, verbose=2, task_type='GPU', 
                           # gpu_cat_features_storage='CpuPinnedMemory', pinned_memory_size='16gb',
                           gpu_ram_part=0.98,
                           scale_pos_weight=1/df_train.target.mean(),
                           max_ctr_complexity=1)
#train the model
model.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=False)

depth10 -> depth12
bestTest = 0.7985493541
bestIteration = 496
0.7200108101931573
 - collabSubmit113.csv.gz

max_ctr_complexity 1 -> 4
bestTest = 0.8016450531
bestIteration = 499
0.7255148903601538
0.7252125 - collabSubmit121.csv.gz

depth12, max_ctr_complexity 1, GPU
bestTest = 0.7985717058
bestIteration = 498
0.7200780194906244
0.7213254 - collabSubmit241.csv.gz

depth12, max_ctr_complexity 1, GPU, add_features
bestTest = 0.8060864806
bestIteration = 499
0.7341605931245898
0.7301234 - collabSubmit252.csv.gz

depth12, max_ctr_complexity 1, GPU, add_features2
bestTest = 0.8064311147
bestIteration = 499
0.7347634359708437
0.7311602 - collabSubmit261.csv.gz

depth12, max_ctr_complexity 4, CPU, add_features2
bestTest = 0.8079399604
bestIteration = 499
0.7360686718052136
0.7321119 - collabSubmit271.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3
bestTest = 0.8071155548
bestIteration = 498
0.7351772107705281
0.7328861 - collabSubmit321.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3, unlike
bestTest = 0.8070437908
bestIteration = 499
0.7362668403773499
0.732909 - collabSubmit331.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3, unlike, del0
bestTest = 0.8071227074
bestIteration = 499
0.7356318202632838
0.732965 - collabSubmit351.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next
bestTest = 0.8084143996
bestIteration = 498
0.7372449094873829
0.7339634 - collabSubmit361.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 3000 iter
bestTest = 0.8096089661
bestIteration = 1143
0.7393927891787139
0.7341876 - collabSubmit381.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 1200 iter, fix_sort
bestTest = 0.8184744716
bestIteration = 1109
0.7535516521739076
0.7389455 - collabSubmit391.csv.gz

In [None]:
depth12, max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 1200 iter, fix_sort, ctr_high
bestTest = 0.8186135292
bestIteration = 1165
0.7538811142309291
 - collabSubmit401.csv.gz

In [None]:
max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 3000 iter, fix_sort, ctr_high, deph8
bestTest = 
bestIteration = 3999

 - collabSubmit411.csv.gz

In [None]:
max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 3000 iter, fix_sort, ctr_high, deph8
bestTest = 0.8089591861
bestIteration = 2996
0.7491738315796818
0.7247734 (5) - collabSubmit441.csv.gz

In [None]:
max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 3000 iter, fix_sort, ctr_high, deph8
bestTest = 0.8090895712
bestIteration = 2999
0.7493512464035558
0.7246788 (5) - collabSubmit451.csv.gz

In [None]:
max_ctr_complexity 1, GPU, add_features3, unlike, del0, Next, 3000 iter, fix_sort, ctr_high, deph8, only_rankable
bestTest = 0.7588869929
bestIteration = 2999
0.7494765187355047
0.7266737 (5) - collabSubmit461.csv.gz

In [None]:
max_ctr_complexity 1, GPU, add_features3, 3000 iter, deph8, only_rankable, img
bestTest = 0.759110868
bestIteration = 2999
0.7500146814328006
0.7270203 (5) - collabSubmit481.csv.gz

In [46]:
df_val['predictions'] = model.predict(val_pool, prediction_type='RawFormulaVal')
def auc(labels, scores):
    if len(labels) > sum(labels) > 0:
        return roc_auc_score(labels, scores)
    return float('NaN')
df_v = df_val[['instanceId_userId', 'predictions', 'target']]
df_v['instanceId_userId'] = df_v.instanceId_userId.astype(float).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [47]:
%%time
df_v[['instanceId_userId', 'predictions', 'target']].groupby('instanceId_userId')\
    .apply(lambda y: auc(y.target.values, y.predictions.values)).mean()

CPU times: user 3min 4s, sys: 1.92 s, total: 3min 6s
Wall time: 3min 6s


0.7499329616670976

In [48]:
imp = pd.DataFrame(list(zip(model.get_feature_importance(train_pool),np.asarray(list(num_columns) + list(cat_columns+additional_cat_cols))))).set_index(1)
imp.sort_values(0, ascending=True)

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
Min_Object_userOwnerCounters_USER_FEED_REMOVE,0.001699
metadata_createdAt_dow,0.002022
HAS_COMPANIONS,0.002104
auditweights_onlineVideo,0.002910
userOwnerCounters_CREATE_MOVIE,0.002973
auditweights_userOwner_CREATE_COMMENT,0.003814
user_status,0.004056
Min_Owner_audit_pos,0.004494
Min_Owner_userOwnerCounters_USER_FEED_REMOVE,0.004603
HAS_CHEATED_TEXT,0.004640


In [49]:
imp = imp.sort_values(0, ascending=False)
list(imp[imp[0]==0].index)

[]

In [50]:
imp.to_csv('../imp/features502.csv')

In [None]:
res = df_test[['instanceId_userId', 'instanceId_objectId']]

In [None]:
df_test = df_test[train_columns]

In [None]:
test_pool = Pool(
    FeaturesData(num_feature_data=df_test[num_columns].values.astype(np.float32),
                 cat_feature_data=df_test[cat_columns+additional_cat_cols].values))

In [52]:
res['prediction'] = model.predict(test_pool, prediction_type='RawFormulaVal')

In [53]:
res[:3]

Unnamed: 0,instanceId_userId,instanceId_objectId,prediction
0,602,25708174,1.217894
1,602,24393746,0.185588
2,803,25400331,-1.004105


In [54]:
SUBMIT_ID = 502
res.to_hdf('../predictions/result{0}.h5'.format(SUBMIT_ID), key='c', mode='w', format='table')

r = res.sort_values(by=['instanceId_userId', 'prediction'], ascending = (1,0))
submit = r.groupby("instanceId_userId")['instanceId_objectId'].apply(list)
submit.to_csv('../submissions/collabSubmit{}_fixed.csv.gz'.format(SUBMIT_ID), header = False, compression='gzip')

In [None]:
df.instanceId_userId.nunique(), df_test.instanceId_userId.nunique()

In [9]:
len(set(df.instanceId_userId) & set(df_test.instanceId_userId))

226695

In [57]:
df.instanceId_objectId.nunique(), df_test.instanceId_objectId.nunique()

(2936576, 325333)

In [58]:
len(set(df.instanceId_objectId) & set(df_test.instanceId_objectId))

237

In [60]:
df.metadata_ownerId.nunique(), df_test.metadata_ownerId.nunique()

(81272, 41917)

In [61]:
len(set(df.metadata_ownerId.unique())&set(df_test.metadata_ownerId.unique()))

41321

In [54]:
df.instanceId_objectId.astype(np.int32).sort_values()

4946045            4
6869544            5
16777446           5
5805959            5
11653345           6
16022439           6
10222943           6
2079063            6
6314355            9
7037492            9
2742419            9
2938895            9
13576051           9
9523476           12
1534641           12
4809764           14
15103271          14
18008520          15
5358744           15
4669565           15
11101842          16
7307918           17
17615566          17
11441941          17
2547592           17
2507888           17
1242875           17
7175505           17
15856619          17
3450605           18
              ...   
13266521    39082219
13621772    39082220
13313714    39082222
13406240    39082222
13370170    39082222
13404648    39082222
13374295    39082222
13503519    39082222
13541212    39082222
13390863    39082222
13395891    39082222
13485091    39082222
13486158    39082222
13533236    39082222
13608756    39082223
13474751    39082223
13584953    3

In [55]:
df.instanceId_userId.astype(np.int32).sort_values()

11641346           3
5992427            3
5992428            6
13786300           9
2607358           12
6794970           15
2824890           15
173384            15
15005587          15
9300061           15
3208966           15
13466446          15
9684334           15
5874662           18
8519890           21
7371870           21
4559145           36
12634140          39
2716815           39
3098515           39
4274528           39
2330308           39
2330309           39
18011799          39
3881770           39
17625036          39
4274529           39
18011798          39
6691677           39
13732682          39
              ...   
7918971     15716385
13624193    15716385
7918972     15716385
13571412    15716388
13518694    15716394
7647447     15716397
7647449     15716418
7647448     15716418
7918973     15716463
7810450     15716469
13466444    15716472
13624194    15716478
13624195    15716478
7918974     15716478
13571413    15716505
13310029    15716523
13310030    1

In [56]:
df.metadata_ownerId.astype(np.int32).sort_values()

8146190         1
6351758         1
3278319         1
11971078        1
11825805        1
3217325         1
5281427         2
14951079        2
11876252        2
4638776         2
15337707        2
2612839         2
13427850        3
11649056        3
13435217        3
8967786         3
15229088        3
8799007         3
10524167        3
14570592        3
11309157        3
9677001         3
6467756         3
8855236         3
6172614         3
8689000         3
9596194         3
6549290         3
7778363         3
9193236         3
            ...  
13329973    85854
3581270     85855
13260904    85856
1427978     85857
3576989     85858
7749778     85858
3739627     85858
1147887     85860
1147885     85860
16105306    85862
7863417     85862
17232880    85865
11749942    85865
3626573     85865
2222729     85865
17381132    85865
7643805     85865
2501007     85867
2330292     85868
7579999     85870
7543628     85875
5827788     85875
1106010     85875
13474207    85875
2505528   