In [1]:
import pandas as pd
import numpy as np
import catboost as cat

In [2]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place (columns are reassigned) and also returned.

    Per-column rules:
      * object / string columns are converted to ``category``;
      * signed integer columns are cast to the smallest of int8/int16/int32
        whose range contains the column's min and max (bounds inclusive);
        otherwise they are left unchanged;
      * float columns are cast to the smallest of float16/float32 whose finite
        range contains the column's min and max, otherwise to float64;
      * every other dtype (bool, unsigned int, datetime, category, other
        extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    allow_float16 : bool, default True
        When True (the historical behaviour) floats may be cast to float16.
        The check is range-based only, so precision can be lost: float16 has
        an 11-bit significand. Pass False to use float32 as the smallest
        float type.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    signed_int_ladder = (np.int8, np.int16, np.int32)
    float_ladder = (np.float16, np.float32) if allow_float16 else (np.float32,)
    # pandas >= 1.0 ships a dedicated string extension dtype; on older versions
    # the attribute is missing and the empty tuple makes isinstance() False.
    string_dtype = getattr(pd, 'StringDtype', ())

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, string_dtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast. Skipping the
        # rest keeps bools as bools and makes the function safe to call on a
        # frame that already holds categorical or datetime columns.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'i':
            # For an empty column min/max are NaN, every comparison is False
            # and the column keeps its dtype.
            for candidate in signed_int_ladder:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when nothing smaller fits (e.g. the column
            # contains +/-inf or is entirely NaN).
            target = np.float64
            for candidate in float_ladder:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    return df

def load_data(path):
    """Load the user, item and behaviour tables and build the merged behaviour log.

    Parameters
    ----------
    path : str
        Prefix that is string-concatenated with ``'user.csv'``, ``'item.csv'``
        and ``'user_behavior.csv'``; for a directory it must therefore end with
        a path separator.

    Returns
    -------
    (user, item, data)
        ``user`` has columns userID/sex/age/ability, ``item`` has columns
        itemID/category/shop/brand. ``data`` is the behaviour log with derived
        columns (day, hour, one-hot behaviour flags, day_hour, weighted
        behaviour) left-joined with ``item`` and ``user``.

    Raises
    ------
    ValueError
        If the behaviour column contains a non-null value other than
        'pv', 'fav', 'cart' or 'buy'.
    """
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None, engine='c'))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv', header=None, engine='c'))
    data = pd.read_csv(path + 'user_behavior.csv', header=None, engine='c')

    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    # NOTE(review): assumes `timestamp` is expressed in seconds - confirm.
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24

    # One-hot encode the behaviour type (one 0/1 column per behaviour name).
    raw_behavior = data['behavior']
    for name in ['pv', 'fav', 'cart', 'buy']:
        data[name] = (raw_behavior == name).astype('int64')

    # Time-decayed behaviour weight.
    # NOTE(review): 'buy' carries the same weight as 'pv' (1); this is kept
    # exactly as in the original code - confirm that it is intentional.
    behavior_weights = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 1}
    weight = raw_behavior.map(behavior_weights)
    unknown = weight.isna() & raw_behavior.notna()
    if unknown.any():
        raise ValueError(
            'unexpected behavior values: %r' % sorted(set(raw_behavior[unknown]))
        )

    data['day_hour'] = data['day'] + data['hour'] / float(24)
    max_day = data['day'].max()
    min_day = data['day'].min()
    time_decay = 1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)
    # Mapping to an explicit float column keeps the product numeric. Replacing
    # the strings in place left an object column, so the weighted result was
    # object too and reduce_mem_usage turned it into a categorical.
    data['behavior'] = time_decay * weight.astype('float64')

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID', how='left', sort=False)
    data = pd.merge(left=data, right=user, on='userID', how='left', sort=False)

    return user, item, data
    

In [3]:
# Load the user table, the item table and the behaviour log (already joined
# with item and user attributes inside load_data).
user, item, data = load_data(path='../ECommAI_EUIR_round2_train_20190816/')

# Coarsen `age` by integer-dividing it by 10. The user columns were copied into
# `data` by the merge in load_data, so both frames are updated the same way.
for frame in (user, data):
    frame['age'] = frame['age'].floordiv(10)

In [4]:
# TODO: this cell needs to be modified!
# The file paths need to be changed as well.
# Read the seven recall-candidate CSV files (file-name prefixes 0..6) and
# shrink each frame's dtypes as it is loaded.
recall_train_list = [
    reduce_mem_usage(
        pd.read_csv(str(i) + 'recall_list_round2_15day_300lenth-Copy1.csv', engine='c')
    )
    for i in range(7)
]

In [24]:
# Stack the per-file candidate frames into one frame and replace missing
# values with 0. The original row labels are kept, so index values repeat.
recall_train = pd.concat(recall_train_list, sort=False).fillna(0)

In [27]:
def downsample(df, percent=10, random_state=None):
    """Subsample the label-0 rows of ``df`` and keep every other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column; rows with ``label == 0`` are the ones
        that get subsampled.
    percent : int, default 10
        Number of label-0 rows to keep, expressed as a multiple of the number
        of rows whose label is not 0.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``DataFrame.sample`` so the draw can be reproduced. With
        None, pandas draws from numpy's global random state.

    Returns
    -------
    pandas.DataFrame
        The sampled label-0 rows followed by all rows with a non-zero label.

    Notes
    -----
    Rows are drawn without replacement, so no label-0 row is duplicated. When
    more rows are requested than exist, all label-0 rows are kept. An input
    without label-0 rows is returned without raising.
    """
    data1 = df[df['label'] != 0]
    data0 = df[df['label'] == 0]

    # Never ask for more rows than are available.
    n_keep = min(len(data0), int(percent * len(data1)))
    if n_keep > 0:
        lower_data0 = data0.sample(n=n_keep, replace=False, random_state=random_state)
    else:
        lower_data0 = data0.iloc[:0]

    return pd.concat([lower_data0, data1])

In [None]:
# Keep every row with a non-zero label and subsample the label-0 rows
# (ratio argument: 10).
recall_train = downsample(recall_train, 10)

# Attach the item attributes, then the user attributes, with left joins on the
# respective ID column.
for side_table, key in ((item, 'itemID'), (user, 'userID')):
    recall_train = pd.merge(left=recall_train, right=side_table, on=key, how='left', sort=False)

In [None]:
# Directory holding the pre-computed feature CSV files read by later cells.
# File names are appended by plain string concatenation, so the trailing
# slash is required.
feature_path = '../Step2 Generate_feature_for_Ranking/'

In [28]:
# Feature tables that a later cell joins onto the candidates using each
# table's first column as the key.
underline_features_files = [
    'brand_count.csv',
    'brand_sum.csv',
    'category_count.csv',
    'category_sum.csv',
    'itemID_count.csv',
    'itemID_sum.csv',
    'shop_count.csv',
    'shop_sum.csv',
    'category_lower.csv',
    'item_rank.csv',
    'category_higher.csv',
    'itemID_higher.csv',
]

# Read every file from the feature directory, preserving the order above.
underline_features = [
    pd.read_csv(feature_path + file_name, engine='c')
    for file_name in underline_features_files
]

In [29]:
# Left-join every single-key feature table onto the candidates, keyed on the
# table's first column.
for feature_table in underline_features:
    join_key = feature_table.columns[0]
    recall_train = recall_train.merge(feature_table, on=join_key, how='left', sort=False)

In [31]:
## Note: for offline training, the '*_underline' variants of these files are used.

double_underline_features_files = [
    'item_to_ability_count_underline.csv',
    'item_to_sex_count_underline.csv',
    'item_to_age_count_underline.csv',
]

# Read every file from the feature directory, preserving the order above.
double_underline_features = [
    pd.read_csv(feature_path + file_name, engine='c')
    for file_name in double_underline_features_files
]

In [32]:
# Left-join every two-key feature table onto the candidates, keyed on the
# table's first two columns.
for feature_table in double_underline_features:
    join_keys = feature_table.columns[:2].tolist()
    recall_train = recall_train.merge(feature_table, on=join_keys, how='left', sort=False)

In [33]:
## Note: for offline training, the '*_underline' variants of these files are used.

time_features_files = [
    'itemID_last_time_underline.csv',
    'brand_last_time_underline.csv',
    'shop_last_time_underline.csv',
]

# Read every file from the feature directory, preserving the order above.
time_features = [
    pd.read_csv(feature_path + file_name, engine='c')
    for file_name in time_features_files
]

In [34]:
# Left-join every last-time feature table onto the candidates, keyed on the
# table's first column.
for feature_table in time_features:
    join_key = feature_table.columns[0]
    recall_train = recall_train.merge(feature_table, on=join_key, how='left', sort=False)

In [35]:
# File names of the feature tables that a later cell merges onto recall_train
# using each file's first two columns as the join key. The groups below are
# concatenated into `online_features_files` on the last line of this cell.

# Group 1: count / sum files per brand, category and shop (judging by the file names).
online_features_files =  ['user_to_brand_count.csv',
'user_to_brand_sum.csv',
'user_to_category_count.csv',
'user_to_category_sum.csv',
'user_to_shop_count.csv',
'user_to_shop_sum.csv',]


# Group 2: counts split by the 'pv' and 'buy' suffixes.
online2 = ['user_to_category_count_pv.csv',
 'user_to_category_count_buy.csv',
 'user_to_shop_count_pv.csv',
 'user_to_shop_count_buy.csv',
 'user_to_brand_count_pv.csv',
 'user_to_brand_count_buy.csv']


# Group 3: the '_yestday' variants.
# NOTE(review): only category has an unsplit '..._count_yestday.csv' entry;
# shop and brand do not - confirm this is intended.
online3 = ['user_to_category_count_yestday.csv',
'user_to_category_count_pv_yestday.csv',
 'user_to_category_count_buy_yestday.csv',
 'user_to_shop_count_pv_yestday.csv',
 'user_to_shop_count_buy_yestday.csv',
 'user_to_brand_count_pv_yestday.csv',
 'user_to_brand_count_buy_yestday.csv']

# Group 4: the '_5days' variants (same file-name pattern as group 3).
online4 = [
 'user_to_category_count_5days.csv',
 'user_to_category_count_pv_5days.csv',
 'user_to_category_count_buy_5days.csv',
 'user_to_shop_count_pv_5days.csv',
 'user_to_shop_count_buy_5days.csv',
 'user_to_brand_count_pv_5days.csv',
 'user_to_brand_count_buy_5days.csv']

# Group 5: the '_lasttime' files.
online5 = [
'user_to_shop_lasttime.csv',
'user_to_category_lasttime.csv',
'user_to_brand_lasttime.csv' ,
]

# Final load order: group 1 first, then groups 2 to 5.
online_features_files = online_features_files + online2 + online3 + online4 + online5


In [36]:
# Read every online feature file from the feature directory, in list order.
online_features = [
    pd.read_csv(feature_path + file_name, engine='c')
    for file_name in online_features_files
]

In [37]:
# Left-join every online feature table onto the candidates, keyed on the
# table's first two columns.
for feature_table in online_features:
    join_keys = feature_table.columns[:2].tolist()
    recall_train = recall_train.merge(feature_table, on=join_keys, how='left', sort=False)

In [38]:
def transfer_label(x):
    """Collapse a label to binary: return 0 when ``x`` equals 0, otherwise 1."""
    return 0 if x == 0 else 1

# Binarise the label column: 0 stays 0, every other value (including NaN)
# becomes 1. This gives the same result as applying transfer_label to each
# row, but as one vectorised comparison instead of a Python call per row.
recall_train['label'] = (recall_train['label'] != 0).astype('int64')

In [39]:
# Model inputs: every column except the identifiers, the raw item attributes,
# the target and the two apriori rank columns.
excluded_columns = {
    'itemID', 'userID', 'category', 'shop', 'brand',
    'label', 'apriori_rank', 'apriori_top',
}
features = [column for column in recall_train.columns if column not in excluded_columns]

In [40]:
# Hyper-parameters of the CatBoost ranking classifier, gathered in one place.
cbt_params = dict(
    iterations=300,
    learning_rate=0.1,
    depth=5,
    verbose=True,
    thread_count=12,
    random_seed=1024,
)
cbt_model = cat.CatBoostClassifier(**cbt_params)
# Fit on the selected feature columns against the binarised label.
cbt_model.fit(recall_train[features], recall_train['label'])

0:	learn: 0.5969336	total: 617ms	remaining: 3m 4s
1:	learn: 0.5237747	total: 1.2s	remaining: 2m 58s
2:	learn: 0.4676843	total: 1.74s	remaining: 2m 51s
3:	learn: 0.4248270	total: 2.24s	remaining: 2m 46s
4:	learn: 0.3923313	total: 2.78s	remaining: 2m 44s
5:	learn: 0.3675028	total: 3.31s	remaining: 2m 41s
6:	learn: 0.3480424	total: 3.85s	remaining: 2m 41s
7:	learn: 0.3332812	total: 4.37s	remaining: 2m 39s
8:	learn: 0.3221164	total: 4.84s	remaining: 2m 36s
9:	learn: 0.3126932	total: 5.33s	remaining: 2m 34s
10:	learn: 0.3053910	total: 5.8s	remaining: 2m 32s
11:	learn: 0.2996877	total: 6.35s	remaining: 2m 32s
12:	learn: 0.2950420	total: 6.92s	remaining: 2m 32s
13:	learn: 0.2915425	total: 7.41s	remaining: 2m 31s
14:	learn: 0.2886570	total: 7.98s	remaining: 2m 31s
15:	learn: 0.2865094	total: 8.53s	remaining: 2m 31s
16:	learn: 0.2846760	total: 8.98s	remaining: 2m 29s
17:	learn: 0.2833095	total: 9.43s	remaining: 2m 27s
18:	learn: 0.2818918	total: 9.99s	remaining: 2m 27s
19:	learn: 0.2807000	tota

158:	learn: 0.2689558	total: 1m 20s	remaining: 1m 11s
159:	learn: 0.2689389	total: 1m 21s	remaining: 1m 11s
160:	learn: 0.2689273	total: 1m 21s	remaining: 1m 10s
161:	learn: 0.2689062	total: 1m 22s	remaining: 1m 10s
162:	learn: 0.2688837	total: 1m 22s	remaining: 1m 9s
163:	learn: 0.2688722	total: 1m 23s	remaining: 1m 9s
164:	learn: 0.2688602	total: 1m 23s	remaining: 1m 8s
165:	learn: 0.2688419	total: 1m 24s	remaining: 1m 7s
166:	learn: 0.2688311	total: 1m 24s	remaining: 1m 7s
167:	learn: 0.2688143	total: 1m 25s	remaining: 1m 6s
168:	learn: 0.2687947	total: 1m 25s	remaining: 1m 6s
169:	learn: 0.2687703	total: 1m 26s	remaining: 1m 5s
170:	learn: 0.2687481	total: 1m 26s	remaining: 1m 5s
171:	learn: 0.2687167	total: 1m 27s	remaining: 1m 4s
172:	learn: 0.2686880	total: 1m 27s	remaining: 1m 4s
173:	learn: 0.2686678	total: 1m 28s	remaining: 1m 3s
174:	learn: 0.2686523	total: 1m 28s	remaining: 1m 3s
175:	learn: 0.2686358	total: 1m 29s	remaining: 1m 2s
176:	learn: 0.2686162	total: 1m 29s	remain

<catboost.core.CatBoostClassifier at 0x7f8f45214630>

In [41]:
# Persist the fitted CatBoost model to a fixed file name in the working directory.
cbt_model.save_model('model0924_base.file')

In [42]:
# Map each feature name to the importance score reported by the fitted model.
importance = {
    name: score
    for name, score in zip(features, cbt_model.feature_importances_)
}

In [43]:
# Show (feature, importance) pairs, most important first.
sorted(importance.items(), key=lambda name_score: name_score[1], reverse=True)

[('apriori', 27.28756987244248),
 ('itemID_median', 8.290965095145731),
 ('user_to_category_lastday', 7.07994210784468),
 ('user_to_category_count_buy', 5.7383039412964445),
 ('user_to_category_count_pv_5days', 5.523705076092089),
 ('user_to_category_count_pv', 3.6634786916021396),
 ('itemID_std', 2.973804811358111),
 ('user_to_category_count_buy_yestday', 2.9720553018294242),
 ('user_to_age_count', 2.4646399997374417),
 ('user_to_sex_count', 2.409206256237985),
 ('user_to_category_lasttime', 2.319085627011794),
 ('user_to_category_count_pv_yestday', 1.96984573441514),
 ('age', 1.757174970832472),
 ('itemID_skew', 1.5063951577357892),
 ('user_to_shop_count_pv', 1.4966560077500304),
 ('user_to_category_count_buy_5days', 1.3092884271919525),
 ('rank', 1.1516030170793725),
 ('category_count', 1.147464644646628),
 ('user_to_category_count_5days', 1.1221689244633937),
 ('category_std', 1.0918870855320673),
 ('itemID_count', 1.0201773241728571),
 ('shop_count', 1.0185020472919601),
 ('user_t

In [None]:
# TODO: the LightGBM (LGB) model and the model-blending code still need to be added here!