# 导入工具包

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import gc
from collections import Counter
import copy
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# 数据读取

In [3]:
train_file = '../input/data_format1/train_format1.csv'
test_file = '../input/data_format1/test_format1.csv'
user_info_file = '../input/data_format1/user_info_format1.csv'
user_log_file = '../input/data_format1/user_log_format1.csv'

train_data = reduce_mem_usage(pd.read_csv(train_file))
test_data = reduce_mem_usage(pd.read_csv(test_file))
user_info = reduce_mem_usage(pd.read_csv(user_info_file))
user_log = reduce_mem_usage(pd.read_csv(user_log_file))

Memory usage of dataframe is 5.97 MB
Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage of dataframe is 5.98 MB
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage of dataframe is 9.71 MB
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage of dataframe is 2933.33 MB
Memory usage after optimization is: 890.48 MB
Decreased by 69.6%


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int32
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int32  
 1   merchant_id  261477 non-null  int16  
 2   prob         0 non-null       float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB


In [6]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int32  
 1   age_range  421953 non-null  float16
 2   gender     417734 non-null  float16
dtypes: float16(2), int32(1)
memory usage: 3.2 MB


In [7]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int32  
 1   item_id      int32  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 890.5 MB


# 数据处理

In [9]:
all_data = test_data.append(test_data)
all_data = all_data.merge(user_info, on=['user_id'], how='left')

del train_data, test_data, user_info
gc.collect()

26024

In [10]:
# 按时间排序
user_log = user_log.sort_values(['user_id', 'time_stamp'])
gc.collect()

In [11]:
# 对每个用户逐个合并所有字段
list_join_func = lambda x: ' '.join([str(i) for i in x])

agg_dict = {
    'item_id': list_join_func,
    'cat_id': list_join_func,
    'seller_id': list_join_func,
    'brand_id': list_join_func,
    'time_stamp': list_join_func,
    'action_type': list_join_func
}

rename_dict = {
    'item_id': 'item_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path',
    'time_stamp': 'time_stamp_path',
    'action_type': 'action_type_path'
}


def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    df_data = df_data.groupby(join_columns).agg(agg_dict).reset_index().rename(columns=rename_dict)
    
    df_ID = df_ID.merge(df_data, on=join_columns, how='left')
    return df_ID

all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)

del user_log
gc.collect()

26328

# 定义特征统计函数
## 定义统计函数

1. 定义统计数据总数的函数

In [12]:
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

2. 定义统计数据唯一值总数的函数

In [13]:
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

3. 定义统计数据最大值的函数

In [14]:
def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        return -1

4. 定义统计数据最小值的函数

In [15]:
def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        return -1

5. 定义统计数据标准差的函数

In [16]:
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

6. 定义统计数据中频次为$topN$数据的元素的函数

In [17]:
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

7. 定义统计数据中频次为$topN$数据的元素的频次的函数

In [18]:
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1

## 调用定义的统计函数

In [19]:
def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data


def user_nunique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data


def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data


def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data


def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data


def user_most_n(df_data, single_col, name, n=1):
    df_data[name] = df_data[single_col].apply(lambda x: most_n(x, n))
    return df_data


def user_most_n_cnt(df_data, single_col, name, n=1):
    df_data[name] = df_data[single_col].apply(lambda x: most_n_cnt(x, n))
    return df_data

# 提取统计特征
## 特征统计

1. 店铺特征统计：统计与店铺特点相关的特征，如店铺、商品、品牌等

In [None]:
all_data_test = all_data.head(2000)