https://tianchi.aliyun.com/notebook-ai/detail?postId=103212

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
from tqdm import tqdm
import itertools
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy.signal as signal

In [2]:
# 处理异常值
def smooth_cols(group,out_value,kind):
    cols = ['power']
    if kind == 'g':
        for col in cols:
            yes_no = (group[col]<out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.995))
        return group
    if kind == 'l':
        for col in cols:
            yes_no = (group[col]>out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.07))
        return group        

def date_proc(x):
    m = int(x[4:6])
    if m == 0:
        m = 1
    return x[:4] + '-' + str(m) + '-' + x[6:]

# 定义日期提取函数
def date_tran(df,fea_col):
    for f in tqdm(fea_col):
        df[f] = pd.to_datetime(df[f].astype('str').apply(date_proc))
        df[f + '_year'] = df[f].dt.year
        df[f + '_month'] = df[f].dt.month
        df[f + '_day'] = df[f].dt.day
        df[f + '_dayofweek'] = df[f].dt.dayofweek
    return (df)

# 分桶操作
def cut_group(df,cols,num_bins=50):
    for col in cols:
        all_range = int(df[col].max()-df[col].min())
        bin = [i*all_range/num_bins for i in range(all_range)]
        df[col+'_bin'] = pd.cut(df[col], bin, labels=False)
    return df

# count编码
def count_coding(df,fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return(df)

# 定义交叉特征统计
def cross_cat_num(df,num_col,cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            })
            df = df.merge(feat, on=f1, how='left')
    return(df)

# 类别特征的二阶交叉
from scipy.stats import entropy
def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['model', 'brand'], ['model', 'regionCode'], ['brand', 'regionCode']
    ]):
        # 共现次数
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['SaleID'].transform('count')
        # n unique、熵
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        # 比例偏好
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return (df)

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
Train_data = reduce_mem_usage(pd.read_csv('input/used_car_train_20200313.csv',
                                          sep=' '))
TestA_data = reduce_mem_usage(pd.read_csv('input/used_car_testA_20200313.csv',
                                          sep=' '))

print('Train data shape: {}'.format(Train_data.shape))
print('TestA_data shape: {}'.format(TestA_data.shape))

Memory usage of dataframe is 37200080.00 MB
Memory usage after optimization is: 10200184.00 MB
Decreased by 72.6%
Memory usage of dataframe is 12000080.00 MB
Memory usage after optimization is: 3200184.00 MB
Decreased by 73.3%
Train data shape: (150000, 31)
TestA_data shape: (50000, 30)


In [4]:
Train_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850,43.34375,3.966797,0.050262,2.160156,1.143555,0.235718,0.10199,0.129517,0.022812,0.097473,-2.880859,2.804688,-2.419922,0.79541,0.914551
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,0,0,20160309,3600,45.3125,5.234375,0.137939,1.380859,-1.421875,0.264893,0.121033,0.135742,0.026596,0.020584,-4.902344,2.095703,-1.030273,-1.722656,0.245483
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222,45.96875,4.824219,1.319336,-0.998535,-0.99707,0.251465,0.114929,0.165161,0.062164,0.027069,-4.847656,1.803711,1.56543,-0.83252,-0.22998
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400,45.6875,4.492188,-0.050629,0.883789,-2.228516,0.274414,0.110291,0.121948,0.033386,0.0,-4.507812,1.286133,-0.501953,-2.4375,-0.47876
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200,44.375,2.03125,0.572266,-1.571289,2.246094,0.228027,0.073181,0.091858,0.078796,0.121521,-1.896484,0.910645,0.931152,2.833984,1.923828
5,5,137642,20090602,24.0,10,0.0,1.0,0.0,109,10.0,0.0,3690,0,0,20160319,8000,46.3125,-3.228516,0.156616,-1.727539,-0.345703,0.260254,0.000518,0.119812,0.090942,0.048767,1.885742,-2.722656,2.457031,-0.286865,0.206543
6,6,2402,19990411,13.0,4,0.0,0.0,1.0,150,15.0,0.0,3073,0,0,20160317,3500,46.09375,4.925781,0.113281,1.644531,-1.270508,0.268066,0.117676,0.142334,0.025452,0.028168,-4.902344,1.610352,-0.834473,-1.996094,-0.10321
7,7,165346,19990706,26.0,14,1.0,0.0,0.0,101,15.0,0.0,4000,0,0,20160326,1000,42.25,-3.167969,-0.676758,1.942383,0.524414,0.239502,0.0,0.122925,0.039825,0.082397,3.693359,-0.244995,-2.193359,0.236694,0.195557
8,8,2974,20030205,19.0,1,2.0,1.0,1.0,179,15.0,0.0,4679,0,0,20160326,2850,46.09375,4.894531,0.475342,0.556641,-1.262695,0.263916,0.116577,0.144287,0.039856,0.024384,-4.925781,1.587891,0.075317,-1.550781,0.069458
9,9,82021,19980101,7.0,7,5.0,0.0,0.0,88,15.0,0.0,302,0,0,20160402,650,43.0625,1.666016,-2.201172,3.097656,0.84375,0.262451,0.068237,0.012177,0.010292,0.098755,-1.089844,0.600586,-4.1875,0.198242,-1.025391


In [5]:
TestA_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,150000,66932,20111212,222.0,4,5.0,1.0,1.0,313,15.0,0.0,1440,0,0,20160329,49.59375,5.246094,1.000977,-4.121094,0.737305,0.264404,0.121826,0.070923,0.106567,0.078857,-7.050781,-0.854492,4.800781,0.620117,-3.664062
1,150001,174960,19990211,19.0,21,0.0,0.0,0.0,75,12.5,1.0,5419,0,0,20160404,42.40625,-3.253906,-1.753906,3.646484,-0.725586,0.261719,0.0,0.096741,0.013702,0.052368,3.679688,-0.729004,-3.796875,-1.541016,-0.756836
2,150002,5356,20090304,82.0,21,0.0,0.0,0.0,109,7.0,0.0,5045,0,0,20160308,45.84375,4.703125,0.155396,-1.118164,-0.229126,0.260254,0.112061,0.078064,0.062073,0.050537,-4.925781,1.000977,0.82666,0.138184,0.753906
3,150003,50688,20100405,0.0,0,0.0,0.0,1.0,160,7.0,0.0,4023,0,0,20160325,46.4375,4.320312,0.428955,-2.037109,-0.234741,0.260498,0.10675,0.081116,0.075989,0.048279,-4.863281,0.505371,1.870117,0.365967,1.3125
4,150004,161428,19970703,26.0,14,2.0,0.0,0.0,75,15.0,0.0,3103,0,0,20160309,42.1875,-3.166016,-1.572266,2.603516,0.387451,0.250977,0.0,0.07782,0.028595,0.081726,3.617188,-0.67334,-3.197266,-0.025681,-0.101318
5,150005,2741,20000504,0.0,0,2.0,1.0,0.0,90,15.0,0.0,4959,0,0,20160311,44.4375,4.363281,-0.315674,1.53418,-0.925781,0.261475,0.107666,0.107178,0.025681,0.036743,-3.855469,1.90918,-1.69043,-1.108398,0.210815
6,150006,180443,20040208,4.0,4,0.0,1.0,0.0,150,15.0,0.0,1731,0,0,20160314,45.71875,-3.287109,-0.188232,-0.285645,-0.89502,0.26416,0.0,0.130005,0.070557,0.036407,2.277344,-2.318359,1.084961,-1.09668,0.209961
7,150007,3787,20041101,104.0,4,0.0,1.0,1.0,204,15.0,0.0,762,0,0,20160314,47.15625,4.824219,0.683594,-1.203125,-1.217773,0.267578,0.115417,0.122314,0.064026,0.023544,-5.464844,0.689941,1.826172,-1.307617,-1.543945
8,150008,17976,20040908,228.0,13,3.0,3.0,0.0,0,12.5,0.0,1774,0,0,20160401,44.6875,5.042969,-0.754883,-0.252441,2.619141,0.243774,0.115906,0.007771,0.052094,0.141846,-4.820312,1.392578,-0.963379,2.470703,-2.126953
9,150009,111167,20020404,11.0,10,0.0,0.0,0.0,82,15.0,0.0,185,0,0,20160328,44.3125,-3.220703,-0.833008,0.830078,-0.75,0.262939,0.0,0.10437,0.053406,0.043457,2.755859,-1.754883,-0.627441,-0.953125,0.325928


In [None]:
# 合并数据集
concat_data = pd.concat([Train_data, TestA_data],
                        ignore_index=True) # 重新生成索引
concat_data['notReparedDamage'] = concat_data['notRepairedDamage'].replace('-', 0).astype('float16')
concat_data = concat_data.fillna(concat_data.mode())