In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing

In [2]:
df1 = pd.read_csv("data/train_preliminary/user.csv")
df2 = pd.read_csv("data/train_preliminary/ad.csv")
df3 = pd.read_csv("data/train_preliminary/click_log.csv")
df_tmp = pd.merge(df3, df1, left_on='user_id', right_on='user_id')
df = pd.merge(df_tmp, df2, left_on='creative_id', right_on='creative_id')

In [3]:
df.head()

Unnamed: 0,time,user_id,creative_id,click_times,age,gender,ad_id,product_id,product_category,advertiser_id,industry
0,9,30920,567330,1,2,1,504423,30673,3,32638,319
1,15,320815,567330,1,3,1,504423,30673,3,32638,319
2,11,355089,567330,1,1,1,504423,30673,3,32638,319
3,9,363442,567330,1,2,1,504423,30673,3,32638,319
4,14,370513,567330,1,4,1,504423,30673,3,32638,319


In [4]:
order = ['user_id', 'time', 'creative_id', 'click_times', 'ad_id', 'product_category', 'advertiser_id', 
         'age', 'gender']
df = df[order]
df.head()

Unnamed: 0,user_id,time,creative_id,click_times,ad_id,product_category,advertiser_id,age,gender
0,30920,9,567330,1,504423,3,32638,2,1
1,320815,15,567330,1,504423,3,32638,3,1
2,355089,11,567330,1,504423,3,32638,1,1
3,363442,9,567330,1,504423,3,32638,2,1
4,370513,14,567330,1,504423,3,32638,4,1


In [5]:
df.shape

(30082771, 9)

In [6]:
df.describe()

Unnamed: 0,user_id,time,creative_id,click_times,ad_id,product_category,advertiser_id,age,gender
count,30082770.0,30082770.0,30082770.0,30082770.0,30082770.0,30082770.0,30082770.0,30082770.0,30082770.0
mean,449855.6,48.01738,1676532.0,1.063127,1447917.0,8.128011,21194.45,4.233062,1.316965
std,259993.9,25.92504,1308054.0,0.2982831,1120569.0,7.055121,14065.27,2.049908,0.4652938
min,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0
25%,224613.0,26.0,415810.0,1.0,371110.0,2.0,10986.0,3.0,1.0
50%,449804.0,49.0,1508864.0,1.0,1310317.0,3.0,18103.0,4.0,1.0
75%,675242.0,71.0,2740464.0,1.0,2355111.0,18.0,30034.0,6.0,2.0
max,900000.0,91.0,4445718.0,152.0,3812200.0,18.0,62965.0,10.0,2.0


In [7]:
Y = df[['user_id', 'age', 'gender']]
Y = Y.drop_duplicates(subset=['user_id'], keep='first')
Y = Y.sort_values('user_id')
Y.head()

Unnamed: 0,user_id,age,gender
5325388,1,4,1
22350,2,10,1
2224953,3,7,2
1773340,4,5,1
128645,5,4,1


In [8]:
Y1_train, Y2_train, Y1_test, Y2_test = Y.iloc[:800000, 1:2], Y.iloc[:800000, 2:3], Y.iloc[800000:, 1:2], Y.iloc[800000:, 2:3]
Y1_train

Unnamed: 0,age
5325388,4
22350,10
2224953,7
1773340,5
128645,4
...,...
3864900,5
1014202,2
1683043,7
664305,4


In [9]:
# 点击次数总和
X = df3.groupby('user_id')['click_times'].sum()
X

user_id
1         14
2         46
3         30
4         29
5         34
          ..
899996    14
899997    20
899998    15
899999    22
900000    12
Name: click_times, Length: 900000, dtype: int64

In [10]:
df["fre_creative_id"] = (df.groupby("user_id")["creative_id"].transform(lambda x: x.value_counts().index[0]))
X_tmp = df.drop_duplicates(subset=['user_id'],keep='first')
X_tmp = X_tmp[['user_id', 'creative_id']]
X = pd.merge(X, X_tmp, left_on='user_id', right_on='user_id')
X

Unnamed: 0,user_id,click_times,creative_id
0,1,14,71691
1,2,46,2085566
2,3,30,599128
3,4,29,72989
4,5,34,24333
...,...,...,...
899995,899996,14,114074
899996,899997,20,24333
899997,899998,15,1251649
899998,899999,22,12838


In [11]:
# 点击最多的类
X = pd.merge(X, df2[['creative_id', 'product_category']], left_on='creative_id', right_on='creative_id')
X = X.drop('creative_id', axis = 1)
X_tmp = pd.get_dummies(X['product_category'], prefix='category')
X = pd.concat([X.drop('product_category', axis = 1), X_tmp], axis=1)
X = X.sort_values('user_id')
X

Unnamed: 0,user_id,click_times,category_1,category_2,category_3,category_4,category_5,category_7,category_8,category_9,category_11,category_12,category_13,category_15,category_16,category_17,category_18
0,1,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
277,2,46,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
27290,3,30,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
29230,4,29,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
29362,5,34,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624424,899996,14,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
52488,899997,20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
723887,899998,15,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
581077,899999,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
X_ready = X.drop(['user_id'], axis = 1)
X_ready = preprocessing.MinMaxScaler().fit_transform(X_ready)
X_train, X_test = X_ready[:800000, :], X_ready[800000:, :]
model1 = xgb.XGBClassifier()
model1.fit(X_train, Y2_train)
model1.score(X_train, Y2_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.67090375

In [13]:
X_tmp1 = df3.groupby('user_id')['click_times'].mean()
X_tmp2 = df3.groupby('user_id')['click_times'].std()

In [14]:
# 点击次数的平均值和集中程度
X = pd.merge(X, X_tmp1, left_on='user_id', right_on='user_id')
X = pd.merge(X, X_tmp2, left_on='user_id', right_on='user_id')
X

Unnamed: 0,user_id,click_times_x,category_1,category_2,category_3,category_4,category_5,category_7,category_8,category_9,category_11,category_12,category_13,category_15,category_16,category_17,category_18,click_times_y,click_times
0,1,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.076923,0.277350
1,2,46,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.022222,0.149071
2,3,30,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000
3,4,29,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000
4,5,34,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.030303,0.174078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,899996,14,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000
899996,899997,20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.111111,0.323381
899997,899998,15,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.071429,0.267261
899998,899999,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000


In [15]:
X_ready = X.drop(['user_id'], axis = 1)
X_ready = preprocessing.MinMaxScaler().fit_transform(X_ready)
X_train, X_test = X_ready[:800000, :], X_ready[800000:, :]
model1 = xgb.XGBClassifier()
model1.fit(X_train, Y2_train)
model1.score(X_train, Y2_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.67241

In [16]:
# 点击时间分布情况
X_tmp = df[['user_id', 'time']].groupby('user_id')['time'].std()
X = pd.merge(X, X_tmp, left_on='user_id', right_on='user_id')
X

Unnamed: 0,user_id,click_times_x,category_1,category_2,category_3,category_4,category_5,category_7,category_8,category_9,category_11,category_12,category_13,category_15,category_16,category_17,category_18,click_times_y,click_times,time
0,1,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.076923,0.277350,19.585775
1,2,46,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.022222,0.149071,21.653562
2,3,30,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000,26.386953
3,4,29,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000,17.421039
4,5,34,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.030303,0.174078,23.186783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,899996,14,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000,23.559138
899996,899997,20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.111111,0.323381,28.867344
899997,899998,15,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.071429,0.267261,18.683012
899998,899999,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.000000,0.000000,24.854773


In [17]:
X_ready = X.drop(['user_id'], axis = 1)
X_ready = preprocessing.MinMaxScaler().fit_transform(X_ready)
X_train, X_test = X_ready[:800000, :], X_ready[800000:, :]
model1 = xgb.XGBClassifier()
model1.fit(X_train, Y2_train)
model1.score(X_train, Y2_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.6731

In [18]:
# creave_id 以取模方式进入特征
mo = 200
X_tmp = df.drop_duplicates(subset=['user_id'],keep='first')
X_tmp = X_tmp[['user_id', 'creative_id']]
X_tmp['creative_id'] = X_tmp['creative_id'] % mo
X_tmp

Unnamed: 0,user_id,creative_id
0,30920,130
1,320815,130
2,355089,130
3,363442,130
4,370513,130
...,...,...
24638166,512632,116
25102087,872929,117
25314288,833411,199
26443563,404337,23


In [19]:
X_tmp = X_tmp.drop_duplicates(subset=['user_id'], keep='first')
X_tmp = X_tmp.sort_values('user_id')
X_tmp = pd.get_dummies(X_tmp['creative_id'], prefix='creative_id')
X_tmp = X_tmp.reset_index(drop=True)
X_tmp

Unnamed: 0,creative_id_0,creative_id_1,creative_id_2,creative_id_3,creative_id_4,creative_id_5,creative_id_6,creative_id_7,creative_id_8,creative_id_9,...,creative_id_190,creative_id_191,creative_id_192,creative_id_193,creative_id_194,creative_id_195,creative_id_196,creative_id_197,creative_id_198,creative_id_199
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
899996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
899997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
899998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
X = pd.concat([X, X_tmp], axis = 1)
X

Unnamed: 0,user_id,click_times_x,category_1,category_2,category_3,category_4,category_5,category_7,category_8,category_9,...,creative_id_190,creative_id_191,creative_id_192,creative_id_193,creative_id_194,creative_id_195,creative_id_196,creative_id_197,creative_id_198,creative_id_199
0,1,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,46,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,30,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,29,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,34,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,899996,14,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
899996,899997,20,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
899997,899998,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
899998,899999,22,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
X_ready = X.drop(['user_id'], axis = 1)
X_ready = preprocessing.MinMaxScaler().fit_transform(X_ready)
X_train, X_test = X_ready[:800000, :], X_ready[800000:, :]
model1 = xgb.XGBClassifier()
model1.fit(X_train, Y2_train)
model1.score(X_train, Y2_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.7159225

In [22]:
model1.score(X_test, Y2_test)

0.71632

In [23]:
model2 = xgb.XGBClassifier()
model2.fit(X_train, Y1_train)
model2.score(X_train, Y1_train)

0.249255

In [24]:
model2.score(X_test, Y1_test)

0.22991