'''
Data preprocessing utilities for the channel attribution model.
Source: https://github.com/jeremite/channel-attribution-model/blob/master/process_data.py
'''
import numpy as np
import random
import tensorflow as tf
# use the Keras bundled with TensorFlow so only one framework is required
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype
import warnings
import pandas as pd
warnings.filterwarnings("ignore")


def cat(data, field, leng, s=','):
    """Split a delimited string column into a list column and store each list's length in `leng`."""
    data[field] = data[field].map(lambda x: x.split(s))
    data[leng] = data[field].map(lambda x: len(x))


def most_fre(lt):
    """Return the most frequent element of a list."""
    return max(lt, key=lt.count)


def sca(col):
    """Min-max scale a list of integer-like values to the range [0, 1]."""
    col = [int(v) for v in col]
    return [(v - min(col)) / (max(col) - min(col)) for v in col]
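
# A quick, illustrative sketch of what the helpers above do on toy inputs:
#   most_fre(['online', 'tv', 'online'])  -> 'online'            # most frequent channel
#   sca(['0', '5', '10'])                 -> [0.0, 0.5, 1.0]      # min-max scaled to [0, 1]
#   cat(df, 'path', 'leng_path', s='>')   # turns 'a>b>c' into ['a', 'b', 'c'] and records the length 3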


def train_cats(df):
    """Change any columns of strings in a pandas DataFrame to columns of
    categorical values. The changes are applied in place.
    """
    for n, c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()
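
# For example, a string column ['gold', 'silver', 'gold'] becomes an ordered categorical;
# the `.cat.codes + 1` step used later in process_data would then map it to [1, 2, 1].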


def process_data(data, seq_length=20):
    # data_a = data_att.loc[:,['path_id','path','total_conversions','last_time_lapse','null_conversion']]
    data.dropna(axis=0, how='any', inplace=True)  # drop all rows containing NaN
    # merge target variables
    # str -> list
    cat(data, 'path', 'leng_path', s='>')
    cat(data, 'marketing_area', 'leng_area', s=',')
    cat(data, 'tier', 'leng_tier', s=',')
    cat(data, 'customer_type', 'leng_type', s=',')
    # remove paths with fewer than 3 channels
    data_new = data[data.leng_path >= 3]
    data_new = data_new.reset_index()
    # keep only the most common value per row
    data_new.marketing_area = data_new.marketing_area.map(lambda x: most_fre(x))
    data_new.tier = data_new.tier.map(lambda x: most_fre(x))
    data_new.customer_type = data_new.customer_type.map(lambda x: most_fre(x))
    data_new.replace('', 'NA', inplace=True)
    y = data_new.total_conversions
    # split into train and test sets: get train and test data indices
    idx = [x for x in range(data_new.shape[0])]
    np.random.seed(111)
    random.seed(111)  # seed the stdlib RNG too, since random.shuffle does the shuffling
    random.shuffle(idx)
    tr_idx = idx[0:int(0.9 * len(idx))]
    te_idx = idx[int(0.9 * len(idx)):]
    # get data for time decay
    cat(data_new, 'last_time_lapse', 'leng_time_lapse', s=',')
    data_new.last_time_lapse = data_new.last_time_lapse.map(lambda x: sca(x))
    # tf.contrib was removed in TensorFlow 2.x; use the tf.keras version of pad_sequences instead
    pad_sequence = tf.keras.preprocessing.sequence.pad_sequences
    time_decay = pad_sequence(data_new.last_time_lapse, maxlen=seq_length,
                              padding='pre', truncating='pre', dtype='float32')
    time_decay = time_decay.reshape(-1, seq_length, 1)  # use seq_length rather than a hard-coded 20
    time_decay_tr = time_decay[tr_idx]
    time_decay_te = time_decay[te_idx]
    # get data for attribution
    text = data_new.path
    # encoding
    t = Tokenizer()
    t.fit_on_texts(text)
    # vocabulary size
    vocab_size = len(t.word_index) + 1
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(text)
    # pad and truncate the path data
    newlines = pad_sequence(encoded_docs, maxlen=seq_length, padding='pre', truncating='post')
    X_train = newlines[tr_idx]
    Y_train = y[tr_idx]
    X_test = newlines[te_idx]
    Y_test = y[te_idx]
    # one-hot encode the padded paths: shape (n_samples, seq_length, vocab_size)
    all_X = np.array(list(map(lambda x: to_categorical(x, num_classes=vocab_size), newlines)), ndmin=3)
    X_tr = np.array(list(map(lambda x: to_categorical(x, num_classes=vocab_size), X_train)), ndmin=3)
    X_te = np.array(list(map(lambda x: to_categorical(x, num_classes=vocab_size), X_test)), ndmin=3)
    paths = text[tr_idx].reset_index().path
    # get customer data (control data)
    data_lr = data_new.loc[:, ['marketing_area', 'tier', 'customer_type']]
    train_cats(data_lr)
    # flag customers whose type is missing
    data_lr['c_type_na'] = [1 if v == 'NA' else 0 for v in data_lr['customer_type']]
    for col in data_lr.columns:
        if not is_numeric_dtype(data_lr[col]):
            data_lr[col] = data_lr[col].cat.codes + 1
    X_tr_lr = data_lr.iloc[tr_idx, :].copy()
    X_te_lr = data_lr.iloc[te_idx, :].copy()
    # fit the scaler on the training rows only, then apply the same scaling to the test rows
    scaler = MinMaxScaler()
    X_tr_lr[['marketing_area', 'tier', 'customer_type']] = scaler.fit_transform(
        X_tr_lr[['marketing_area', 'tier', 'customer_type']])
    X_te_lr.loc[:, ['marketing_area', 'tier', 'customer_type']] = scaler.transform(
        X_te_lr[['marketing_area', 'tier', 'customer_type']])
    categorical_vars = data_lr.columns[0:3]
    return [time_decay_tr, time_decay_te, X_tr, X_te, X_tr_lr, X_te_lr, Y_train, Y_test,
            all_X, time_decay, newlines, y, categorical_vars, paths]
# data = pd.read_excel('fake_data.xlsx')
# data['last_time_lapse'] = data['last_time_lapse'].map(eval)
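

# A minimal usage sketch (illustrative; it assumes 'fake_data.xlsx' holds the columns
# process_data expects: 'path' as a '>'-delimited string, 'marketing_area', 'tier',
# 'customer_type' and 'last_time_lapse' as comma-separated strings, and a numeric
# 'total_conversions' target).
if __name__ == '__main__':
    data = pd.read_excel('fake_data.xlsx')
    (time_decay_tr, time_decay_te, X_tr, X_te, X_tr_lr, X_te_lr,
     Y_train, Y_test, all_X, time_decay, newlines, y,
     categorical_vars, paths) = process_data(data, seq_length=20)
    print(X_tr.shape)            # (n_train, seq_length, vocab_size) one-hot encoded paths
    print(time_decay_tr.shape)   # (n_train, seq_length, 1) scaled time-decay features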