<a href="https://colab.research.google.com/github/pko89403/Recsys_test/blob/master/Wide%26Deep_Data_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wide - And - Deep - Pytorch
1. Wide : 스파스한 피처들, 원-핫 인코딩 되어 아웃풋 뉴런과 바로 연결됨
2. Deep Dense : 카테고리 피처들, 임베딩으로 표횐됨, 연속성 피처들이 덴스 레이어로 보내짐
3. Deep Text : 워드 임베딩들 보내지는 RNN들의 스택으로
4. Deep Image : RGB 이미지들 CNN으로 보내지는, ResNet

# 데이터 전처리

In [0]:
import pandas as pd
import numpy as np

raw_data_path = "/content/drive/My Drive/data/adult_data/adult-data.csv"
raw_data = pd.read_csv(raw_data_path)
raw_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,wrk_hrs_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Logist Regression에 사용하기 위한 라벨 만들기

In [0]:
raw_data['income_label'] = (raw_data['income'].apply(lambda x: ">50K" in x)).astype(int)
raw_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,wrk_hrs_per_week,native_country,income,income_label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


## 1-Set 실험
"Wide-" 나 "Deep-" 모델에 넣을 컬럼을 정의한다.

In [0]:
wide_cols = ['age', 'wrk_hrs_per_week', 'education', 'relationship', 'workclass', 'occupation', 'native_country', 'sex']
crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])
embedding_cols = [('education', 10), ('relationship', 8), ('workclass', 10), ('occupation', 10), ('native_country', 12)]
continuous_cols = ['age', 'wrk_hrs_per_week']
target = 'income_label'
method = 'logistic'

if type(embedding_cols[0]) is tuple:
  emb_dim = dict(embedding_cols)
  embedding_cols = [emb[0] for emb in embedding_cols]
else:
  emb_dim = {e:def_dim for e in embedding_cols}
deep_cols = embedding_cols + continuous_cols
print(deep_cols)

['education', 'relationship', 'workclass', 'occupation', 'native_country', 'age', 'wrk_hrs_per_week']


In [0]:
Y = np.array(raw_data[target])
raw_temp = raw_data.copy()[list(set(wide_cols + deep_cols))]
raw_temp.head()

Unnamed: 0,native_country,relationship,sex,workclass,occupation,wrk_hrs_per_week,education,age
0,United-States,Not-in-family,Male,State-gov,Adm-clerical,40,Bachelors,39
1,United-States,Husband,Male,Self-emp-not-inc,Exec-managerial,13,Bachelors,50
2,United-States,Not-in-family,Male,Private,Handlers-cleaners,40,HS-grad,38
3,United-States,Husband,Male,Private,Handlers-cleaners,40,11th,53
4,Cuba,Wife,Female,Private,Prof-specialty,40,Bachelors,28


In [0]:
print(crossed_cols)
crossed_columns = []
# Build the crossed columns
for cols in crossed_cols:
  colname = '_'.join(cols)
  raw_temp[colname] = raw_temp[cols].apply (lambda x: '-'.join(x), axis=1)
  crossed_columns.append(colname)

# extract the categorical column names that can be one-hot encoded layer
categorical_columns = list(raw_temp.select_dtypes(include=['object']).columns)

(['education', 'occupation'], ['native_country', 'occupation'])


In [0]:
raw_temp['education_occupation'].head()
#raw_temp.head()

0        Bachelors- Adm-clerical
1     Bachelors- Exec-managerial
2     HS-grad- Handlers-cleaners
3        11th- Handlers-cleaners
4      Bachelors- Prof-specialty
Name: education_occupation, dtype: object

In [0]:
def label_encode(df, cols=None):
  if cols == None:
    cols = list(df.select_dtypes(include=['object']).columns)
  
  val_types = dict()
  for c in cols:
    val_types[c] = df[c].unique()

  
  val_to_idx = dict()
  for k, v in val_types.items():
    val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
  
  for k, v in val_to_idx.items():
    df[k] = df[k].apply(lambda x: v[x])
  
  
  return val_to_idx, df

In [0]:
# 데이터프레임을 인코딩한다. 그리고 인코딩 딕셔너리를 리턴
# deep_cols
print(raw_temp.head())
encoding_dict, raw_temp = label_encode(raw_temp)
print(encoding_dict)
encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}
embeddings_input = []
for k, v in encoding_dict.items():
  embeddings_input.append((k, len(v), emb_dim[k]))
print(embeddings_input)

   native_country  ...          native_country_occupation
0   United-States  ...        United-States- Adm-clerical
1   United-States  ...     United-States- Exec-managerial
2   United-States  ...   United-States- Handlers-cleaners
3   United-States  ...   United-States- Handlers-cleaners
4            Cuba  ...               Cuba- Prof-specialty

[5 rows x 10 columns]
{'native_country': {' United-States': 0, ' Cuba': 1, ' Jamaica': 2, ' India': 3, ' ?': 4, ' Mexico': 5, ' South': 6, ' Puerto-Rico': 7, ' Honduras': 8, ' England': 9, ' Canada': 10, ' Germany': 11, ' Iran': 12, ' Philippines': 13, ' Italy': 14, ' Poland': 15, ' Columbia': 16, ' Cambodia': 17, ' Thailand': 18, ' Ecuador': 19, ' Laos': 20, ' Taiwan': 21, ' Haiti': 22, ' Portugal': 23, ' Dominican-Republic': 24, ' El-Salvador': 25, ' France': 26, ' Guatemala': 27, ' China': 28, ' Japan': 29, ' Yugoslavia': 30, ' Peru': 31, ' Outlying-US(Guam-USVI-etc)': 32, ' Scotland': 33, ' Trinadad&Tobago': 34, ' Greece': 35, ' Nicaragua'

In [0]:
df_deep = raw_temp[deep_cols]
deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
deep_column_idx

{'age': 5,
 'education': 0,
 'native_country': 4,
 'occupation': 3,
 'relationship': 1,
 'workclass': 2,
 'wrk_hrs_per_week': 6}

In [0]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for cc in continuous_cols:
  df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))

df_wide = raw_temp[wide_cols + crossed_columns]
del(raw_temp)

dummy_cols = [c for c in wide_cols+crossed_columns if c in categorical_columns]
print(dummy_cols)
df_wide = pd.get_dummies(df_wide, columns=dummy_cols)

['education', 'relationship', 'workclass', 'occupation', 'native_country', 'sex', 'education_occupation', 'native_country_occupation']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [0]:
print(df_wide.shape)
for i in df_wide.columns:
  print(i)

(32561, 751)
age
wrk_hrs_per_week
education_0
education_1
education_2
education_3
education_4
education_5
education_6
education_7
education_8
education_9
education_10
education_11
education_12
education_13
education_14
education_15
relationship_0
relationship_1
relationship_2
relationship_3
relationship_4
relationship_5
workclass_0
workclass_1
workclass_2
workclass_3
workclass_4
workclass_5
workclass_6
workclass_7
workclass_8
occupation_0
occupation_1
occupation_2
occupation_3
occupation_4
occupation_5
occupation_6
occupation_7
occupation_8
occupation_9
occupation_10
occupation_11
occupation_12
occupation_13
occupation_14
native_country_0
native_country_1
native_country_2
native_country_3
native_country_4
native_country_5
native_country_6
native_country_7
native_country_8
native_country_9
native_country_10
native_country_11
native_country_12
native_country_13
native_country_14
native_country_15
native_country_16
native_country_17
native_country_18
native_country_19
native_country_20
na

In [0]:
print(df_deep)

       education  relationship  ...       age  wrk_hrs_per_week
0              0             0  ...  0.030671         -0.035429
1              0             1  ...  0.837109         -2.222153
2              1             0  ... -0.042642         -0.035429
3              2             1  ...  1.057047         -0.035429
4              0             2  ... -0.775768         -0.035429
...          ...           ...  ...       ...               ...
32556          6             2  ... -0.849080         -0.197409
32557          1             1  ...  0.103983         -0.035429
32558          1             4  ...  1.423610         -0.035429
32559          1             3  ... -1.215643         -1.655225
32560          1             2  ...  0.983734         -0.035429

[32561 rows x 7 columns]


In [0]:
print(Y)

[0 0 0 ... 0 0 1]


In [0]:
from sklearn.model_selection import train_test_split

seed = 1981
x_train_deep, x_test_deep = train_test_split(df_deep.values, test_size=0.3, random_state=seed)
x_train_wide, x_test_wide = train_test_split(df_wide.values, test_size=0.3, random_state=seed)
y_train, y_test = train_test_split(Y, test_size=0.3, random_state=seed)

In [0]:
from collections import namedtuple

wd_dataset = dict()
train_dataset = namedtuple('train_dataset', 'wide, deep, labels')
test_dataset = namedtuple('test_dataset', 'wide, deep, labels')
wd_dataset['train_dataset'] = train_dataset(x_train_wide, x_train_deep, y_train)
wd_dataset['test_dataset'] = test_dataset(x_test_wide, x_test_deep, y_test)
wd_dataset['embeddings_input'] = embeddings_input
wd_dataset['deep_column_idx'] = deep_column_idx
wd_dataset['encoding_dict'] = encoding_dict

## 모델을 만들어 보자
### Wide

In [0]:
# Wide Model
import torch.nn as nn
import torch.nn.functional as F

wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class = 1
wide_part = nn.Linear(wide_dim, n_class)
print(wide_part)

Linear(in_features=751, out_features=1, bias=True)


In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable 
from torch.utils.data import DataLoader


class Wide(nn.Module):
  def __init__(self, wide_dim, n_class):
    super(Wide, self).__init__()
    self.wide_dim = wide_dim
    self.n_class = n_class
    self.linear = nn.Linear(self.wide_dim, self.n_class)

  def forward(self, x):
    return F.sigmoid(self.linear(x))

In [0]:
wide_model = Wide(wide_dim, n_class)
print(wide_model)

Wide(
  (linear): Linear(in_features=751, out_features=1, bias=True)
)


In [0]:
train_dataset = np.hstack([wd_dataset['train_dataset'].labels.reshape(-1,1), wd_dataset['train_dataset'].wide])
train_dataset

array([[ 0, 49, 40, ...,  0,  0,  0],
       [ 0, 67, 25, ...,  0,  0,  0],
       [ 0, 48, 40, ...,  0,  0,  0],
       ...,
       [ 1, 73, 50, ...,  0,  0,  0],
       [ 0, 45, 37, ...,  0,  0,  0],
       [ 0, 40, 45, ...,  0,  0,  0]])

In [0]:
optimizer = torch.optim.Adam(wide_model.parameters())
batch_size = 64
n_epochs = 10
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

In [0]:
for epoch in range(n_epochs):
  total = 0
  correct = 0

  for i, batch in enumerate(train_loader):
    x_w = Variable(batch[:, 1:]).float()
    y = Variable(batch[:, 0]).float()

    optimizer.zero_grad() # zeroes the gradient buffers of all parameters
    y_pred = wide_model(x_w)

    loss = F.binary_cross_entropy(y_pred, y)
    loss.backward()
    optimizer.step()

    total += y.size(0)
    y_pred_cat = (y_pred > 0.5).squeeze(1).float()
  
    correct += float((y_pred_cat == y).sum().item())

  print('Epoch {} of {}, Loss: {}, Accuracy: {}'.format(epoch+1, n_epochs, round(loss.item(),3), round(correct/total,4)))

  if sys.path[0] == '':
  if sys.path[0] == '':


Epoch 1 of 10, Loss: 0.489, Accuracy: 0.819
Epoch 2 of 10, Loss: 0.33, Accuracy: 0.8268
Epoch 3 of 10, Loss: 0.446, Accuracy: 0.8295
Epoch 4 of 10, Loss: 0.429, Accuracy: 0.8316
Epoch 5 of 10, Loss: 0.582, Accuracy: 0.8324
Epoch 6 of 10, Loss: 0.302, Accuracy: 0.8352
Epoch 7 of 10, Loss: 0.346, Accuracy: 0.8346
Epoch 8 of 10, Loss: 0.252, Accuracy: 0.8366
Epoch 9 of 10, Loss: 0.26, Accuracy: 0.8358
Epoch 10 of 10, Loss: 0.216, Accuracy: 0.8369


### Deep

In [0]:
print(wd_dataset['embeddings_input'])
print(wd_dataset['deep_column_idx'])

[('native_country', 42, 12), ('relationship', 6, 8), ('workclass', 9, 10), ('occupation', 15, 10), ('education', 16, 10)]
{'education': 0, 'relationship': 1, 'workclass': 2, 'occupation': 3, 'native_country': 4, 'age': 5, 'wrk_hrs_per_week': 6}


In [0]:
col_name, unique_vals, n_emb = wd_dataset['embeddings_input'][0]
emb_layer = nn.Embedding(unique_vals, n_emb)
print(emb_layer)

Embedding(42, 12)


In [0]:
class Deep(nn.Module):
  def __init__(self, embeddings_input, continuous_cols, deep_column_idx, hidden_layers, n_class):
    super(Deep, self).__init__()
    self.deep_column_idx = deep_column_idx
    self.embeddings_input = embeddings_input
    self.continuous_cols = continuous_cols
    self.hidden_layers = hidden_layers
    self.n_class = n_class

    for col, val, dim in self.embeddings_input:
      setattr(self, 'emb_layer_' + col, nn.Embedding(val, dim))

      input_emb_dim = np.sum([emb[2] for emb in self.embeddings_input])
      self.linear_1 = nn.Linear(input_emb_dim + len(continuous_cols), self.hidden_layers[0])
      for i, h in enumerate(self.hidden_layers[1:],1):
        setattr(self, 'linear_' + str(i+1), nn.Linear( self.hidden_layers[i-1], self.hidden_layers[i]))

      self.output = nn.Linear(self.hidden_layers[-1], n_class)
  
  def forward(self, x):
    pass