<a href="https://colab.research.google.com/github/pko89403/Recsys_test/blob/master/Wide%26Deep_Data_Model_MoreAboutDATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wide - And - Deep - Pytorch
1. Wide : 스파스한 피처들, 원-핫 인코딩 되어 아웃풋 뉴런과 바로 연결됨
2. Deep Dense : 카테고리 피처들, 임베딩으로 표횐됨, 연속성 피처들이 덴스 레이어로 보내짐
3. Deep Text : 워드 임베딩들 보내지는 RNN들의 스택으로
4. Deep Image : RGB 이미지들 CNN으로 보내지는, ResNet

# 데이터 전처리

In [248]:
import pandas as pd
import numpy as np

raw_data_path = "/content/drive/My Drive/data/adult_data/adult-data.csv"
raw_data = pd.read_csv(raw_data_path)
raw_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,wrk_hrs_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Logist Regression에 사용하기 위한 라벨 만들기

In [249]:
raw_data['income_label'] = (raw_data['income'].apply(lambda x: ">50K" in x)).astype(int)
raw_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,wrk_hrs_per_week,native_country,income,income_label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


## 1-Set 실험
"Wide-" 나 "Deep-" 모델에 넣을 컬럼을 정의한다.

In [250]:
wide_cols = ['age', 'wrk_hrs_per_week', 'education', 'relationship', 'workclass', 'occupation', 'native_country', 'sex']
crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])
embedding_cols = [('education', 10), ('relationship', 8), ('workclass', 10), ('occupation', 10), ('native_country', 12)]
continuous_cols = ['age', 'wrk_hrs_per_week']
target = 'income_label'
method = 'logistic'

if type(embedding_cols[0]) is tuple:
  emb_dim = dict(embedding_cols)
  embedding_cols = [emb[0] for emb in embedding_cols]
else:
  emb_dim = {e:def_dim for e in embedding_cols}
deep_cols = embedding_cols + continuous_cols
print(deep_cols)

['education', 'relationship', 'workclass', 'occupation', 'native_country', 'age', 'wrk_hrs_per_week']


In [251]:
Y = np.array(raw_data[target])
raw_temp = raw_data.copy()[list(set(wide_cols + deep_cols))]
raw_temp.head()

Unnamed: 0,age,wrk_hrs_per_week,education,occupation,sex,relationship,workclass,native_country
0,39,40,Bachelors,Adm-clerical,Male,Not-in-family,State-gov,United-States
1,50,13,Bachelors,Exec-managerial,Male,Husband,Self-emp-not-inc,United-States
2,38,40,HS-grad,Handlers-cleaners,Male,Not-in-family,Private,United-States
3,53,40,11th,Handlers-cleaners,Male,Husband,Private,United-States
4,28,40,Bachelors,Prof-specialty,Female,Wife,Private,Cuba


In [252]:
print(crossed_cols)
crossed_columns = []
# Build the crossed columns
for cols in crossed_cols:
  colname = '_'.join(cols)
  raw_temp[colname] = raw_temp[cols].apply (lambda x: '-'.join(x), axis=1)
  crossed_columns.append(colname)



# extract the categorical column names that can be one-hot encoded layer
categorical_columns = list(raw_temp.select_dtypes(include=['object']).columns)
categorical_columns

(['education', 'occupation'], ['native_country', 'occupation'])


['education',
 'occupation',
 'sex',
 'relationship',
 'workclass',
 'native_country',
 'education_occupation',
 'native_country_occupation']

In [253]:
raw_temp['education_occupation'].head()
#raw_temp.head()

0        Bachelors- Adm-clerical
1     Bachelors- Exec-managerial
2     HS-grad- Handlers-cleaners
3        11th- Handlers-cleaners
4      Bachelors- Prof-specialty
Name: education_occupation, dtype: object

In [0]:
def label_encode(df, cols=None):
  if cols == None:
    # 여기서 continuous feature들은 걸러진다
    cols = list(df.select_dtypes(include=['object']).columns)
  
  # 모든 컬럼들에 대해 유일한 값들을 뽑는다.
  # 카테고리 컬럼의 각 유일 원소들을 사전에 저장한다. key(string) - value(list(string))
  val_types = dict()
  for c in cols: 
    val_types[c] = df[c].unique()
  
  val_to_idx = dict()
  for k, v in val_types.items():
    # 카테고리 컬럼의 각 유일 원소들에 인덱스를 부여한다.
    val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
  

  for k, v in val_to_idx.items():
    # 입력 데이터들을 인코딩한다. category feature의 원소 -> 인덱스 
    df[k] = df[k].apply(lambda x: v[x])
  
  return val_to_idx, df

In [255]:
# 데이터프레임을 인코딩한다. 그리고 인코딩 딕셔너리를 리턴
# deep_cols
print(raw_temp.head())
encoding_dict, raw_temp = label_encode(raw_temp)

print(raw_temp.head())
print(encoding_dict) # 카테고리 피처들의 원소들에 인덱스가 부여되었다. Continuous Column은 걸러졌다
print(encoding_dict.keys())


encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}
print(encoding_dict.keys())

embeddings_input = []
for k, v in encoding_dict.items():
  embeddings_input.append((k, len(v), emb_dim[k])) # ( embedding_feature_name,  embedding_max_index, embedding_length )
print(embeddings_input)

   age  ...          native_country_occupation
0   39  ...        United-States- Adm-clerical
1   50  ...     United-States- Exec-managerial
2   38  ...   United-States- Handlers-cleaners
3   53  ...   United-States- Handlers-cleaners
4   28  ...               Cuba- Prof-specialty

[5 rows x 10 columns]
   age  wrk_hrs_per_week  ...  education_occupation  native_country_occupation
0   39                40  ...                     0                          0
1   50                13  ...                     1                          1
2   38                40  ...                     2                          2
3   53                40  ...                     3                          2
4   28                40  ...                     4                          3

[5 rows x 10 columns]
{'education': {' Bachelors': 0, ' HS-grad': 1, ' 11th': 2, ' Masters': 3, ' 9th': 4, ' Some-college': 5, ' Assoc-acdm': 6, ' Assoc-voc': 7, ' 7th-8th': 8, ' Doctorate': 9, ' Prof-school': 10, ' 5th-

In [256]:
# embeding_feature_name -> index에 해당하는 dict를 만들어 준다
df_deep = raw_temp[deep_cols] # label_encode()함수를 통해 카테고리 원소들을 인덱스로 이미 변환 했다.
deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
deep_column_idx

{'age': 5,
 'education': 0,
 'native_country': 4,
 'occupation': 3,
 'relationship': 1,
 'workclass': 2,
 'wrk_hrs_per_week': 6}

In [257]:
# continuous column들에 대해 standardization을 한다.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for cc in continuous_cols:
  df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))

df_deep.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,education,relationship,workclass,occupation,native_country,age,wrk_hrs_per_week
0,0,0,0,0,0,0.030671,-0.035429
1,0,1,1,1,0,0.837109,-2.222153
2,1,0,2,2,0,-0.042642,-0.035429
3,2,1,2,2,0,1.057047,-0.035429
4,0,2,2,3,1,-0.775768,-0.035429


In [258]:
df_wide = raw_temp[wide_cols + crossed_columns]
del(raw_temp)

# 카테고리 피처들만 추려낸다. 
dummy_cols = [c for c in wide_cols+crossed_columns if c in categorical_columns]
print(dummy_cols)
df_wide.head()
df_wide = pd.get_dummies(df_wide, columns=dummy_cols) # pd.get_dummies() 를 이용해 가변수(dummy var) 만들기
df_wide.head()

['education', 'relationship', 'workclass', 'occupation', 'native_country', 'sex', 'education_occupation', 'native_country_occupation']


Unnamed: 0,age,wrk_hrs_per_week,education_0,education_1,education_2,education_3,education_4,education_5,education_6,education_7,education_8,education_9,education_10,education_11,education_12,education_13,education_14,education_15,relationship_0,relationship_1,relationship_2,relationship_3,relationship_4,relationship_5,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,workclass_8,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,...,native_country_occupation_402,native_country_occupation_403,native_country_occupation_404,native_country_occupation_405,native_country_occupation_406,native_country_occupation_407,native_country_occupation_408,native_country_occupation_409,native_country_occupation_410,native_country_occupation_411,native_country_occupation_412,native_country_occupation_413,native_country_occupation_414,native_country_occupation_415,native_country_occupation_416,native_country_occupation_417,native_country_occupation_418,native_country_occupation_419,native_country_occupation_420,native_country_occupation_421,native_country_occupation_422,native_country_occupation_423,native_country_occupation_424,native_country_occupation_425,native_country_occupation_426,native_country_occupation_427,native_country_occupation_428,native_country_occupation_429,native_country_occupation_430,native_country_occupation_431,native_country_occupation_432,native_country_occupation_433,native_country_occupation_434,native_country_occupation_435,native_country_occupation_436,native_country_occupation_437,native_country_occupation_438,native_country_occupation_439,native_country_occupation_440,native_country_occupation_441
0,39,40,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,50,13,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,38,40,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,53,40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,28,40,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
from sklearn.model_selection import train_test_split

seed = 1981
x_train_deep, x_test_deep = train_test_split(df_deep.values, test_size=0.3, random_state=seed)
x_train_wide, x_test_wide = train_test_split(df_wide.values, test_size=0.3, random_state=seed)
y_train, y_test = train_test_split(Y, test_size=0.3, random_state=seed)

In [0]:
from collections import namedtuple

wd_dataset = dict()
train_dataset = namedtuple('train_dataset', 'wide, deep, labels')
test_dataset = namedtuple('test_dataset', 'wide, deep, labels')
wd_dataset['train_dataset'] = train_dataset(x_train_wide, x_train_deep, y_train)
wd_dataset['test_dataset'] = test_dataset(x_test_wide, x_test_deep, y_test)
wd_dataset['embeddings_input'] = embeddings_input
wd_dataset['deep_column_idx'] = deep_column_idx
wd_dataset['encoding_dict'] = encoding_dict

## 모델을 만들어 보자
### Wide

In [262]:
# Wide Model
import torch.nn as nn
import torch.nn.functional as F

wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class = 1
wide_part = nn.Linear(wide_dim, n_class)
print(wide_part)

Linear(in_features=751, out_features=1, bias=True)


In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable 
from torch.utils.data import DataLoader


class Wide(nn.Module):
  def __init__(self, wide_dim, n_class):
    super(Wide, self).__init__()
    self.wide_dim = wide_dim
    self.n_class = n_class
    self.linear = nn.Linear(self.wide_dim, self.n_class)

  def forward(self, x):
    return F.sigmoid(self.linear(x))

In [264]:
wide_model = Wide(wide_dim, n_class)
print(wide_model)

Wide(
  (linear): Linear(in_features=751, out_features=1, bias=True)
)


In [265]:
train_dataset = np.hstack([wd_dataset['train_dataset'].labels.reshape(-1,1), wd_dataset['train_dataset'].wide])
train_dataset

array([[ 0, 49, 40, ...,  0,  0,  0],
       [ 0, 67, 25, ...,  0,  0,  0],
       [ 0, 48, 40, ...,  0,  0,  0],
       ...,
       [ 1, 73, 50, ...,  0,  0,  0],
       [ 0, 45, 37, ...,  0,  0,  0],
       [ 0, 40, 45, ...,  0,  0,  0]])

In [0]:
optimizer = torch.optim.Adam(wide_model.parameters())
batch_size = 64
n_epochs = 10
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

In [267]:
for epoch in range(n_epochs):
  total = 0
  correct = 0

  for i, batch in enumerate(train_loader):
    x_w = Variable(batch[:, 1:]).float()
    y = Variable(batch[:, 0]).float()

    optimizer.zero_grad() # zeroes the gradient buffers of all parameters
    y_pred = wide_model(x_w)

    loss = F.binary_cross_entropy(y_pred, y)
    loss.backward()
    optimizer.step()

    total += y.size(0)
    y_pred_cat = (y_pred > 0.5).squeeze(1).float()
  
    correct += float((y_pred_cat == y).sum().item())

  print('Epoch {} of {}, Loss: {}, Accuracy: {}'.format(epoch+1, n_epochs, round(loss.item(),3), round(correct/total,4)))

  if sys.path[0] == '':
  if sys.path[0] == '':


Epoch 1 of 10, Loss: 0.388, Accuracy: 0.7202
Epoch 2 of 10, Loss: 0.374, Accuracy: 0.8
Epoch 3 of 10, Loss: 0.372, Accuracy: 0.8181
Epoch 4 of 10, Loss: 0.692, Accuracy: 0.8259
Epoch 5 of 10, Loss: 0.209, Accuracy: 0.8284
Epoch 6 of 10, Loss: 0.344, Accuracy: 0.8315
Epoch 7 of 10, Loss: 0.183, Accuracy: 0.8321
Epoch 8 of 10, Loss: 0.347, Accuracy: 0.8334
Epoch 9 of 10, Loss: 0.506, Accuracy: 0.8352
Epoch 10 of 10, Loss: 0.286, Accuracy: 0.8362


### Deep

In [268]:
print(wd_dataset['embeddings_input'])
print(wd_dataset['deep_column_idx'])

[('education', 16, 10), ('occupation', 15, 10), ('relationship', 6, 8), ('workclass', 9, 10), ('native_country', 42, 12)]
{'education': 0, 'relationship': 1, 'workclass': 2, 'occupation': 3, 'native_country': 4, 'age': 5, 'wrk_hrs_per_week': 6}


In [269]:
col_name, unique_vals, n_emb = wd_dataset['embeddings_input'][0]
emb_layer = nn.Embedding(unique_vals, n_emb)
print(emb_layer)

Embedding(16, 10)


In [0]:
class Deep(nn.Module):
  def __init__(self, embeddings_input, continuous_cols, deep_column_idx, hidden_layers, n_class):
    super(Deep, self).__init__()
    self.deep_column_idx = deep_column_idx
    self.embeddings_input = embeddings_input
    self.continuous_cols = continuous_cols
    self.hidden_layers = hidden_layers
    self.n_class = n_class

    for col, val, dim in self.embeddings_input:
      setattr(self, 'emb_layer_' + col, nn.Embedding(val, dim))

      input_emb_dim = np.sum([emb[2] for emb in self.embeddings_input])
      self.linear_1 = nn.Linear(input_emb_dim + len(continuous_cols), self.hidden_layers[0])
      for i, h in enumerate(self.hidden_layers[1:],1):
        setattr(self, 'linear_' + str(i+1), nn.Linear( self.hidden_layers[i-1], self.hidden_layers[i]))

      self.output = nn.Linear(self.hidden_layers[-1], n_class)
  
  def forward(self, x):
    pass