<font color="#CC3D3D"><p>
# Making Word2Vec Features
    
<font color="black"><p>
- [W2V Feature](#W2V-Feature)
    - [corner_nm](#corner_nm)
    - [brd_nm](#brd_nm)
    - [pc_nm](#pc_nm)
    - [part_nm](#part_nm)
    - [customer_info](#customer_info)
- [Feature Merge](#Feature-Merge)

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('../input/X_train.csv', encoding = 'cp949')
test = pd.read_csv('../input/X_test.csv', encoding = 'cp949')
y_train = pd.read_csv('../input/y_train.csv', encoding = 'cp949')  

In [3]:
data = pd.concat([train, test]).reset_index(drop=True)

## corner_nm

In [6]:
%%writefile word2vec_corner.py

### Imports
import pandas as pd
import numpy as np
import os


### Read data
train = pd.read_csv('../input/X_train.csv', encoding = 'cp949')
test = pd.read_csv('../input/X_test.csv', encoding = 'cp949')


### Make corpus
p_level = 'corner_nm'  # 상품 분류 수준

# W2V 학습데이터가 부족하여 구매한 상품 목록으로부터 n배 oversampling을 수행
def oversample(x, n, seed=0):
    if n == 0:
        return list(x)
    uw = np.unique(x)
    bs = np.array([])
    np.random.seed(seed)
    for j in range(n):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=True))  # 복원추출
    return list(bs)

train_corpus = list(train.groupby('custid')[p_level].agg(oversample, 20))
test_corpus = list(test.groupby('custid')[p_level].agg(oversample, 20))


### Training the Word2Vec model
num_features = 100 # 단어 벡터 차원 수
min_word_count = 1 # 최소 단어 수
context = 5 # 학습 윈도우(인접한 단어 리스트) 크기

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(train_corpus, 
                        vector_size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)
# 필요없는 메모리 unload
w2v.init_sims(replace=True)


### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([
                np.max([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.min([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),                
                #np.std([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)                
            ]) 
            for words in X
        ]) 

# W2V 기반 feature 생성
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

train_features.columns = ['v'+f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v'+f'{c+1:03d}' for c in test_features.columns]

# 학습용과 제출용 데이터로 분리
X_train_corner = pd.concat([pd.DataFrame({'custid': np.sort(train['custid'].unique())}), train_features], axis=1)#.to_csv('X_train_buyer.csv', index=False)
X_test_corner = pd.concat([pd.DataFrame({'custid': np.sort(test['custid'].unique())}), test_features], axis=1)#.to_csv('X_test_buyer.csv', index=False)

Overwriting word2vec_corner.py


In [None]:
%run word2vec_corner.py

## brd_nm

In [None]:
%%writefile word2vec_brd.py

### Imports
import pandas as pd
import numpy as np
import os


### Read data
train = pd.read_csv('X_train.csv', encoding='cp949')
test = pd.read_csv('X_test.csv', encoding='cp949')


### Make corpus
p_level = 'brd_nm'  # 상품 분류 수준

# W2V 학습데이터가 부족하여 구매한 상품 목록으로부터 n배 oversampling을 수행
def oversample(x, n, seed=0):
    if n == 0:
        return list(x)
    uw = np.unique(x)
    bs = np.array([])
    np.random.seed(seed)
    for j in range(n):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=True))
    return list(bs)

train_corpus = list(train.groupby('custid')[p_level].agg(oversample, 20))
test_corpus = list(test.groupby('custid')[p_level].agg(oversample, 20))


### Training the Word2Vec model
num_features = 300 # 단어 벡터 차원 수
min_word_count = 1 # 최소 단어 수
context = 5 # 학습 윈도우(인접한 단어 리스트) 크기

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(train_corpus, 
                        vector_size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)
# 필요없는 메모리 unload
w2v.init_sims(replace=True)


### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([
                np.max([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.min([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),                
                #np.std([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)                
            ]) 
            for words in X
        ]) 

# W2V 기반 feature 생성
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

train_features.columns = ['v'+f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v'+f'{c+1:03d}' for c in test_features.columns]

# 학습용과 제출용 데이터로 분리
X_train_brd = pd.concat([pd.DataFrame({'custid': np.sort(train['custid'].unique())}), train_features], axis=1)#.to_csv('X_train_buyer.csv', index=False)
X_test_brd = pd.concat([pd.DataFrame({'custid': np.sort(test['custid'].unique())}), test_features], axis=1)#.to_csv('X_test_buyer.csv', index=False)

In [2]:
%run word2vec_brd.py

  w2v.init_sims(replace=True)


## pc_nm

In [37]:
%%writefile word2vec_pc.py

### Imports
import pandas as pd
import numpy as np
import os


### Read data
train = pd.read_csv('X_train.csv', encoding='cp949')
test = pd.read_csv('X_test.csv', encoding='cp949')


### Make corpus
p_level = 'pc_nm'  # 상품 분류 수준

# W2V 학습데이터가 부족하여 구매한 상품 목록으로부터 n배 oversampling을 수행
def oversample(x, n, seed=0):
    if n == 0:
        return list(x)
    uw = np.unique(x)
    bs = np.array([])
    np.random.seed(seed)
    for j in range(n):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=True))
    return list(bs)

train_corpus = list(train.groupby('custid')[p_level].agg(oversample, 20))
test_corpus = list(test.groupby('custid')[p_level].agg(oversample, 20))


### Training the Word2Vec model
num_features = 50 # 단어 벡터 차원 수
min_word_count = 1 # 최소 단어 수
context = 5 # 학습 윈도우(인접한 단어 리스트) 크기

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(train_corpus, 
                        vector_size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)
# 필요없는 메모리 unload
w2v.init_sims(replace=True)


### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([
                np.max([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.min([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),                
                #np.std([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)                
            ]) 
            for words in X
        ]) 

# W2V 기반 feature 생성
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

train_features.columns = ['v'+f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v'+f'{c+1:03d}' for c in test_features.columns]

# 학습용과 제출용 데이터로 분리
X_train_pc = pd.concat([pd.DataFrame({'custid': np.sort(train['custid'].unique())}), train_features], axis=1)#.to_csv('X_train_buyer.csv', index=False)
X_test_pc = pd.concat([pd.DataFrame({'custid': np.sort(test['custid'].unique())}), test_features], axis=1)#.to_csv('X_test_buyer.csv', index=False)

Overwriting word2vec_pc.py


In [38]:
%run word2vec_pc.py

  w2v.init_sims(replace=True)


## part_nm

In [43]:
%%writefile word2vec_part.py

### Imports
import pandas as pd
import numpy as np
import os


### Read data
train = pd.read_csv('X_train.csv', encoding='cp949')
test = pd.read_csv('X_test.csv', encoding='cp949')


### Make corpus
p_level = 'part_nm'  # 상품 분류 수준

# W2V 학습데이터가 부족하여 구매한 상품 목록으로부터 n배 oversampling을 수행
def oversample(x, n, seed=0):
    if n == 0:
        return list(x)
    uw = np.unique(x)
    bs = np.array([])
    np.random.seed(seed)
    for j in range(n):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=True))
    return list(bs)

train_corpus = list(train.groupby('custid')[p_level].agg(oversample, 20))
test_corpus = list(test.groupby('custid')[p_level].agg(oversample, 20))


### Training the Word2Vec model
num_features = 100 # 단어 벡터 차원 수
min_word_count = 1 # 최소 단어 수
context = 5 # 학습 윈도우(인접한 단어 리스트) 크기

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(train_corpus, 
                        vector_size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)
# 필요없는 메모리 unload
w2v.init_sims(replace=True)


### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([
                np.max([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.min([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),                
                #np.std([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)                
            ]) 
            for words in X
        ]) 

# W2V 기반 feature 생성
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

train_features.columns = ['v'+f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v'+f'{c+1:03d}' for c in test_features.columns]

# 학습용과 제출용 데이터로 분리
X_train_part = pd.concat([pd.DataFrame({'custid': np.sort(train['custid'].unique())}), train_features], axis=1)#.to_csv('X_train_buyer.csv', index=False)
X_test_part = pd.concat([pd.DataFrame({'custid': np.sort(test['custid'].unique())}), test_features], axis=1)#.to_csv('X_test_buyer.csv', index=False)

Overwriting word2vec_part.py


In [44]:
%run word2vec_part.py

  w2v.init_sims(replace=True)


## customer_info

In [10]:
%%writefile word2vec_customer_info.py

### Imports
import pandas as pd
import numpy as np
import os


### Read data
train = pd.read_csv('../input/X_train.csv', encoding='cp949')
test = pd.read_csv('../input/X_test.csv', encoding='cp949')
train['customer_info'] = train['brd_nm'].astype(str) + '_' + train['corner_nm'].astype(str) + '_' + train['pc_nm'].astype(str) + '_' + train['part_nm'].astype(str) + '_' + train['str_nm'].astype(str) + '_' + train['team_nm'].astype(str) + '_' + train['buyer_nm'].astype(str)
test['customer_info'] = test['brd_nm'].astype(str) + '_' + test['corner_nm'].astype(str) + '_' + test['pc_nm'].astype(str) + '_' + test['part_nm'].astype(str) + '_' + test['str_nm'].astype(str) + '_' + test['team_nm'].astype(str) + '_' + test['buyer_nm'].astype(str)

### Make corpus
p_level = 'customer_info'  # 상품 분류 수준

# W2V 학습데이터가 부족하여 구매한 상품 목록으로부터 n배 oversampling을 수행
def oversample(x, n, seed=0):
    if n == 0:
        return list(x)
    uw = np.unique(x)
    bs = np.array([])
    np.random.seed(seed)
    for j in range(n):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=True))
    return list(bs)

train_corpus = list(train.groupby('custid')[p_level].agg(oversample, 20))
test_corpus = list(test.groupby('custid')[p_level].agg(oversample, 20))


### Training the Word2Vec model
num_features = 100 # 단어 벡터 차원 수
min_word_count = 1 # 최소 단어 수
context = 5 # 학습 윈도우(인접한 단어 리스트) 크기

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(train_corpus, 
                        vector_size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)
# 필요없는 메모리 unload
w2v.init_sims(replace=True)


### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([
                np.max([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.min([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),                
                #np.std([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)                
            ]) 
            for words in X
        ]) 

# W2V 기반 feature 생성
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

train_features.columns = ['v'+f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v'+f'{c+1:03d}' for c in test_features.columns]

# 학습용과 제출용 데이터로 분리
X_train_customer_info = pd.concat([pd.DataFrame({'custid': np.sort(train['custid'].unique())}), train_features], axis=1)#.to_csv('X_train_buyer.csv', index=False)
X_test_customer_info = pd.concat([pd.DataFrame({'custid': np.sort(test['custid'].unique())}), test_features], axis=1)#.to_csv('X_test_buyer.csv', index=False)

Overwriting word2vec_customer_info.py


In [11]:
%run word2vec_customer_info.py

  w2v.init_sims(replace=True)


# Feature Merge

In [12]:
del X_train_corner['custid']
del X_test_corner['custid']
del X_train_brd['custid']
del X_test_brd['custid']
del X_train_pc['custid']
del X_test_pc['custid']
del X_train_part['custid']
del X_test_part['custid']
del X_train_customer_info['custid']
del X_test_customer_info['custid']

In [13]:
X_train_corner.columns = X_train_corner.columns.map(lambda x : "corner_" + str(x))
X_test_corner.columns = X_test_corner.columns.map(lambda x : "corner_" + str(x))
X_train_brd.columns = X_train_brd.columns.map(lambda x : "brd_" + str(x))
X_test_brd.columns = X_test_brd.columns.map(lambda x : "brd_" + str(x))
X_train_pc.columns = X_train_pc.columns.map(lambda x : "pc_" + str(x))
X_test_pc.columns = X_test_pc.columns.map(lambda x : "pc_" + str(x))
X_train_part.columns = X_train_part.columns.map(lambda x : "part_" + str(x))
X_test_part.columns = X_test_part.columns.map(lambda x : "part_" + str(x))
X_train_customer_info.columns = X_train_customer_info.columns.map(lambda x : "customer_info_" + str(x))
X_test_customer_info.columns = X_test_customer_info.columns.map(lambda x : "customer_info_" + str(x))

In [18]:
w2v_features_train = pd.concat([X_train_corner, X_train_brd, X_train_pc, X_train_part, X_train_buyer], axis=1) ; X_train_w2v
w2v_features_test = pd.concat([X_test_corner, X_test_brd, X_test_pc, X_test_part, X_test_buyer], axis=1) ; X_test_w2v

Unnamed: 0,customer_info_v001,customer_info_v002,customer_info_v003,customer_info_v004,customer_info_v005,customer_info_v006,customer_info_v007,customer_info_v008,customer_info_v009,customer_info_v010,...,part_v291,part_v292,part_v293,part_v294,part_v295,part_v296,part_v297,part_v298,part_v299,part_v300
0,0.126028,0.238448,0.010795,0.129367,0.157828,0.190237,0.161458,0.315062,0.192215,0.175752,...,0.071623,0.001848,-0.078153,0.055440,0.012121,-0.097974,0.023984,0.015453,0.082945,-0.009952
1,0.126246,0.181655,0.175741,0.184657,0.206774,0.242676,0.233091,0.166628,0.209461,0.213504,...,0.004586,0.019767,-0.033093,-0.003008,-0.001773,-0.038660,-0.021493,-0.024507,0.052645,-0.018186
2,0.160387,0.144859,0.138732,0.222700,0.194777,0.205199,0.216914,0.207435,0.208292,0.120357,...,-0.011320,0.057334,0.077167,-0.093443,-0.093308,0.119095,-0.027661,-0.012306,-0.116368,0.025222
3,0.124489,-0.037535,0.138732,0.026085,0.159970,0.104852,0.077204,0.167743,0.084491,-0.111642,...,-0.075549,0.027678,0.120764,-0.060207,-0.114727,0.126929,0.009970,-0.117949,-0.029883,0.004025
4,0.139541,0.168830,0.089678,0.086368,0.132142,0.190312,0.115949,0.142076,0.049963,-0.007831,...,-0.035434,0.023045,0.039544,-0.052808,-0.066477,0.063561,-0.044272,-0.090704,0.004998,0.038559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,0.079857,-0.012189,0.127850,0.017006,0.178856,0.067744,0.059685,0.055509,0.038627,0.077512,...,0.141780,0.031884,-0.017108,0.079073,-0.035915,-0.089745,0.069129,0.030556,0.029815,-0.003495
14376,-0.122372,0.168830,0.048312,0.018549,0.116079,0.020438,-0.067901,-0.003937,-0.015879,-0.007831,...,-0.107623,-0.003805,-0.021734,-0.011623,0.058924,-0.024888,-0.085418,-0.138603,0.177582,0.056818
14377,0.074228,0.082197,-0.003594,-0.051917,0.180146,0.037355,0.149413,0.159084,0.075022,0.121613,...,0.141780,0.031884,-0.017108,0.079073,-0.035915,-0.089745,0.069129,0.030556,0.029815,-0.003495
14378,0.088175,0.194241,0.149394,0.138779,0.169383,0.037109,0.168674,0.119944,0.170581,0.034228,...,-0.013600,0.057094,0.059845,-0.119288,-0.063213,0.107178,-0.040400,-0.058523,-0.092707,0.006307


In [15]:
w2v_features_train.to_csv('w2v_features_train.csv', index=False)
w2v_features_test.to_csv('w2v_features_test.csv', index=False)

<font color="#CC3D3D"><p>
# End