# Embedding 기반 encoder
data
*   data/vector_*{vector_size}*_embeding_X.csv
*   data/vector_*{vector_size}*_embeding_X_test.csv

model 
*   label_encoding_mutation_model_vector_*{vector_size}*.w2v

In [21]:
# Dimensionality of the Word2Vec embeddings. The value is also interpolated
# into the saved model file name and the output CSV file names.
vector_size = 100

# Import library

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

# Load Data

In [23]:
# Read the train and test tables from CSV files under ../data.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Data Preprocessing

In [24]:
# SUBCLASS is categorical, so map each class name to an integer code.
le_subclass = LabelEncoder()
le_subclass.fit(train['SUBCLASS'])
train['SUBCLASS'] = le_subclass.transform(train['SUBCLASS'])

# Print the learned mapping: a label's position in classes_ is its code.
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: ACC, 변환된 숫자: 0
원래 레이블: BLCA, 변환된 숫자: 1
원래 레이블: BRCA, 변환된 숫자: 2
원래 레이블: CESC, 변환된 숫자: 3
원래 레이블: COAD, 변환된 숫자: 4
원래 레이블: DLBC, 변환된 숫자: 5
원래 레이블: GBMLGG, 변환된 숫자: 6
원래 레이블: HNSC, 변환된 숫자: 7
원래 레이블: KIPAN, 변환된 숫자: 8
원래 레이블: KIRC, 변환된 숫자: 9
원래 레이블: LAML, 변환된 숫자: 10
원래 레이블: LGG, 변환된 숫자: 11
원래 레이블: LIHC, 변환된 숫자: 12
원래 레이블: LUAD, 변환된 숫자: 13
원래 레이블: LUSC, 변환된 숫자: 14
원래 레이블: OV, 변환된 숫자: 15
원래 레이블: PAAD, 변환된 숫자: 16
원래 레이블: PCPG, 변환된 숫자: 17
원래 레이블: PRAD, 변환된 숫자: 18
원래 레이블: SARC, 변환된 숫자: 19
원래 레이블: SKCM, 변환된 숫자: 20
원래 레이블: STES, 변환된 숫자: 21
원래 레이블: TGCT, 변환된 숫자: 22
원래 레이블: THCA, 변환된 숫자: 23
원래 레이블: THYM, 변환된 숫자: 24
원래 레이블: UCEC, 변환된 숫자: 25


In [25]:
# Target: the SUBCLASS column (already integer-encoded by LabelEncoder).
y_subclass = train['SUBCLASS']
# Features: every column except the target and the sample ID. These columns
# are categorical as well, so they still need a suitable encoding.
X = train.drop(columns=['SUBCLASS', 'ID'])

* 이 코드는 다음과 같이 작동합니다:
1. 데이터를 로드합니다.
2. 샘플의 변이 데이터를 전처리하여 "유전자_변이" 형식의 문자열 리스트로 변환합니다.
3. Word2Vec 모델을 사용하여 변이 임베딩을 학습합니다. 여기서는 `vector_size`(현재 100)차원 벡터를 사용하며, 이는 조정 가능합니다.
4. 각 변이에 대한 임베딩을 가져오는 함수와 각 샘플의 임베딩을 계산하는 함수를 정의합니다. 샘플 임베딩은 해당 샘플의 모든 변이 임베딩의 평균으로 계산됩니다.
5. 모든 샘플에 대한 임베딩을 생성합니다.
6. SUBCLASS를 숫자로 인코딩합니다.
7. 최종 데이터셋을 생성합니다. 이 데이터셋은 샘플 ID, 임베딩 벡터, 인코딩된 SUBCLASS를 포함합니다.
* 이 방법의 장점은 다음과 같습니다:
1. 변이 간의 의미적 관계를 포착할 수 있습니다.
2. 고차원 데이터를 저차원으로 축소할 수 있습니다.
3. 새로운 또는 희귀한 변이에 대해서도 의미 있는 표현을 제공할 수 있습니다.
* 단점은 다음과 같습니다:
1. 충분한 데이터가 없으면 임베딩의 품질이 떨어질 수 있습니다.
2. 하이퍼파라미터 튜닝(예: vector_size, window 등)이 필요할 수 있습니다.

이 인코딩 방법을 사용한 후, 결과 데이터셋을 머신러닝 모델의 입력으로 사용할 수 있습니다.

In [26]:
# Mutation preprocessing
def preprocess_mutations(row):
    """Return "<gene>_<mutation>" tokens for the non-wild-type entries of a row.

    Args:
        row: Mapping-like object whose ``items()`` yields
            ``(gene, mutation)`` pairs, e.g. one row of the feature table.

    Returns:
        list[str]: One token per pair whose mutation value differs from
        ``'WT'``, in iteration order.
    """
    tokens = []
    for gene, mutation in row.items():
        if mutation != 'WT':
            tokens.append(f"{gene}_{mutation}")
    return tokens

In [27]:
# Convert every sample (row of X) into its list of mutation tokens.
mutation_sequences = X.apply(preprocess_mutations, axis=1).tolist()

In [28]:
# Train Word2Vec, treating each sample's token list as one sentence.
# min_count=1 keeps tokens that occur only once in the vocabulary.
# NOTE(review): no seed is passed and workers=4, so the learned vectors are
# presumably not reproducible from run to run — confirm whether that matters.
model = Word2Vec(sentences=mutation_sequences, vector_size=vector_size, window=5, min_count=1, workers=4)

In [29]:
# Mutation embedding lookup
def get_mutation_embedding(mutation):
    """Return the vector of one mutation token from the notebook-level model.

    Args:
        mutation: Token to look up in ``model.wv``.

    Returns:
        The stored vector when the token is known to ``model.wv``; otherwise
        a zero vector of length ``model.vector_size``.
    """
    keyed_vectors = model.wv
    if mutation not in keyed_vectors:
        # Unknown token: fall back to an all-zero vector.
        return np.zeros(model.vector_size)
    return keyed_vectors[mutation]

In [30]:
# Sample embedding
def get_sample_embedding(mutations):
    """Average the vectors of a sample's mutation tokens.

    Args:
        mutations: List of mutation tokens belonging to one sample.

    Returns:
        The element-wise mean of the per-token vectors returned by
        ``get_mutation_embedding``, or a zero vector of length
        ``model.vector_size`` when there are no tokens.
    """
    vectors = list(map(get_mutation_embedding, mutations))
    if not vectors:
        # No mutations at all: nothing to average.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [31]:
# Embed every training sample.
sample_embeddings = list(map(get_sample_embedding, mutation_sequences))

# Pack the vectors into a DataFrame with one embed_<i> column per dimension.
embedding_X = pd.DataFrame(
    sample_embeddings,
    columns=[f'embed_{i}' for i in range(model.vector_size)],
)

In [32]:
# Display the training embedding table as the cell output.
embedding_X

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_90,embed_91,embed_92,embed_93,embed_94,embed_95,embed_96,embed_97,embed_98,embed_99
0,-0.001129,0.000582,0.000447,-0.000238,-0.000709,-0.000965,-0.001345,0.002244,-0.001245,0.000941,...,-0.001586,0.000280,-0.000369,-0.000753,-0.000025,-0.000537,-0.002210,0.000635,0.001316,0.000986
1,-0.004093,-0.000638,-0.002105,0.000170,-0.001026,-0.000330,0.000738,0.000376,-0.001659,-0.000450,...,0.000033,-0.001717,0.002820,-0.001457,0.001160,-0.001360,0.000384,0.002086,0.000502,0.001064
2,0.000853,-0.000413,-0.000039,-0.000454,-0.000490,-0.000972,-0.000729,0.000609,-0.000690,0.000540,...,0.000274,0.001413,0.000347,0.000351,0.000494,-0.000446,-0.000208,-0.000402,0.000369,0.000714
3,-0.002286,0.003273,-0.000205,-0.001424,-0.002244,0.002207,-0.001496,-0.003482,0.000563,0.003463,...,0.001586,-0.001404,-0.003384,-0.001913,-0.001637,-0.002290,-0.001759,-0.000386,0.000941,0.002159
4,-0.000234,-0.001336,0.000386,0.000899,0.002214,0.001363,0.001679,0.001675,-0.000812,0.002045,...,-0.000476,0.000995,0.000920,-0.000197,-0.000370,0.000238,0.001798,0.002282,-0.000100,0.000584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,-0.000454,-0.000521,-0.000143,0.000207,0.000739,0.000524,0.001881,-0.000353,0.000975,-0.000389,...,-0.000364,0.000208,0.000767,-0.000594,-0.001215,-0.000622,0.000446,0.000194,-0.000211,0.000256
6197,-0.001165,-0.002464,0.000024,0.002669,-0.002302,-0.000028,0.000254,-0.000077,0.001693,0.002579,...,0.002234,0.000420,0.001624,-0.002102,-0.003690,0.000087,-0.000291,0.000780,-0.001051,-0.000294
6198,0.001183,0.000273,0.000025,-0.001374,0.002934,0.000702,-0.001328,0.000574,-0.002885,-0.000903,...,0.000310,-0.000454,0.001892,-0.000737,-0.001684,0.002488,0.000905,0.001542,-0.000469,-0.000451
6199,-0.000064,-0.001094,-0.001524,-0.000040,0.000551,0.001924,-0.000296,0.003042,0.001101,0.002890,...,0.004014,0.001711,-0.002476,-0.001579,0.001287,0.005739,0.001672,0.002546,0.003090,-0.002197


In [33]:
# Persist the trained Word2Vec model; the file name carries vector_size.
model.save(f"label_encoding_mutation_model_vector_{vector_size}.w2v")

In [34]:
# Load the Word2Vec model back from the same file name.
loaded_model = Word2Vec.load(f"label_encoding_mutation_model_vector_{vector_size}.w2v")

In [35]:
# Test features: every column of the test table except the sample ID.
X_test = test.drop(columns=["ID"])

In [36]:
def encode_test_sample(sample, model):
    """Embed one sample using the vectors of the given Word2Vec model.

    The previous body delegated to ``get_sample_embedding``, which reads the
    notebook-level ``model`` and therefore ignored the ``model`` argument.
    The lookup is done here so that the model passed in is the one used.

    Args:
        sample: One row of mutation data; forwarded to
            ``preprocess_mutations`` to obtain its tokens.
        model: Object exposing ``wv`` (supports ``in`` and indexing by
            token) and ``vector_size``, e.g. a gensim ``Word2Vec`` model.

    Returns:
        numpy.ndarray: Element-wise mean of the token vectors. Tokens absent
        from ``model.wv`` contribute a zero vector; a sample without any
        token yields an all-zero vector of length ``model.vector_size``.
    """
    mutations = preprocess_mutations(sample)
    keyed_vectors = model.wv
    zero_vector = np.zeros(model.vector_size)
    vectors = [
        keyed_vectors[mut] if mut in keyed_vectors else zero_vector
        for mut in mutations
    ]
    if not vectors:
        # Nothing to average for a sample with no non-WT entries.
        return zero_vector
    return np.mean(vectors, axis=0)

In [37]:
# Encode the test data: one embedding per row of X_test.
test_embeddings = [encode_test_sample(sample, loaded_model) for _, sample in X_test.iterrows()]

In [38]:
# Pack the test embeddings into a DataFrame with one embed_<i> column per
# dimension of the loaded model.
embedding_X_test = pd.DataFrame(test_embeddings, columns=[f'embed_{i}' for i in range(loaded_model.vector_size)])

# Save in data

In [39]:
# Write the training embeddings to ../data without the row index; the
# utf-8-sig encoding prefixes the file with a byte-order mark.
embedding_X.to_csv(f'../data/vector_{vector_size}_embeding_X.csv', encoding='UTF-8-sig', index=False)

In [40]:
# Write the test embeddings to ../data without the row index; the
# utf-8-sig encoding prefixes the file with a byte-order mark.
embedding_X_test.to_csv(f'../data/vector_{vector_size}_embeding_X_test.csv', encoding='UTF-8-sig', index=False)