In [None]:
# Mount Google Drive at /content/drive; the paths used by the rest of the
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import json
import numpy as np
from gensim.models import FastText

# Path configuration.
# common_path: directory the input CSV is read from.
# save_path:   directory the trained model and the embedded CSV are written to.
common_path = "/content/drive/MyDrive/기학_팀플젝/박채연/전처리 완료/"
save_path = "/content/drive/MyDrive/기학_팀플젝/박채연/embedding/"

In [None]:
# FastText 모델 학습
def train_fasttext_model(combined_data):
    """Train a skip-gram FastText model on whitespace-tokenized text and save it.

    Args:
        combined_data: Iterable of strings. Each string is split on whitespace
            and used as one training sentence.

    Returns:
        The trained ``FastText`` model. As a side effect, the model is also
        written to ``save_path + 'fasttext100.model'``.
    """
    # FastText is given a list of token lists, one per document.
    tokenized_corpus = [document.split() for document in combined_data]

    hyperparams = dict(
        vector_size=100,  # dimensionality of each word vector
        window=7,         # range of surrounding words used as context
        sg=1,             # 1 = skip-gram, 0 = CBOW
        negative=3,       # number of negative samples
        min_count=5,      # minimum number of occurrences for a word
        epochs=10,        # number of training iterations
    )
    fasttext_model = FastText(sentences=tokenized_corpus, **hyperparams)

    # Persist the model under the notebook-level save_path.
    fasttext_model.save(save_path + 'fasttext100.model')
    return fasttext_model

# 벡터화 함수 정의
def vectorize_text(text, model):
    """Embed a text as the mean of its tokens' word vectors.

    Args:
        text: String to embed; it is split on whitespace into tokens.
        model: Object exposing ``wv`` (supports ``in`` and lookup by token)
            and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of every token found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        no token yields a vector (e.g. empty text).
    """
    word_vectors = model.wv
    found = []
    for word in text.split():
        if word in word_vectors:
            found.append(word_vectors[word])

    # Guard clause: nothing to average, fall back to the zero vector.
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# 데이터 로드 및 벡터화 후 덮어쓰기
def embedding(filename):
    """Train FastText on one CSV and replace its text columns with embeddings.

    Reads ``common_path + filename``, trains a FastText model on the
    concatenated ``newsTitle`` and ``newsContent`` columns, overwrites every
    non-missing title/content cell with its averaged word vector (stored as a
    Python list), and writes the frame to
    ``save_path + filename + "fasttext100"``.

    Args:
        filename: Name of the CSV file inside ``common_path``.
    """
    text_columns = ('newsTitle', 'newsContent')

    df = pd.read_csv(common_path + filename)

    # Training corpus: title and content joined per row, missing values as ''.
    title_text, content_text = (df[column].fillna('') for column in text_columns)
    model = train_fasttext_model(title_text + " " + content_text)

    def to_vector_list(cell):
        # Missing cells are passed through untouched.
        if pd.isna(cell):
            return cell
        return vectorize_text(cell, model).tolist()

    # Overwrite the original text with its vector, one column at a time.
    for column in text_columns:
        df[column] = df[column].map(to_vector_list)

    # NOTE(review): the suffix is appended after the ".csv" extension, so the
    # output name ends in ".csvfasttext100" — confirm this is intended before
    # renaming, since other code may read the file under this name.
    output_path = save_path + filename + "fasttext100"
    df.to_csv(output_path, index=False, encoding='utf-8')

# Run the pipeline on one CSV: train the model, vectorize the text columns,
# and save the resulting CSV.
embedding('sampled_extracted_data_Clickbait_GB_D.csv')

In [None]:
# Load a saved FastText model and inspect it: print the vector for '서울' and
# the words most similar to it.
# NOTE(review): the cells shown in this notebook only save 'fasttext100.model';
# 'fasttext.model' must already exist in save_path from elsewhere — confirm.
model = FastText.load(save_path+'fasttext.model')
print(model.wv['서울'])
print(model.wv.most_similar('서울'))

[ 0.15297505 -0.03969152  0.08827855  0.12931383 -0.07131296  0.08875901
  0.1412154   0.12433794  0.14529976  0.11402723  0.04023464 -0.05001029
 -0.0004515  -0.02202269 -0.11124094  0.15260513 -0.06979501 -0.03460567
 -0.06699899 -0.03216155 -0.04092739  0.10254266  0.00328359 -0.19958758
  0.13955915  0.02570171  0.11257134 -0.05356277 -0.1283233  -0.0614771
  0.01426351  0.03582283  0.10514344  0.10191986  0.13869551 -0.00471917
  0.02070667 -0.08990808 -0.05372972  0.03732272 -0.17482239  0.00508278
 -0.20427671 -0.24731621  0.02877089  0.05576922  0.0120332   0.05504047
  0.27455562  0.03932856  0.04408191 -0.00844462  0.06462041  0.01002549
  0.06623228  0.00386491  0.12138618 -0.06202064  0.10145557 -0.04387568
  0.11602737  0.00954842 -0.05423176 -0.05710137 -0.3262819   0.06746909
  0.01672877  0.21101162 -0.03167107  0.1702346  -0.12602016 -0.02317284
 -0.10366213 -0.08085754 -0.13718608 -0.01209076  0.05961124 -0.07045184
  0.08212541 -0.19261254 -0.1495645   0.01608928 -0.

In [None]:
# Load the model file written by train_fasttext_model ('fasttext100.model') and
# print the vector for '서울' and the words most similar to it.
model = FastText.load(save_path+'fasttext100.model')
print(model.wv['서울'])
print(model.wv.most_similar('서울'))

[ 1.83110371e-01 -7.69571960e-02 -6.46172762e-02 -1.31645822e-04
 -1.32236496e-01 -4.84803282e-02  1.81473479e-01  3.67278270e-02
 -3.63017172e-02  1.47450432e-01 -2.91968379e-02 -1.64860830e-01
  1.34810228e-02  8.95825997e-02  4.95475251e-03 -3.59788090e-01
  4.07129079e-01 -1.27005458e-01 -1.42470568e-01 -1.28258944e-01
 -1.28617138e-01 -3.47385183e-02 -3.39676812e-02  7.09071159e-02
 -2.02852592e-01 -3.58128101e-02 -1.94359720e-01  9.04235840e-02
  2.35087499e-02 -1.51670780e-02  3.45672578e-01 -7.04119354e-02
 -1.58114973e-02  6.75714761e-02  1.26671091e-01  4.05469770e-03
 -3.75138298e-02  9.24652144e-02  4.75994088e-02 -1.04295000e-01
 -3.89975846e-01 -4.53402027e-02 -2.65603244e-01  5.12083098e-02
 -9.03090090e-02 -2.89927453e-01 -2.17674211e-01  1.67816468e-02
  1.86402351e-02  1.39839232e-01  6.67992234e-02  2.00346615e-02
 -9.91572738e-02 -2.43537486e-01 -2.70329595e-01 -8.06845073e-03
 -6.15424365e-02 -4.90635745e-02 -5.37111163e-01  1.11187942e-01
 -5.27992670e-04 -4.04365