#### CUDA 사용 여부 확인

In [1]:
import torch

# CUDA 사용 가능 여부 확인
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
print("Torch version:{}".format(torch.__version__))
print("cuda version: {}".format(torch.version.cuda))
print("cudnn version:{}".format(torch.backends.cudnn.version()))

Using device: cuda
Torch version:2.3.0+cu121
cuda version: 12.1
cudnn version:8906


#### Transformers, NLTK 라이브러리 설치

In [2]:
!pip install -U sentence-transformers
!pip install nltk

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [3]:
import pandas as pd
import numpy as np
import random
import re

In [4]:
SEED = 12

np.random.seed(SEED)
random.seed(SEED)

#### 구글 드라이브 연동

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
df = pd.read_csv('/content/drive/MyDrive/open/news.csv')

In [7]:
df.head()

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


#### title(제목), contents(본문) 을 :로 구분해 연결

In [8]:
df['text'] = df['title'] + " :" + df['contents']

df['text'].head()

0    Spanish coach facing action in race row :MADRI...
1    Bruce Lee statue for divided city :In Bosnia, ...
2    Only Lovers Left Alive's Tilda Swinton Talks A...
3    Macromedia contributes to eBay Stores :Macrome...
4    Qualcomm plans to phone it in on cellular repa...
Name: text, dtype: object

#### 영어 불용어 및 기본형태로 변환하기 위한 리소스 다운로드

In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


#### 전처리 1

In [11]:
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 내용과 무관한 태그 제거
    text = re.sub(r'target=\/\S+','' , text)
    text = re.sub(r'&lt\;\S+', '', text)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    # 불용어 제거
    text = word_tokenize(text)
    tokens_without_sw = [word for word in text if not word in all_stopwords]
    text = (" ").join(tokens_without_sw)

    return text.lower()

#### 전처리 2(6/5추가)

In [12]:
# 단일 문자 제거
def remove_single_char(text, threshold=1):
    words = word_tokenize(text)
    text = ' '.join([word for word in words if len(word) > threshold])
    return text

# 구두점 제거
def remove_punctuation(text):
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

# 불필요한 공백 제거
def remove_extra_whitespaces(text):
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

#### 전처리 3(6/12추가)

In [13]:
# Lemmatization
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, #명사
               "V": wordnet.VERB, #동사
               "J": wordnet.ADJ,  #형용사
               "R": wordnet.ADV   #부사
               }

def lemmatize_words(text):
    pos_tagged_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

* 각 단어를 기본 형태(lemma)로 변환하는 작업
* 예를 들어 동사의 시제, 명사의 복수형 등을 통일
* ex) runnung -> run , quickly -> quickly(부사는 원형이 동일)
* **제출 결과 score가 떨어지는 결과 초래**

#### 전처리 실행

In [14]:
df['text'] = df['text'].apply(preprocess_text)
df['text'] = df['text'].apply(remove_single_char)
df['text'] = df['text'].apply(remove_punctuation)
df['text'] = df['text'].apply(remove_extra_whitespaces)
# df["text"] = df["text"].apply(lemmatize_words)
print(df['text'].head())

0    spanish coach facing action race row madrid af...
1    bruce lee statue divided city in bosnia one ma...
2    only lovers left alive s tilda swinton talks a...
3    macromedia contributes ebay stores macromedia ...
4    qualcomm plans phone cellular repairs over the...
Name: text, dtype: object


#### gte-large Model


In [15]:
from sentence_transformers import SentenceTransformer

device = torch.device("cuda")

model = SentenceTransformer('thenlper/gte-large', device=device)
sentence_embeddings = model.encode(df["text"].tolist())

df_embeddings = pd.DataFrame(sentence_embeddings)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

* 텍스트를 1024차원으로 임베딩

#### MinMaxScaler 정규화

In [16]:
# MinMaxScaler 정규화
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler 객체 생성
scaler = MinMaxScaler()

# 임베딩 데이터프레임 정규화
df_embeddings_nor = pd.DataFrame(scaler.fit_transform(df_embeddings), columns=df_embeddings.columns)

# 정규화된 데이터프레임 출력
print(df_embeddings_nor.shape)

(60000, 1024)


#### PCA 차원 축소

In [45]:
# PCA 차원 축소
from sklearn.decomposition import PCA

pca = PCA(n_components=200, random_state=SEED)

embed_pca_100 = PCA(n_components=100, random_state=SEED).fit_transform(df_embeddings_nor)

embed_pca_200 = PCA(n_components=200, random_state=SEED).fit_transform(df_embeddings_nor)

embed_pca_500 = PCA(n_components=500, random_state=SEED).fit_transform(df_embeddings_nor)

df_embed_pca_100 = pd.DataFrame(embed_pca_100)

df_embed_pca_200 = pd.DataFrame(embed_pca_200)

df_embed_pca_500 = pd.DataFrame(embed_pca_500)

print(df_embeddings_nor.shape)
print(df_embed_pca_100.shape)
print(df_embed_pca_200.shape)
print(df_embed_pca_500.shape)

(60000, 1024)
(60000, 100)
(60000, 200)
(60000, 500)


#### K-Means Clustering

In [47]:
from sklearn.cluster import KMeans

# KMeans
kmeans = KMeans(n_clusters=6, random_state=SEED)

k_means_res = kmeans.fit_predict(df_embeddings_nor)
k_means_pca_100_res = kmeans.fit_predict(df_embed_pca_100)
k_means_pca_200_res = kmeans.fit_predict(df_embed_pca_200)
k_means_pca_500_res = kmeans.fit_predict(df_embed_pca_500)

df['k_means'] = k_means_res
df['k_means_pca1'] = k_means_pca_100_res
df['k_means_pca2'] = k_means_pca_200_res
df['k_means_pca5'] = k_means_pca_500_res



|Dimension |Lemma|Result|
|------|--|------|
|1024|X|0.787|
|1024|O|0.780(1) |
|100|X|0.786|
|100|O|0.779|
|200|X|0.788|
|200|O|0.780(2) |
|500|X|0.787|
|500|O|0.780(2) |

* K-Means Clustering Model에서는 1024, 500, 200, 100 차원 데이터를 각각 진행

In [49]:
print(df[["k_means", "k_means_pca1", "k_means_pca2", "k_means_pca5"]])

       k_means  k_means_pca1  k_means_pca2  k_means_pca5
0            0             2             4             1
1            5             1             3             4
2            4             4             0             5
3            2             0             5             0
4            2             0             5             0
...        ...           ...           ...           ...
59995        0             2             4             1
59996        5             1             3             4
59997        0             2             4             1
59998        3             5             1             2
59999        4             4             0             5

[60000 rows x 4 columns]


#### Category

|category|info|
|------|---|
|0|Business|
|1|Entertainment|
|2|Politics|
|3|Sport|
|4|Tech|
|5|World|

In [72]:
head_5_idx = df[df['k_means'] == 5]['text'].head(10).index
for i in range(10):
    i_data = df['text'][head_5_idx[i]]
    print(f'{i+1}. {i_data}')

1. bruce lee statue divided city in bosnia one man hero often another man villain citizens decided honour one serbs croats muslims look kung fu great bruce lee
2. fischer s fiancee marriage plans genuine ap ap former chess champion bobby fischer s announcement thathe engaged japanese woman could win sympathy among japanese officials help avoid deportation united states fiancee one supporters said tuesday
3. israel kills palestinians big gaza incursion reuters reuters israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip third time as many months quell palestinian rocket fire israel
4. the folly sole superpower writ small authors think little imperial folly s backstory in years invading iraq disbanding saddam hussein s military u s sunk billion standing new iraqi army
5. oil falls below nigeria cease fire london reuters oil prices dropped record highs barrel wednesday u s government reported surprise increase crude stocks rebels ni

In [73]:
head_5_idx = df[df['k_means_pca1'] == 5]['text'].head(10).index
for i in range(10):
    i_data = df['text'][head_5_idx[i]]
    print(f'{i+1}. {i_data}')

1. bump stock maker resumes sales one month after las vegas mass shooting authors move along nothing see
2. congress spikes handout for private equity authors wall street firms almost big
3. deere s color is green with big tractors big sales big earnings deere s hoeing profitable row
4. kmart sears merger price quality average customers know thing ministers high finance understand it price shoppers thursday billings sears store eager find proposed
5. agencies postpone issuing new rules until after election federal agencies delayed range proposed regulations food safety corporate governance election day
6. bribery considered halliburton notes suggest an internal halliburton co investigation uncovered handwritten notes suggesting former employees considered offering bribes nigerian officials decade ago secure work billion project build natural gas liquefaction plant
7. abn amro profit rises buoyed sale asia stake update abn amro holding nv largest dutch bank said profit rose percent thir

In [74]:
head_5_idx = df[df['k_means_pca2'] == 5]['text'].head(10).index
for i in range(10):
    i_data = df['text'][head_5_idx[i]]
    print(f'{i+1}. {i_data}')

1. macromedia contributes ebay stores macromedia announced special version contribute website editing application designed simplify creation customisation ebay stores
2. qualcomm plans phone cellular repairs over the air fixes cell phones comes qualcomm s cdma
3. thomson back both blu ray hd dvd company one core backers blu ray also support rival format
4. ftc files first lawsuit against spyware concerns the federal trade commission formally announced yesterday first assault spyware bits computer code surreptitiously install computers internet users
5. sony psp draws crowds lines first day reuters reuters game fans stood lines chilly tokyo night among first world get their hands sony corp s playstation portable consumer electronics firm s first handheld game machine
6. is e voting secure cbs nearly one third voters many million people expected cast ballots electronically next week presidential election
7. photos macexpo with exhibitors including apple adobe bose epson hp microsoft quar

In [75]:
head_5_idx = df[df['k_means_pca5'] == 5]['text'].head(10).index
for i in range(10):
    i_data = df['text'][head_5_idx[i]]
    print(f'{i+1}. {i_data}')

1. only lovers left alive s tilda swinton talks about almost quitting acting yasmine hamdan performs hal live in nyc huffpo exclusive videos authors yasmine hamdan performs hal also sings film scene two world weary vampires begin heal find way continue living remember power mystery creation
2. harry argy bargy prince charles asked scotland yard in depth report son harry trip argentina reports excessive drinking kidnap plot
3. be top short description
4. cate blanchett set to star as lucille ball in new biopic authors we love news almost much love lucy
5. deep impact space probe aims slam into comet reuters reuters astronomers plan slam an armchair sized impactor comet tempel see what s inside possibly help future scientists determine to keep space rocks colliding earth
6. out v i c t o r y missing tiles missing key piece favorite board game the web s abundance board game sites might help
7. the trouble broadcasting social world authors today marketer re competing customers hearts minds

##### k_means
* 0 -> 3(Sport)
* 1 -> 2(Politics)
* 2 -> 4(Tech)
* 3 -> 0(Business)
* 4 -> 1(Entertainment)
* 5 -> 5(World)

##### k_means_pca(100)
* 0 -> 4(Tech)
* 1 -> 5(World)
* 2 -> 3(Sport)
* 3 -> 2(Politics)
* 4 -> 1(Entertainment)
* 5 -> 0(Business)

##### k_means_pca(200)
* 0 -> 1(Entertainment)
* 1 -> 0(Business)
* 2 -> 2(Politics)
* 3 -> 5(World)
* 4 -> 3(Sport)
* 5 -> 4(Tech)

##### k_means_pca(500)
* 0 -> 4(Tech)
* 1 -> 3(Sport)
* 2 -> 0(Business)
* 3 -> 2(Politics)
* 4 -> 5(World)
* 5 -> 1(Entertainment)

13일 메핑후 제출

#### Data Mapping

In [76]:
mapping_k = {
    0: 3,
    1: 2,
    2: 4,
    3: 0,
    4: 1,
    5: 5
}
mapping_k_pca1 = {
    0: 4,
    1: 5,
    2: 3,
    3: 2,
    4: 1,
    5: 0
}
mapping_k_pca2 = {
    0: 1,
    1: 0,
    2: 2,
    3: 5,
    4: 3,
    5: 4
}
mapping_k_pca5 = {
    0: 4,
    1: 3,
    2: 0,
    3: 2,
    4: 5,
    5: 1
}
df['k_means'] = df['k_means'].apply(lambda x: mapping_k[x])
df['k_means_pca1'] = df['k_means_pca1'].apply(lambda x: mapping_k_pca1[x])
df['k_means_pca2'] = df['k_means_pca2'].apply(lambda x: mapping_k_pca2[x])
df['k_means_pca5'] = df['k_means_pca5'].apply(lambda x: mapping_k_pca5[x])

In [77]:
sample_1 = pd.read_csv('/content/drive/MyDrive/open/sample_submission.csv')

sample_1['category'] = df["k_means"].values
sample_1['category'].head()

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [78]:
sample_2 = pd.read_csv('/content/drive/MyDrive/open/sample_submission.csv')

sample_2['category'] = df["k_means_pca1"].values
sample_2['category'].head()

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [79]:
sample_3 = pd.read_csv('/content/drive/MyDrive/open/sample_submission.csv')

sample_3['category'] = df["k_means_pca2"].values
sample_3['category'].head()

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [80]:
sample_4 = pd.read_csv('/content/drive/MyDrive/open/sample_submission.csv')

sample_4['category'] = df["k_means_pca5"].values
sample_4['category'].head()

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [81]:
sample_1.to_csv('./gte_kmeans_1024d.csv', index=False)
sample_2.to_csv('./gte_kmeans_100d.csv', index=False)
sample_3.to_csv('./gte_kmeans_200d.csv', index=False)
sample_4.to_csv('./gte_kmeans_500d.csv', index=False)