In [1]:
# Colab setup: install the translation client and Hugging Face Transformers.
# NOTE(review): googletrans is pinned but transformers is not — consider
# pinning it as well so a fresh runtime resolves the same versions.
!pip install googletrans==4.0.0-rc1
!pip install transformers

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90

In [2]:
import tensorflow as tf
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm


import pandas as pd
import numpy as np
import random
import time
import datetime

import re
from googletrans import Translator
import platform

In [3]:
# Record the runtime environment (interpreter and operating system) so the
# results below can be tied to a concrete platform.
python_version = platform.python_version()  # Python version string
os_info = platform.platform()               # OS / platform identifier string

print("운영 체제:", os_info)
print("파이썬 버전:", python_version)

운영 체제: Linux-5.15.120+-x86_64-with-glibc2.35
파이썬 버전: 3.10.12


In [4]:
# Report how many CUDA devices are visible, then print each device's name.
n_devices = torch.cuda.device_count()
print(n_devices)

for device_index in range(n_devices):
    device_name = torch.cuda.get_device_name(device_index)
    print(device_name)

1
Tesla T4


## 1. Translate

In [5]:
# Mount Google Drive at /content/drive; the train/test CSVs are read from
# a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load the labelled training split and the unlabelled test split from the
# mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/용인시 SW 해커톤/train.csv'
test_csv_path = '/content/drive/MyDrive/용인시 SW 해커톤/test.csv'

data = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Show (rows, columns) of the training frame as the cell output.
data.shape

(32000, 3)

In [7]:
# Class balance of the target column (number of training rows per sentiment value).
data['sentiment'].value_counts()

2    20095
1     8049
0     3856
Name: sentiment, dtype: int64

In [8]:
# Number of missing values in each training column before preprocessing.
data.isnull().sum()

id           0
text         0
sentiment    0
dtype: int64

In [None]:
# Single googletrans client, reused by translate_to_english for every text.
translator = Translator()

def translate_to_english(text):
    """Translate ``text`` to English, leaving ASCII-only text untouched.

    Args:
        text: The value to translate. Non-string values (for example ``None``
            or a float ``NaN`` coming from a missing cell) are returned
            unchanged.

    Returns:
        ``text`` itself when it is not a string, when it is pure ASCII, or
        when the translation call raises; otherwise the ``.text`` attribute of
        ``translator.translate(text, dest='en')``.
    """
    # Missing cells are not strings and have no ``isascii``; pass them through
    # so later cleaning steps can deal with them instead of crashing here.
    if not isinstance(text, str):
        return text

    # Pure-ASCII text is treated as already English, so no translation call
    # is made for it.
    if text.isascii():
        return text

    try:
        return translator.translate(text, dest='en').text
    except Exception:
        # Best effort: keep the untranslated text rather than abort the whole
        # run. Catching ``Exception`` (not a bare ``except``) still lets
        # KeyboardInterrupt and SystemExit stop a long-running loop.
        return text

# Translate every text of the test frame, then of the training frame.
# The translated column is assigned back as a whole: the previous
# ``frame['text'][i] = ...`` form is chained indexing, which may write to a
# temporary copy instead of the frame and only works while row position and
# index label coincide.
test['text'] = test['text'].map(translate_to_english)
data['text'] = data['text'].map(translate_to_english)


In [None]:
# Training rows with any missing value are dropped.
data = data.dropna()

# Test rows cannot be dropped (each id is written to the submission), so a
# missing text is replaced by the placeholder string 'text'. The column is
# reassigned explicitly: ``inplace=True`` on ``test['text']`` operates on a
# selected column and is not guaranteed to update ``test`` itself.
test['text'] = test['text'].fillna('text')

## 2. Model

In [None]:
# Seed torch so the train/validation split (and other torch-driven
# randomness in this cell) is repeatable across kernel restarts.
SEED = 42
torch.manual_seed(SEED)

texts = data['text'].tolist()
sentiments = data['sentiment'].tolist()


# Initialise the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenise the training texts: truncate to at most 256 tokens, pad to a common
# length and return PyTorch tensors.
encoded_texts = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']

# Map each distinct sentiment value to a class index. The values are sorted
# first so the mapping is deterministic; iterating a bare set gives no
# ordering guarantee, which could silently permute class indices between runs.
label_mapping = {label: i for i, label in enumerate(sorted(set(sentiments)))}
labels = [label_mapping[s] for s in sentiments]

# Convert the labels to a tensor
labels = torch.tensor(labels)

# Split the dataset 90% / 10% into train and validation parts, using a seeded
# generator so the same rows land in each part on every run.
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32)

# Initialise the model with one output per distinct label
num_labels = len(label_mapping)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# Select the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Check: report whether the selected device is a 'Tesla T4'
if "cuda" in device.type:
    current_gpu = torch.cuda.get_device_name(0)
    if current_gpu == 'Tesla T4':
        print("Using Tesla T4 GPU")
    else:
        print(f"Using {current_gpu} GPU instead of Tesla T4")
else:
    print("Using CPU")

model.to(device)


# Optimiser and learning-rate scheduler.
# NOTE(review): AdamW here is the one imported from transformers; newer
# transformers releases steer users to torch.optim.AdamW — confirm against
# the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)
# The "* 3" is the number of epochs and must match EPOCHS in the training loop
# so the linear decay reaches zero exactly at the last optimisation step.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

# Training function
def train(model, iterator, optimizer, scheduler):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        iterator: Sized iterable of ``(input_ids, attention_mask, labels)``
            batches.
        optimizer: Optimiser stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimiser.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for input_ids, attention_mask, labels in tqdm(iterator):
        optimizer.zero_grad()

        # Move the batch to the globally selected device before the forward pass.
        outputs = model(
            input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device),
        )
        batch_loss = outputs.loss

        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

# Validation function
def evaluate(model, iterator):
    """Compute the mean batch loss of ``model`` over ``iterator`` without training.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        iterator: Sized iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    # Gradients are not needed for validation.
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(iterator):
            outputs = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            running_loss += outputs.loss.item()
    return running_loss / len(iterator)

# Train and validate: one training pass followed by one validation pass per epoch.
# NOTE: EPOCHS must stay in step with the "* 3" used for num_training_steps
# where the scheduler is created.
EPOCHS = 3
for epoch in range(EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, scheduler)
    val_loss = evaluate(model, val_dataloader)
    # Epochs are reported 1-based.
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


In [None]:
# Tokenise the test texts with the same settings as the training texts.
test_texts = test['text'].tolist()
encoded_test_texts = tokenizer(test_texts, truncation=True, padding=True, max_length=256, return_tensors='pt')

# return_tensors='pt' already yields tensors, so they are used directly;
# wrapping them in torch.tensor(...) again would only make a redundant copy
# and trigger PyTorch's copy-construct UserWarning.
input_ids_test = encoded_test_texts['input_ids']
attention_masks_test = encoded_test_texts['attention_mask']

# Batch size for inference
batch_size = 32

# Build the test data loader; shuffle=False keeps predictions aligned with the
# row order of ``test``.
test_dataset = TensorDataset(input_ids_test, attention_masks_test)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Prediction function
def predict_sentiment(model, iterator):
    """Return the predicted class index for every example in ``iterator``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        iterator: Iterable of ``(input_ids, attention_mask)`` batches.

    Returns:
        A flat list with one integer per example, in iteration order: the
        index of the largest logit along dimension 1.
    """
    model.eval()
    predictions = []
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(iterator):
            logits = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device),
            ).logits
            predictions += logits.argmax(dim=1).tolist()
    return predictions

# Predict class indices for the test set.
predicted_indices = predict_sentiment(model, test_dataloader)

# The model outputs the indices assigned by label_mapping, not the original
# sentiment values. Invert the mapping so the submission carries the same
# label values as the training data, whatever order the mapping was built in.
index_to_label = {index: label for label, index in label_mapping.items()}
test_predictions = [index_to_label[index] for index in predicted_indices]

# Add the predictions to the dataframe
#test['predicted_sentiment'] = test_predictions

In [None]:
# Assemble the submission: one predicted sentiment per test id.
submission = pd.DataFrame({"id": test["id"], "sentiment": test_predictions})

# Save the result as a CSV file
submission.to_csv('/content/drive/MyDrive/용인시 SW 해커톤/robert2.csv', index=False)

# Preview the predicted column. This has to be the last expression of the
# cell to be rendered; in the middle of the cell it had no effect.
submission['sentiment']