In [3]:
!pip install torch==2.2.2 torchtext==0.17.2 --force-reinstall

Collecting torch==2.2.2
  Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchtext==0.17.2
  Downloading torchtext-0.17.2-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting filelock (from torch==2.2.2)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.2.2)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sympy (from torch==2.2.2)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch==2.2.2)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch==2.2.2)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch==2.2.2)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata 

In [1]:
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sentiment-analysis-company-reviews/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sentiment-analysis-company-reviews/test.csv')



In [2]:
import re

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]","",text)
    return text

In [3]:
train_df['clean_review'] = train_df['Review'].apply(clean_text)
test_df['clean_review'] = test_df['Review'].apply(clean_text)

In [4]:
def tokenizer(text):
    return text.split()

tokenized_phrases = [tokenizer(text) for text in train_df['clean_review']]

print(tokenized_phrases[0])

['very', 'good', 'value', 'and', 'a', 'great', 'tv', 'very', 'happy', 'and', 'delivery', 'next', 'day', 'and', 'free', 'well', 'done', 'hughes']


In [5]:
from torchtext.vocab import build_vocab_from_iterator



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

In [6]:
vocab = build_vocab_from_iterator(tokenized_phrases,specials=["<pad>","<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [7]:
def text_pipeline(text):
    return vocab(tokenizer(text))

train_df['token_ids'] = train_df['clean_review'].apply(text_pipeline)
test_df['token_ids'] = test_df['clean_review'].apply(text_pipeline)

In [8]:
train_df['Rating'] = train_df['Rating'].astype(float)

In [9]:
from torch.utils.data import Dataset
import torch
class RegressionDataset(Dataset):
    def __init__(self,phrases,ratings):
        self.phrases = phrases
        self.ratings = ratings
    def __len__(self):
        return len(self.ratings)
    def __getitem__(self,idx):
        return torch.tensor(self.phrases[idx]),torch.tensor(self.ratings[idx],dtype=torch.float)


In [10]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    text_list,label_list = [],[]
    for text,label in batch:
        text_list.append(text)
        label_list.append(label)
    padded_texts = pad_sequence(text_list,batch_first=True,padding_value=vocab["<pad>"])
    labels = torch.tensor(label_list,dtype=torch.float)
    return padded_texts,labels

In [11]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

train_texts,val_texts,train_ratings,val_ratings = train_test_split(
    train_df['token_ids'].tolist(),
    train_df['Rating'].tolist(),
    test_size=0.2,
    random_state = 42
)

train_dataset = RegressionDataset(train_texts,train_ratings)
val_dataset = RegressionDataset(val_texts,val_ratings)

batch_size = 64

train_dataloader = DataLoader(train_dataset,batch_size = batch_size,shuffle=True,collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset,batch_size= batch_size,shuffle=False,collate_fn=collate_batch)

In [12]:
import torch.nn as nn
class LSTMRegressor(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,batch_first=True)
        self.fc = nn.Linear(hidden_dim,1)

    def forward(self,x):
        embedded = self.embedding(x)
        outputs,(hidden, _) = self.lstm(embedded)
        return self.fc(hidden[-1]).squeeze(1)

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
padding_idx = vocab['<pad>']

model = LSTMRegressor(vocab_size,embedding_dim,hidden_dim,padding_idx).to(device)

In [15]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 0.01)

In [19]:
def train(model,dataloader,criterion,optimizer):
    model.train()
    total_loss = 0

    for batch in dataloader:
        inputs,labels = batch
        inputs,labels = inputs.to(device),labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Train Loss: {avg_loss:.4f}")

In [20]:
def evaluate(model,dataloader,criterion):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs,labels = batch
            inputs,labels = inputs.to(device),labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs,labels)
            total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Val Loss: {avg_loss:.4f}")

In [21]:
for epoch in range(5):
    print(f"\nEpoch {epoch+1}/{5}")
    train(model,train_dataloader,criterion,optimizer)
    evaluate(model,val_dataloader,criterion)


Epoch 1/5
Train Loss: 0.2608
Val Loss: 0.4548

Epoch 2/5
Train Loss: 0.2515
Val Loss: 0.4609

Epoch 3/5
Train Loss: 0.2417
Val Loss: 0.4452

Epoch 4/5
Train Loss: 0.2567
Val Loss: 0.4559

Epoch 5/5
Train Loss: 0.2323
Val Loss: 0.4492


In [23]:
class TestDataset(Dataset):
    def __init__(self,token_ids):
        self.token_ids = token_ids

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self,idx):
        return torch.tensor(self.token_ids[idx])

In [24]:
def collate_test_batch(batch):
    padded = pad_sequence(batch,batch_first=True,padding_value=vocab['<pad>'])
    return padded

In [25]:
test_dataset = TestDataset(test_df['token_ids'].tolist())
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,collate_fn=collate_test_batch)

In [28]:
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        inputs = batch.to(device)
        outputs = model(inputs)
        preds = outputs.cpu().tolist()
        predictions.extend(preds)

In [38]:
submission = pd.DataFrame({'Id': range(len(predictions)),'Rating':predictions})
submission.to_csv("submission.csv",index=False)

In [34]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"leejaegwa","key":"f716fb458e6aeb4fd613c5e2812cca5f"}'}

In [35]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [39]:
!kaggle competitions submit -c sentiment-analysis-company-reviews -f submission.csv -m "My first regression model"

100% 937k/937k [00:01<00:00, 928kB/s]
Successfully submitted to Sentiment Analysis - Company Reviews

In [48]:
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "Rating": predictions  # 혹은 Rating, Sentiment 등 대회에 맞는 열 이름
})
submission.to_csv("submission.csv", index=False)

In [49]:
!kaggle competitions submit -c sentiment-analysis-company-reviews -f submission.csv -m "Fixed ID issue"

100% 948k/948k [00:01<00:00, 929kB/s] 
Successfully submitted to Sentiment Analysis - Company Reviews