## Modeling

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

### Modeling LSTM

In [None]:
# Recurrent classifier: an LSTM encoder followed by a sigmoid-activated linear head.
class LSTMModel(nn.Module):
    """LSTM followed by a linear layer and a sigmoid.

    The LSTM is built without ``batch_first``, so with PyTorch's default the
    input is read as ``(seq_len, batch, input_size)``.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values produced by the linear head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear head applied to the last LSTM output."""
        sequence_states, _ = self.lstm(x)
        # Index -1 on the first axis selects the output of the final time step.
        final_step = sequence_states[-1]
        return self.sigmoid(self.fc(final_step))

### Modeling Linear Classifier

In [None]:
# Logistic-regression style classifier: one affine layer plus a sigmoid.
class LinearClassifier(nn.Module):
    """Single linear layer whose output is passed through a sigmoid.

    Args:
        input_size: Number of input features.
        output_size: Number of output values.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Affine map from input_size features to output_size values.
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and squash the result with a sigmoid."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

### Dataset

In [None]:
from torch.utils.data import DataLoader, Dataset

# Define custom Dataset for train and test data
class TextDataset(Dataset):
    """Map-style dataset pairing one feature row with one label.

    Args:
        data: Indexable collection of feature rows (for example a 2-D NumPy
            array). Each row is converted with ``torch.tensor`` on access.
        labels: Labels aligned with ``data`` by position. Objects exposing
            ``.iloc`` (pandas) are read positionally, so a shuffled index
            left over from a train/test split does not matter. Any other
            sequence is indexed directly.

    Raises:
        ValueError: If ``data`` and ``labels`` have different lengths.
    """

    def __init__(self, data, labels):
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length, got {} and {}".format(
                    len(data), len(labels)
                )
            )
        self.data = data
        self.labels = labels

    def __getitem__(self, index):
        """Return ``(features, label)`` for the sample at position ``index``."""
        x = torch.tensor(self.data[index])  # convert the feature row to a tensor
        labels = self.labels
        # Use positional access for pandas objects; plain sequences are indexed as-is.
        y = labels.iloc[index] if hasattr(labels, "iloc") else labels[index]
        return x, y

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [None]:
from sklearn.model_selection import train_test_split
import ast

### Sentence Vector - Simple Averaging

In [None]:
# Read the ratings CSV and parse every 'tokens' cell with ast.literal_eval
# (presumably each cell is a stringified list of tokens -- confirm against the CSV).
ratings_csv = '/home/kyuyeon/문서/kmu/23-1/bigdata-latest/movie/data/ratings-mec.csv'
df = pd.read_csv(ratings_csv)
sentences = df['tokens'].apply(ast.literal_eval)

# Hold out 20% of the rows as the test set with a fixed seed.
# No separate validation split is created.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences,
    df['label'],
    test_size=0.2,
    random_state=42,
)

print(f'size of train: {len(train_sentences)}')
print(f'size of test: {len(test_sentences)}')

size of train: 160000
size of test: 40000


In [None]:
# Convert tokenized sentences to word embeddings vectors
def sentence2vec(model, sentences):
    """Embed each tokenized sentence as the mean of its in-vocabulary word vectors.

    Args:
        model: Object exposing ``wv`` with ``key_to_index``, item lookup by
            token and ``vector_size`` (for example a trained gensim
            Word2Vec or FastText model).
        sentences: Iterable of token sequences.

    Returns:
        np.ndarray of shape ``(n_sentences, vector_size)``. A sentence with
        no in-vocabulary token maps to a zero vector. Empty input yields an
        array of shape ``(0, vector_size)``.
    """
    keyed_vectors = model.wv
    vocab = keyed_vectors.key_to_index
    # Take the dimensionality from the model instead of hard-coding 100, so
    # the zero fallback always matches the shape of real sentence vectors.
    dim = keyed_vectors.vector_size

    vectors = []
    for sentence in sentences:
        word_vectors = [keyed_vectors[word] for word in sentence if word in vocab]
        if word_vectors:
            # Average the word vectors to get the sentence vector.
            sentence_vector = np.mean(np.array(word_vectors), axis=0)
        else:
            # Float zeros: an integer zero vector would upcast the whole
            # result matrix to float64 when stacked with float32 rows.
            sentence_vector = np.zeros(dim, dtype=np.float32)
        vectors.append(sentence_vector)

    if not vectors:
        return np.zeros((0, dim), dtype=np.float32)
    return np.array(vectors)

### Word Embedding
1. Word2Vec (CBOW)
2. Word2Vec (Skip-gram)
3. FastText

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized sentences (sg is left at its default).
# Passing the corpus to the constructor already builds the vocabulary and
# trains the model, so the epoch count is set here. The previous extra
# train(..., epochs=10) call trained a second time on top of that run.
# NOTE(review): the embeddings are fit on all sentences, including those
# later used for testing -- confirm this is intended.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)

train_vectors = sentence2vec(word2vec_model, train_sentences)
test_vectors = sentence2vec(word2vec_model, test_sentences)
print(train_vectors.shape) # (160000, 100)
print(train_labels.shape) # (160000,)

# Create DataLoader for train and test data
# NOTE: the FastText cell assigns the same variable names, so whichever
# embedding cell ran last is the one feeding the classifiers below.
train_dataset = TextDataset(train_vectors, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=False)

test_dataset = TextDataset(test_vectors, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=False)

(160000, 100)
(160000,)


In [None]:
from gensim.models import FastText

# Train FastText on the tokenized sentences.
# Passing the corpus to the constructor already builds the vocabulary and
# trains the model, so the epoch count is set here. The previous extra
# train(..., epochs=10) call trained a second time on top of that run.
# NOTE(review): the embeddings are fit on all sentences, including those
# later used for testing -- confirm this is intended.
fasttext_model = FastText(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)

train_vectors = sentence2vec(fasttext_model, train_sentences)
test_vectors = sentence2vec(fasttext_model, test_sentences)
print(train_vectors.shape) # (160000, 100)
print(train_labels.shape) # (160000,)

# Create DataLoader for train and test data
# NOTE: this reassigns the names also set by the Word2Vec cell, so the
# classifiers below use whichever embedding cell ran last.
train_dataset = TextDataset(train_vectors, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=False)

test_dataset = TextDataset(test_vectors, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=False)

(160000, 100)
(160000,)


### Logistic Train, Test

In [None]:
# Create an instance of the linear classifier model
input_size = train_vectors.shape[1]
output_size = 1  # single sigmoid output for binary classification (1 or 0)
model = LinearClassifier(input_size, output_size)

# Binary cross-entropy on the sigmoid outputs, optimized with plain SGD.
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
# A plain range is used: tqdm was referenced here but is not imported in any
# visible cell, which raises NameError on a fresh kernel.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for x, y in train_loader:
        x = x.float()  # cast the inputs to float32
        targets = y.unsqueeze(1).float()  # (batch,) -> (batch, 1) to match the model output
        optimizer.zero_grad()
        predictions = model(x)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Log every epoch with the real epoch total. The old guard
    # (epoch % 1000 == 0) and the hard-coded 10000 belonged to a different
    # loop length and printed only once.
    mean_loss = running_loss / max(len(train_loader), 1)
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, mean_loss))

# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for x, y in test_loader:
        x = x.float()  # cast the inputs to float32
        predictions = model(x)
        predicted_labels = (predictions > 0.5).float()  # threshold probabilities at 0.5
        total += y.size(0)
        correct += (predicted_labels == y.unsqueeze(1).float()).sum().item()
    accuracy = (correct / total) * 100
    print('Test Accuracy: {:.2f}%'.format(accuracy))

### LSTM Train, Test

In [None]:
# Each component of a sentence vector is fed to the LSTM as one time step
# carrying a single feature, hence input_size = 1.
input_size = 1
hidden_size = 128  # number of LSTM hidden units
output_size = 1  # single output unit (binary classification)

lstm_model = LSTMModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm_model.to(device)

num_epochs = 10
for epoch in range(num_epochs):
    lstm_model.train()
    for inputs, labels in train_loader:
        # The DataLoader already yields tensors. Move and cast them with .to()
        # rather than torch.tensor(), which copy-constructs and emits a UserWarning.
        inputs = inputs.to(device=device, dtype=torch.float32)
        labels = labels.to(device=device, dtype=torch.float32)

        optimizer.zero_grad()
        # (batch, features) -> (features, batch, 1). Sizes come from the
        # tensor itself, so a batch that is not exactly 32 x 100 also works.
        inputs = inputs.unsqueeze(-1).permute(1, 0, 2)
        outputs = lstm_model(inputs)
        # Flatten both sides to (batch,) so a batch of size 1 keeps a dimension.
        loss = criterion(outputs.reshape(-1), labels.reshape(-1))
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch.
    print('Epoch [{}/{}], Batch Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))


  inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
  labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1).to(device)


Epoch [1/10], Batch Loss: 0.6412
Epoch [2/10], Batch Loss: 0.5951
Epoch [3/10], Batch Loss: 0.6103
Epoch [4/10], Batch Loss: 0.4543
Epoch [5/10], Batch Loss: 0.7007
Epoch [6/10], Batch Loss: 0.5084
Epoch [7/10], Batch Loss: 0.4986
Epoch [8/10], Batch Loss: 0.5381
Epoch [9/10], Batch Loss: 0.5535
Epoch [10/10], Batch Loss: 0.7661


In [None]:
lstm_model.eval()  # switch the model to evaluation mode
test_loss = 0.0  # running sum of per-sample losses
correct = 0  # number of correct predictions
total = 0  # number of evaluated samples

with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in test_loader:
        # The DataLoader already yields tensors. Move and cast them with .to()
        # rather than torch.tensor(), which copy-constructs and emits a UserWarning.
        inputs = inputs.to(device=device, dtype=torch.float32)
        labels = labels.to(device=device, dtype=torch.float32).reshape(-1)
        batch_size = labels.numel()

        # (batch, features) -> (features, batch, 1). Sizes come from the
        # tensor itself, so a partial final batch is handled correctly.
        inputs = inputs.unsqueeze(-1).permute(1, 0, 2)
        outputs = lstm_model(inputs).reshape(-1)

        # BCELoss returns the batch mean. Weight it by the batch size so the
        # final average is per sample even when batches differ in size.
        test_loss += criterion(outputs, labels).item() * batch_size

        predicted = torch.round(outputs)  # probabilities -> 0 or 1
        correct += (predicted == labels).sum().item()
        total += batch_size

# Divide by the real number of samples. len(test_loader) * 32 overcounts
# whenever the last batch holds fewer than 32 samples.
test_loss /= total
accuracy = correct / total

print('Test Loss: {:.4f}, Accuracy: {:.2%}'.format(test_loss, accuracy))

  inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
  labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1).to(device)


Test Loss: 0.5258, Accuracy: 72.67%
