In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
import string
import re
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Fixed seed so that a Restart & Run All reproduces the same results.
# To explore a different run, replace the constant with
# random.randint(0, 2 ** 32 - 1) and record the value printed below.
seed = 1985442815
# Seed every random number generator this notebook imports; seeding only the
# stdlib generator would leave NumPy and PyTorch non-deterministic.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Will use {device_name} for training with seed: {seed}")
# Model parameters
# Sequence length handed to the tokenizer when encoding each text.
max_length = 128
# Label vocabularies (label string -> integer id); start empty here and are
# filled in by the data-loading cell in order of first appearance.
dragon_to_number = {}
sentiment_to_number = {}

In [None]:
# Load the pretrained GPT-Neo 125M tokenizer, pad on the right-hand side, and
# reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Parse the training file. Every non-empty line has the form
#   "<dragon> <sentiment> <free text ...>"
# with fields separated by single spaces; the text may itself contain spaces.
data = []
with open("./sentiment_training_data.txt", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        # Split off only the two label fields; the remainder is the text.
        fields = line.split(" ", 2)
        if len(fields) < 2:
            raise ValueError(
                f"Line {line_number} of the training file has no sentiment field: {line!r}"
            )
        dragon, sentiment = fields[0], fields[1]
        text = fields[2] if len(fields) > 2 else ""
        # Assign integer ids in order of first appearance. The default is
        # evaluated before insertion, so a new label gets the next free id.
        dragon_to_number.setdefault(dragon, len(dragon_to_number))
        sentiment_to_number.setdefault(sentiment, len(sentiment_to_number))
        data.append({
            'dragon': dragon,
            'sentiment': sentiment,
            'text': text
        })

# Replace each raw text with its token ids. truncation=True is required
# alongside padding='max_length': without it, texts longer than max_length
# keep their full length and the rows can no longer be stacked into a
# rectangular array.
for d in data:
    d['text'] = tokenizer.encode(
        d['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Pin int64 so the tensor dtype does not depend on NumPy's platform-default
# integer width.
text_tensor = torch.from_numpy(np.array([d['text'] for d in data], dtype=np.int64))
dragon_tensor = torch.from_numpy(
    np.array([dragon_to_number[d['dragon']] for d in data], dtype=np.int64)
)
sentiment_tensor = torch.from_numpy(
    np.array([sentiment_to_number[d['sentiment']] for d in data], dtype=np.int64)
)
print(text_tensor.shape, dragon_tensor.shape, sentiment_tensor.shape)

In [None]:
# Bundle the three tensors into one dataset; indexing it returns a tuple with
# the matching row of text_tensor, dragon_tensor and sentiment_tensor.
train_data = TensorDataset(text_tensor, dragon_tensor, sentiment_tensor)

In [None]:
# Sanity check: show the first example of the dataset, if there is one.
if len(train_data) > 0:
    t = train_data[0]
    print(t)