In [31]:
# imports
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import re

In [32]:
# load data set
# Read the CSV without treating any row as a header, then name the columns.
df = pd.read_csv("news.csv", header=None, encoding="latin-1")
df.columns = ["label", "text"]
# Sanity check: first rows, then the column index, each on its own line.
print(df.head(), df.columns, sep="\n")

      label                                               text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...
Index(['label', 'text'], dtype='object')


In [33]:
# clean text
def clean_text(text):
    """Normalise a raw text string for tokenisation.

    Steps, in order: lowercase, drop URLs, drop @mentions (the whole handle)
    and '#' signs (the hashtag word itself is kept), then drop every
    character that is not an ASCII letter or whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans may
        leave consecutive spaces behind.
    """
    text = text.lower()  # convert string to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove links
    # The previous pattern r'\@w+' matched '@' followed by literal 'w'
    # characters, so '@user' survived and 'user' leaked into the text once the
    # '@' was stripped below. '@\w+' removes the whole handle.
    text = re.sub(r'@\w+|#', '', text)  # remove mentions and '#' signs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove numbers and punctuation
    return text
# Cast the text column to str and run every entry through clean_text.
df["text"] = df["text"].astype(str).map(clean_text)

# Replace the string labels with the integer codes produced by LabelEncoder;
# the fitted encoder is kept so the class names can be looked up afterwards.
encoder = LabelEncoder()
df["label"] = encoder.fit_transform(df["label"])

print("Label classes:", encoder.classes_)

Label classes: ['negative' 'neutral' 'positive']


In [34]:
# train/test split
# Hold out 20% of the rows for testing; random_state is fixed at 42.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"],
    df["label"],
    random_state=42,
    test_size=0.2,
)
# here we're splitting the data so a model can be trained to predict which label each text goes under

In [35]:
# create data set class
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from collections import Counter


# splits text at any whitespace
def tokenizer(text):
    """Return the whitespace-separated pieces of ``text`` as a list."""
    tokens = text.split()
    return tokens

# goes through all of the texts and yields the list of tokens for each one
def yield_tokens(data_iter):
    """Lazily produce one token list (via ``tokenizer``) per text in ``data_iter``."""
    yield from map(tokenizer, data_iter)


def build_vocab(texts, min_freq=1):
    """Map every sufficiently frequent token in ``texts`` to an integer id.

    Args:
        texts: Iterable of strings; each one is split with ``tokenizer``.
        min_freq: Minimum number of occurrences a token needs in order to
            receive its own id. Rarer tokens are left out of the mapping.

    Returns:
        dict of token -> id. ``"<pad>"`` is 0, ``"<unk>"`` is 1, and the kept
        tokens take consecutive ids starting at 2 in first-seen order, so the
        ids are exactly ``0 .. len(vocab) - 1``.
    """
    counter = Counter()
    for text in texts:
        counter.update(tokenizer(text))

    # Reserve 0 for PAD and 1 for UNK.
    vocab = {"<pad>": 0, "<unk>": 1}
    # Number tokens only AFTER the frequency filter. Enumerating the
    # unfiltered counter left gaps in the ids whenever min_freq > 1, so the
    # largest id could exceed len(vocab) and overflow a table sized from it.
    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab

# Build the vocabulary from the training texts only, keeping tokens seen at least twice.
vocab_dict = build_vocab(texts=train_texts, min_freq=2)
print("Vocab size:", len(vocab_dict))

def numericalize(text, vocab):
    """Convert ``text`` into a list of integer ids using ``vocab``.

    The text is split with ``tokenizer``; any token missing from ``vocab`` is
    replaced by the id stored under ``"<unk>"``.
    """
    ids = []
    for token in tokenizer(text):
        ids.append(vocab.get(token, vocab["<unk>"]))
    return ids



Vocab size: 4039


In [36]:


class NewsDataset(Dataset):
    """Dataset that pairs each text with its label and encodes the text with a vocabulary.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    must provide that accessor (pandas Series do).
    """

    def __init__(self, texts, labels, vocab):
        self.vocab = vocab  # token -> id mapping passed on to numericalize
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Return the number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for position ``idx``, both as long tensors."""
        raw_text = self.texts.iloc[idx]
        raw_label = self.labels.iloc[idx]
        token_ids = numericalize(raw_text, self.vocab)
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(raw_label, dtype=torch.long),
        )
    
# make data into dataset object
# Both splits are wrapped with the same vocabulary.
train_dataset = NewsDataset(texts=train_texts, labels=train_labels, vocab=vocab_dict)
test_dataset = NewsDataset(texts=test_texts, labels=test_labels, vocab=vocab_dict)

def collate_batch(batch):
    """Combine ``(token_ids, label)`` samples into one padded batch.

    Returns:
        ``(padded, targets)``: ``padded`` has one row per sample, with shorter
        sequences right-padded with 0 up to the longest one in the batch;
        ``targets`` is a long tensor holding the labels in the same order.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets, dtype=torch.long)

# train on 32 headlines per batch
# Only the training loader asks for shuffling (reshuffled every epoch);
# both loaders pad their batches through collate_batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    collate_fn=collate_batch,
    shuffle=True,
)
test_loader = DataLoader(test_dataset, collate_fn=collate_batch, batch_size=32)

print("Train batches:", len(train_loader))
print("Test batches:", len(test_loader))





Train batches: 122
Test batches: 31
