In [10]:
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer

In [3]:
# Read the positive-review CSV (first column is used as the row index) and
# discard the FileName column, keeping the remaining columns.
# NOTE(review): the path is absolute and machine-specific; the cell only runs
# where the file exists at exactly this location.
pos_df = (
    pd.read_csv(r'C:\Users\Roydon\Desktop\it1244-final-project\data\pos.csv', index_col=0)
    .drop(columns=['FileName'])
)
pos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Content  25000 non-null  object
 1   rating   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 585.9+ KB


In [4]:
# Read the negative-review CSV (first column is used as the row index) and
# discard the FileName column, keeping the remaining columns.
# NOTE(review): the path is absolute and machine-specific; the cell only runs
# where the file exists at exactly this location.
neg_df = (
    pd.read_csv(r'C:\Users\Roydon\Desktop\it1244-final-project\data\neg.csv', index_col=0)
    .drop(columns=['FileName'])
)
neg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Content  25000 non-null  object
 1   rating   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 585.9+ KB


In [5]:
# Shift every positive rating down by 2 so the values land in the 5-8 range
# (confirmed by the min/max in the summary printed by this cell).
# NOTE(review): the column is overwritten in place, so re-running this cell
# subtracts a further 2 each time; reload pos_df before re-running.
pos_df['rating'] = pos_df['rating'].sub(2)
pos_df['rating'].describe()

count    25000.00000
mean         6.77064
std          1.15738
min          5.00000
25%          6.00000
50%          7.00000
75%          8.00000
max          8.00000
Name: rating, dtype: float64

In [6]:
# Stack the positive and negative frames row-wise into a single dataset,
# discarding both original indexes in favour of a fresh 0..n-1 index.
data = pd.concat(objs=[pos_df, neg_df], axis=0, ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Content  50000 non-null  object
 1   rating   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [8]:
# Persist the merged dataset to the current working directory. The row index
# is written as the first (unnamed) column, which is the pandas default.
data.to_csv('processed_ratings.csv', index=True)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row of a DataFrame per item.

    Each item is a dict of tensors (`ids`, `mask`, `token_type_ids`,
    `targets`) built from one row's text and target value.

    Args:
        dataframe: Frame holding the text column and the target column.
        tokenizer: Object exposing a Hugging Face style ``encode_plus``
            method that returns ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: Length every encoded sequence is padded/truncated to.
        text_col: Name of the column containing the raw text. Defaults to
            ``'comment_text'`` for backward compatibility; the ``data`` frame
            built earlier in this notebook uses ``'Content'``.
        target_col: Name of the column containing the target(s). Defaults to
            ``'list'`` for backward compatibility; the ``data`` frame built
            earlier in this notebook uses ``'rating'``.

    Raises:
        KeyError: If ``text_col`` or ``target_col`` is not a column of
            ``dataframe``.
    """

    def __init__(self, dataframe, tokenizer, max_len,
                 text_col='comment_text', target_col='list'):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Column names are parameters so the class is not tied to one schema.
        self.comment_text = dataframe[text_col]
        self.targets = dataframe[target_col]
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (one sample per row)."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Tokenize the row at *position* ``index`` and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as long tensors
            and ``targets`` as a float tensor.
        """
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        # Label-based lookup would raise KeyError on a shuffled/split frame.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse all runs of whitespace (including newlines) to one space.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated in transformers; this is the
            # documented replacement. Truncation is requested explicitly so
            # every sample comes back with exactly `max_len` tokens.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }