<a href="https://colab.research.google.com/github/malakkkamrr/Natural-Language-Processing-Project/blob/main/ms3_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertConfig
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import pandas as pd

In [None]:
import gc

def report_gpu():
    """Print the current GPU's process list, then release cached memory.

    When CUDA is available, prints the string returned by
    ``torch.cuda.list_gpu_processes()``; otherwise prints a short notice
    instead of touching the CUDA runtime. In both cases a Python garbage
    collection is run, and on CUDA machines the PyTorch caching allocator
    is emptied afterwards.

    Returns:
        None. The report is written to stdout.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        print(torch.cuda.list_gpu_processes())
    else:
        print('CUDA is not available; skipping GPU process report.')

    # Collect Python garbage first so tensors that are no longer referenced
    # are released before the allocator cache is emptied.
    gc.collect()
    if cuda_ready:
        torch.cuda.empty_cache()

report_gpu()


GPU:0
no processes are running


In [None]:
class ProductDataset(Dataset):
    """Map-style dataset that pairs tokenised product text with its price.

    The text fed to the tokenizer for each example is the brand, a single
    space, and the description.
    """

    def __init__(self, descriptions, brands, prices, tokenizer, max_length):
        """Keep references to the parallel sequences and tokenizer settings.

        Args:
            descriptions: Indexable sequence of product descriptions.
            brands: Indexable sequence of brand names, aligned with
                ``descriptions``.
            prices: Indexable sequence of numeric targets, aligned with
                ``descriptions``.
            tokenizer: Callable invoked as ``tokenizer(text, **kwargs)`` that
                returns a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_length: Value forwarded to the tokenizer as ``max_length``.
        """
        self.descriptions = descriptions
        self.brands = brands
        self.prices = prices
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (length of ``descriptions``)."""
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Tokenise example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` flattened to
            1-D (presumably ``max_length`` long given the padding mode --
            depends on the tokenizer), and ``'price'`` as a float tensor.
        """
        product_description = self.descriptions[idx]
        product_brand = self.brands[idx]
        target = self.prices[idx]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        encoded = self.tokenizer(f"{product_brand} {product_description}", **tokenizer_kwargs)

        # The tokenizer output carries a leading batch axis; collapse it so
        # the DataLoader can stack examples itself.
        sample = {}
        sample['input_ids'] = encoded['input_ids'].reshape(-1)
        sample['attention_mask'] = encoded['attention_mask'].reshape(-1)
        sample['price'] = torch.tensor(target, dtype=torch.float)
        return sample


In [None]:
class BertForRegression(torch.nn.Module):
    """BERT encoder followed by dropout and a single-output linear head.

    Note: the encoder is created with ``BertModel(config)``, i.e. from the
    configuration alone. Per the Transformers documentation this does not
    load pretrained weights; callers that want them must load them
    separately.
    """

    def __init__(self, config):
        """Build the encoder and regression head.

        Args:
            config: Configuration object passed to ``BertModel``; its
                ``hidden_dropout_prob`` and ``hidden_size`` attributes size
                the dropout and linear layers.
        """
        super().__init__()
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(p=config.hidden_dropout_prob)
        self.linear = torch.nn.Linear(in_features=config.hidden_size, out_features=1)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per input sequence.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Output of the linear head applied to the (dropped-out) pooled
            encoder output; its last dimension has size 1.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoder_out.pooler_output)
        return self.linear(regularised)


In [None]:
# Single source of truth for the checkpoint used by tokenizer, config and encoder.
PRETRAINED_MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
config = BertConfig.from_pretrained(PRETRAINED_MODEL_NAME)
model = BertForRegression(config)

# BertForRegression builds its encoder with BertModel(config), which only
# creates the architecture with freshly initialised weights. Swap in the
# pretrained encoder so fine-tuning starts from the published checkpoint
# rather than from random parameters. This must happen before the optimizer
# is created so that it tracks the pretrained parameters.
model.bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME, config=config)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Training hyperparameters, gathered in one cell so they are easy to tune.

# Handed to ProductDataset, which forwards it to the tokenizer as `max_length`
# (inputs are truncated / padded to this many tokens).
MAX_LENGTH = 300
# Number of examples per batch for the train, validation and test DataLoaders.
BATCH_SIZE = 16
# Number of full passes over the training loader.
EPOCHS = 5
# Learning rate passed to the AdamW optimizer.
LEARNING_RATE = 2e-5


In [None]:
# Location of the product CSV (Kaggle input mount).
file_path = "/kaggle/input/amazon-dataset/final_dataset (2) (2).csv"
df = pd.read_csv(file_path)

# Hold out 20% of the rows, then cut that hold-out in half: the result is an
# 80/10/10 train/validation/test split with a fixed seed for repeatability.
train_data, val_test_data = train_test_split(df, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

# Columns consumed by the model, in (text, text, target) order.
_MODEL_COLUMNS = ('description', 'brand', 'price/value')

# Pull each column out as a plain Python list so the Dataset can index by
# position regardless of the shuffled DataFrame index.
train_descriptions, train_brands, train_prices = (
    train_data[column].tolist() for column in _MODEL_COLUMNS
)
val_descriptions, val_brands, val_prices = (
    val_data[column].tolist() for column in _MODEL_COLUMNS
)
test_descriptions, test_brands, test_prices = (
    test_data[column].tolist() for column in _MODEL_COLUMNS
)


In [None]:
# One dataset per split; all share the same tokenizer and token budget.
train_dataset = ProductDataset(
    descriptions=train_descriptions,
    brands=train_brands,
    prices=train_prices,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
val_dataset = ProductDataset(
    descriptions=val_descriptions,
    brands=val_brands,
    prices=val_prices,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_dataset = ProductDataset(
    descriptions=test_descriptions,
    brands=test_brands,
    prices=test_prices,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Only the training loader shuffles; validation and test keep dataset order so
# predictions line up with the original price lists.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Objective: mean absolute error between predicted and true prices.
criterion = torch.nn.L1Loss(reduction='mean')
# AdamW over every parameter of the model (encoder and regression head).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Batches are moved to whatever device the model currently lives on, so this
# loop runs unchanged on CPU and keeps working if the model has been placed
# on a GPU (previously the CPU batches would raise a device-mismatch error).
device = next(model.parameters()).device

for epoch in range(EPOCHS):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The model emits one value per row in a trailing axis of size 1;
        # give the targets the same trailing axis so the loss does not broadcast.
        labels = batch['price'].unsqueeze(1).to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Reported value is the mean of the per-batch losses.
    print(f'Training Loss: {total_loss / len(train_loader)}')

    # ---- validation pass: no gradients, dropout disabled ----
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['price'].unsqueeze(1).to(device)
            outputs = model(input_ids, attention_mask)
            val_loss = criterion(outputs, labels)
            val_losses.append(val_loss.item())
    print(f'Validation Loss: {np.mean(val_losses)}')


Epoch 1: 100%|██████████| 25/25 [06:37<00:00, 15.90s/it]


Training Loss: 29.993340301513673
Validation Loss: 21.127305030822754


Epoch 2: 100%|██████████| 25/25 [06:26<00:00, 15.48s/it]


Training Loss: 29.32728527069092
Validation Loss: 20.626073837280273


Epoch 3: 100%|██████████| 25/25 [06:26<00:00, 15.44s/it]


Training Loss: 28.851836318969728
Validation Loss: 20.22636890411377


Epoch 4: 100%|██████████| 25/25 [06:29<00:00, 15.57s/it]


Training Loss: 28.487459297180177
Validation Loss: 19.919114589691162


Epoch 5: 100%|██████████| 25/25 [06:30<00:00, 15.63s/it]


Training Loss: 28.183648376464845
Validation Loss: 19.67151379585266


In [None]:
# Evaluate on the held-out test split: collect per-batch losses and every
# prediction, then report both the training criterion and the MSE.
model.eval()
# Follow the model's device so this cell works on CPU and on GPU.
device = next(model.parameters()).device
test_losses = []
predicted_prices = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['price'].unsqueeze(1).to(device)
        outputs = model(input_ids, attention_mask)
        test_loss = criterion(outputs, labels)
        test_losses.append(test_loss.item())
        # .cpu() is required before .numpy() when the tensor lives on a GPU;
        # it is a no-op for CPU tensors.
        predicted_prices.extend(outputs.cpu().numpy().flatten())

# test_loader does not shuffle, so predictions are in the same order as test_prices.
mse = mean_squared_error(test_prices, predicted_prices)
print(f'Test Mean Squared Error: {mse}')
# Mean of the per-batch criterion values, comparable to the validation loss
# printed during training (previously collected but never reported).
print(f'Test Loss: {np.mean(test_losses)}')

Test Mean Squared Error: 3668.8944933349335


In [None]:
# Predict the price of one hand-written example, using the same
# "<brand> <description>" text format the dataset uses.
input_description = "USB Docking Station, JESWO USB 3.0 Laptop Docking Station Dual Monitor (Dual Video HDMI & VGA, Gigabit Ethernet, Audio, and More USB Ports)-Grey"
input_brand = "JESWO"
input_text = f"{input_brand} {input_description}"

input_encoding = tokenizer(input_text,
                           truncation=True,
                           padding='max_length',
                           max_length=MAX_LENGTH,
                           return_tensors='pt')
# Put the inputs on the model's device so this cell works on CPU and on GPU.
device = next(model.parameters()).device
input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)

# Switch to eval mode explicitly rather than relying on an earlier cell having
# done so; otherwise dropout would stay active and make the prediction noisy.
model.eval()
with torch.no_grad():
    output_price = model(input_ids, attention_mask)

# The model returns a single-element tensor for a single input; unwrap it.
predicted_price = output_price.item()

print("Predicted Price:", predicted_price)

Predicted Price: 14.820196151733398
