In [1]:
import pandas as pd

In [2]:
# Load the cleaned dataset; the path is relative to the notebook's working directory
data = pd.read_csv(filepath_or_buffer='../data/cleaned_data.csv')

In [3]:
def convert_to_days(time_str):
    """Convert a free-text duration such as "1-2 weeks" into a number of days.

    Every number in the text is extracted (so both ends of a hyphenated range
    like "3-5 days" are picked up), scaled by the unit mentioned in the text,
    and averaged.

    Unit detection is case-insensitive and checks, in this order:
    "week" (x7), "month" (x30, an approximation), "year" (x365, an
    approximation). If none of these appear the numbers are taken to be days.

    Args:
        time_str: Duration description. Non-string values (e.g. NaN/None)
            are accepted and yield None.

    Returns:
        The average duration in days as a float, or None when the input is
        not a string or contains no number.
    """
    import re  # local import so this cell does not rely on another cell's imports

    if not isinstance(time_str, str):
        return None

    text = time_str.lower()

    # Regex extraction instead of whitespace splitting: "1-2" must produce
    # both 1 and 2 rather than being discarded as a non-numeric token.
    times = [float(t) for t in re.findall(r"\d+(?:\.\d+)?|\.\d+", text)]
    if not times:
        return None

    # Days per unit; "week" keeps priority over "month" as before.
    if "week" in text:
        days_per_unit = 7
    elif "month" in text:
        days_per_unit = 30  # Assuming 30 days per month
    elif "year" in text:
        days_per_unit = 365  # Assuming 365 days per year
    else:
        days_per_unit = 1

    # Average when several values are present (e.g. a range), else the single value
    return sum(t * days_per_unit for t in times) / len(times)


# Preprocessing

### Creating Input Output Sentences for BERT

In [4]:
# Each entry: (column holding the shelf-life text, storage location name,
# temperature in °F quoted in the prompt for that location)
relevant_columns_with_temp = [
    ("Pantry_Text", "pantry", 60),
    ("Refrigerate_Text", "refrigerator", 40),
    ("DateOfPurchase_Freeze_Text", "freezer", 0),
    # Add more columns as per requirements
]

# Parallel lists: prompt sentences and their targets expressed in days
input_sentences_with_temp, output_days_with_temp = [], []

for _, row in data.iterrows():
    for col, location, temp in relevant_columns_with_temp:
        # Nothing to learn from a location with no expiration text for this item
        if pd.isna(row[col]):
            continue

        input_sentence = f"I am storing {row['Name']} in my {location} at {temp}°F, how long until it spoils or goes bad?"
        output_day = convert_to_days(row[col])

        # Keep the pair only when the text could be converted to days
        if output_day is None:
            continue
        input_sentences_with_temp.append(input_sentence)
        output_days_with_temp.append(output_day)

# Preview the first five input/output pairs
sample_data_days_with_temp = pd.DataFrame({
    "Input": input_sentences_with_temp,
    "Output (Days)": output_days_with_temp,
}).head(5)
sample_data_days_with_temp


Unnamed: 0,Input,Output (Days)
0,I am storing fruit cocktail in my pantry at 60...,540.0


### Tokenizing

In [6]:
import torch

In [7]:
from transformers import RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# DistilRoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

# Every sentence is padded or truncated to exactly this many tokens;
# 64 keeps computation cheap — adjust as needed
max_length = 64

# Tokenize all prompts in one call, returning PyTorch tensors
input_encodings = tokenizer(
    input_sentences_with_temp,
    return_tensors='pt',
    truncation=True,
    padding='max_length',
    max_length=max_length,
)

# Token ids and the matching attention masks
input_ids, attention_masks = input_encodings['input_ids'], input_encodings['attention_mask']

# Targets (days) as a column tensor: one row per sentence
output_days_tensor = torch.tensor(output_days_with_temp).view(-1, 1)


Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 12.3MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 115MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 480/480 [00:00<00:00, 4.92MB/s]


In [10]:
# Report the shape of each tensor that will be fed to the model
for _label, _tensor in (
    ("Input IDs Shape:", input_ids),
    ("Attention Masks Shape:", attention_masks),
    ("Output Days Tensor Shape:", output_days_tensor),
):
    print(_label, _tensor.shape)

# Round-trip check: turn the first row of token ids back into text
decoded_sentence = tokenizer.decode(input_ids[0], skip_special_tokens=True)

# The decoded text should read the same as the sentence it came from
print("\nOriginal Sentence:", input_sentences_with_temp[0])
print("Decoded Sentence:", decoded_sentence)

Input IDs Shape: torch.Size([1, 64])
Attention Masks Shape: torch.Size([1, 64])
Output Days Tensor Shape: torch.Size([1, 1])

Original Sentence: I am storing fruit cocktail in my pantry at 60°F, how long until it spoils or goes bad?
Decoded Sentence: I am storing fruit cocktail in my pantry at 60°F, how long until it spoils or goes bad?


In [11]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification


In [12]:
# Load configuration and create model.
# The checkpoint here must be the one the tokenizer was loaded from
# ('distilroberta-base'). Building a DistilBERT-uncased model instead would
# pair RoBERTa token ids with an unrelated (and smaller) vocabulary/embedding
# table. Using from_pretrained (rather than constructing the model from the
# config alone) loads the pretrained encoder weights instead of random ones.
# num_labels=1 gives a single-output head, i.e. regression on the day count.
from transformers import AutoConfig, AutoModelForSequenceClassification

config = AutoConfig.from_pretrained("distilroberta-base", num_labels=1)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base", config=config
).to("cuda" if torch.cuda.is_available() else "cpu")

Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 5.18MB/s]


## Data Loading

In [15]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Bundle token ids, attention masks and targets so that indexing the dataset
# yields one (ids, mask, target) triple per example
_example_tensors = (input_ids, attention_masks, output_days_tensor)
dataset = TensorDataset(*_example_tensors)
print(len(dataset))



1


In [None]:
# Compute explicit sizes for training and validation sets (90% / 10%)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Ensure non-zero sizes: with very few examples one side of the split would be empty
if train_size == 0 or val_size == 0:
    raise ValueError("Insufficient data for splitting into training and validation sets.")

# Seeded generator so the train/validation membership and the training
# shuffle order are the same on every Restart & Run All
split_generator = torch.Generator().manual_seed(42)

# Proceed with splitting and data loader creation
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Create DataLoaders; only the training loader shuffles
train_dataloader = DataLoader(
    train_dataset, batch_size=32, shuffle=True, generator=split_generator
)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)