In [None]:
!pip install torch scikit-learn pandas -q

# Import librerie
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
import torch.optim as optim

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

In [None]:
# Read the recipe table and keep only the rows labelled with one of the four supported categories
valid_categories = ['Antipasto', 'Primo', 'Secondo', 'Dessert']
df = pd.read_csv("/kaggle/input/typeofplate/classified_recipes.csv")
df = df.loc[df['Category'].isin(valid_categories)].copy()

# Build one text field per recipe from title, ingredients and directions
# (missing values are replaced by empty strings before concatenation)
df['text'] = (
    df['title'].fillna('')
    + ' ' + df['ingredients'].fillna('')
    + ' ' + df['directions'].fillna('')
)

# Encode the category names as integer class ids; `le` is reused later for the class names
le = LabelEncoder()
le.fit(df['Category'])
df['label'] = le.transform(df['Category'])

# Tokenize every text with gensim's simple_preprocess
df['tokens'] = df['text'].apply(simple_preprocess)

In [None]:
# Train a Word2Vec model on the dataset tokens and compute the mean vector of each text

# Train Word2Vec on the token lists (a pre-trained model could be loaded here instead)
sentences = list(df['tokens'])
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
)

# Mean-of-word-vectors embedding for one tokenized text
def get_sentence_vector(tokens, model, vector_size):
    """Return the average embedding of the tokens known to the model.

    Args:
        tokens: Iterable of token strings.
        model: Object whose `wv` attribute supports `word in model.wv` and
            `model.wv[word]` (here: the trained Word2Vec model).
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of all tokens
        found in `model.wv`, or a float32 zero vector of length `vector_size`
        when `tokens` is empty or contains no known token.
    """
    word_vectors = model.wv
    vectors = [word_vectors[word] for word in tokens if word in word_vectors]
    if not vectors:
        # float32 matches the dtype the features are converted to for PyTorch; a float64
        # fallback would upcast the whole feature array as soon as one text had no known token.
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Read the embedding width from the trained model instead of repeating the literal,
# so it cannot drift from the vector_size passed to Word2Vec
vector_size = w2v_model.wv.vector_size
# One mean vector per recipe, stored in the 'vector' column
df['vector'] = df['tokens'].apply(lambda tokens: get_sentence_vector(tokens, w2v_model, vector_size))


In [None]:
# Hold out 20% of the recipes for evaluation. Stratifying on the label keeps the class
# proportions approximately equal in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].tolist(),
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# PyTorch dataset over precomputed embedding vectors
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing fixed-size feature vectors with integer class labels.

    Args:
        X: Sequence of equal-length numeric vectors (e.g. a list of 1-D NumPy arrays).
        y: Integer class labels, one per vector. May be a list, a NumPy array or a
            pandas Series; a Series is read in positional order, whatever its index.

    Raises:
        ValueError: If X and y do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        # Convert through NumPy first: torch.tensor() reads a non-array argument item by item
        # with integer keys, which a pandas Series with a non 0..n-1 index (as produced by
        # filtering plus a shuffled split) resolves by label and answers with KeyError.
        self.y = torch.tensor(np.asarray(y), dtype=torch.long)
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, got {self.X.shape[0]} and {self.y.shape[0]}"
            )

    def __len__(self):
        """Number of samples."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position idx."""
        return self.X[idx], self.y[idx]


# Wrap both splits as datasets, then batch them in groups of 64;
# only the training loader reshuffles its samples
train_dataset, test_dataset = (
    EmbeddingDataset(X_train, y_train),
    EmbeddingDataset(X_test, y_test),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [None]:
# Linear model
class SimpleLinear(nn.Module):
    """Single fully connected layer mapping an input vector to one score per class.

    Args:
        input_dim: Size of each input vector.
        num_classes: Number of output scores.
    """

    def __init__(self, input_dim, num_classes):
        super(SimpleLinear, self).__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer to x and return its raw output (no activation)."""
        logits = self.fc(x)
        return logits

# Seed PyTorch's global RNG before building the model so the weight initialisation and the
# shuffled training batches (which draw from the same RNG) are repeatable across runs.
# The value matches the random_state used for the train/test split.
torch.manual_seed(42)
model = SimpleLinear(input_dim=vector_size, num_classes=len(le.classes_)).to(device)

# Training setup: cross-entropy loss on the class scores, Adam optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 5
for epoch in range(epochs):
    model.train()  # training mode for this epoch
    total_loss = 0.0
    for features, targets in train_loader:
        # Move the batch to the same device as the model
        features = features.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    # Report the mean of the per-batch losses for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Evaluation on the held-out split
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():  # inference only, no gradients needed
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        # Predicted class = index of the highest score; bring it back to the CPU for scikit-learn
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

# Pass every encoded class id explicitly. Without `labels`, the report infers the classes from
# the data, and target_names no longer matches (ValueError) whenever a class is absent from
# both the test labels and the predictions.
class_ids = list(range(len(le.classes_)))
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=le.classes_))
    