# In [None]:
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import joblib

# Helper function to normalize text
def normalize_text(text):
    """Return a lower-cased, trimmed copy of ``text`` without ``_``, ``?`` or ``\\``.

    Strings are lower-cased, stripped of surrounding whitespace, and then
    have every underscore, question mark and backslash removed.  ``None``
    becomes the empty string.  Any other non-string value is returned as
    ``str(value)`` without further normalization.
    """
    if isinstance(text, str):
        trimmed = text.lower().strip()
        # One translate pass deletes the same three characters the chained
        # replace calls would remove.
        return trimmed.translate(str.maketrans("", "", "_?\\"))
    if text is None:
        return ""
    return str(text)

# Select the compute device: CUDA when one is visible, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Read the training sheet, drop every row that has a missing value, then
# normalize the product-name and brand columns
data_clean = pd.read_excel("/content/mat_na_0510_AI_AT_clean_train.xlsx")
data_clean = data_clean.dropna()
data_clean['brand_clean'] = data_clean['brand_clean'].apply(normalize_text)
data_clean['product_name'] = data_clean['product_name'].apply(normalize_text)

# Map each normalized brand string to an integer class id
label_encoder = LabelEncoder()
y_encoded = torch.tensor(
    label_encoder.fit_transform(data_clean['brand_clean']),
    dtype=torch.long,
)
num_labels = len(label_encoder.classes_)

# Tokenizer and encoder come from the same checkpoint.
# NOTE: despite its name, `phobert_model` holds the e5-large-v2 encoder.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')
phobert_model = AutoModel.from_pretrained('intfloat/e5-large-v2')
phobert_model = phobert_model.to(device)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    data_clean['product_name'],
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Function to encode text data
def encode_texts(texts, tokenizer):
    """Tokenize ``texts`` and return ``(input_ids, attention_mask)`` on ``device``.

    The whole list is tokenized in one call, padded to the longest entry
    and truncated at 256 tokens.  Both returned tensors are moved to the
    module-level ``device``.
    """
    batch = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    return input_ids, attention_mask

# Tokenize both splits up front; the resulting tensors already live on `device`
X_train_ids, X_train_mask = encode_texts(texts=X_train.tolist(), tokenizer=tokenizer)
X_test_ids, X_test_mask = encode_texts(texts=X_test.tolist(), tokenizer=tokenizer)

# Custom classifier model definition
class CustomClassifier(nn.Module):
    """Encoder followed by a three-layer MLP classification head.

    The head consumes the encoder's ``pooler_output`` and maps it through
    ``hidden -> 512 -> 128 -> num_labels`` with ReLU activations and dropout
    (p=0.1) before the first and second linear layers.

    Args:
        phobert_model: Encoder module.  It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``pooler_output``.  The name is historical; any encoder
            with that interface works.
        num_labels: Number of output classes (size of the logits vector).
    """

    def __init__(self, phobert_model, num_labels):
        super().__init__()
        self.phobert = phobert_model
        # Size the head from the encoder's own config instead of assuming a
        # 1024-wide model; fall back to 1024 when no config is available so
        # existing callers see the same layer shapes as before.
        encoder_config = getattr(phobert_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, num_labels)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids.

        Args:
            input_ids: Token-id tensor passed straight to the encoder.
            attention_mask: Attention mask passed straight to the encoder.

        Returns:
            Tensor of logits whose last dimension is ``num_labels``.
        """
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        # No softmax here: the training loss (CrossEntropyLoss) expects raw logits.
        return self.fc3(x)

# Build the classifier on top of the encoder, then its loss and optimizer
model = CustomClassifier(phobert_model, num_labels)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

# Wrap the pre-tokenized tensors in datasets and batch them; only the
# training batches are shuffled
train_dataset = TensorDataset(X_train_ids, X_train_mask, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = TensorDataset(X_test_ids, X_test_mask, y_test)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Training loop
# One value drives both the loop and the progress-bar label, so the label
# cannot drift away from the number of epochs actually run.
num_epochs = 15
best_accuracy = 0
for epoch in range(num_epochs):
    # Training phase: one optimizer step per mini-batch
    model.train()
    total_loss = 0
    total_acc_train = 0
    for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        predictions = torch.argmax(outputs, dim=-1)
        total_acc_train += (predictions == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples
    train_accuracy = total_acc_train / len(train_dataset)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Train Accuracy: {train_accuracy}")

    # Validation phase
    model.eval()
    total_acc = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=-1)
            total_acc += (predictions == labels).sum().item()

    accuracy = total_acc / len(test_dataset)
    print(f"Epoch {epoch+1}, Validation Accuracy: {accuracy}")

    # Save the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pth')
        joblib.dump(label_encoder, 'label_encoder.pkl')
        print("Model and label encoder saved.")

# The in-memory weights are those of the final epoch, which is not necessarily
# the best one. Reload the best checkpoint so that any inference done after
# training uses it. `best_accuracy > 0` means a checkpoint was written during
# this run, so a stale file from an earlier run is never picked up.
if best_accuracy > 0:
    model.load_state_dict(torch.load('best_model.pth', map_location=device))

# Function to make predictions on a new dataset
def predict(file_path, file_output, batch_size=16):
    """Classify every row of an Excel sheet and write the results to a new sheet.

    Reads ``file_path``, normalizes its ``product_name`` column, runs the
    module-level ``model`` over the names in batches, and writes the frame
    to ``file_output`` with two extra columns: ``predicted_label`` (decoded
    with the module-level ``label_encoder``) and ``scores`` (the softmax
    probability of the predicted class).

    Args:
        file_path: Path of the Excel file to read; it must contain a
            ``product_name`` column.
        file_output: Path of the Excel file to write.
        batch_size: Number of rows tokenized and scored per forward pass.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    df = pd.read_excel(file_path)
    df['product_name'] = df['product_name'].apply(normalize_text)
    texts = df['product_name'].tolist()

    model.eval()
    predictions = []
    scores = []
    with torch.no_grad():
        # Tokenize one batch at a time: only the current batch is held on the
        # device, and it is padded to its own longest row rather than to the
        # longest row of the whole sheet. Padding is masked out by the
        # attention mask, so this should not change the predictions beyond
        # floating-point noise. An empty sheet simply skips this loop.
        for start in tqdm(range(0, len(texts), batch_size)):
            input_ids, attention_mask = encode_texts(texts[start:start + batch_size], tokenizer)
            outputs = model(input_ids, attention_mask)
            # max over the class dimension yields the winning probability and
            # its class index in a single call
            batch_scores, batch_predictions = torch.softmax(outputs, dim=-1).max(dim=-1)
            predictions.extend(batch_predictions.cpu().numpy())
            scores.extend(batch_scores.cpu().numpy())

    df['predicted_label'] = label_encoder.inverse_transform(predictions)
    df['scores'] = scores
    df.to_excel(file_output, index=False)
    print(f"Prediction results saved to {file_output}")

# Score the full sheet and write the labelled copy next to it
predict(
    file_path="/content/mat_na_0510_AI_AT_clean.xlsx",
    file_output="/content/mat_na_0510_AI_AT_clean_AI_txt.xlsx",
)
