In [1]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch.nn as nn
import joblib

## Load model
# Single source of truth for the checkpoint name, shared by the backbone and its tokenizer.
model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
model_alibaba = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Training spreadsheet; later statements read its 'product_name' and 'brand_clean' columns.
df = pd.read_excel(r"/content/mask_train.xlsx")


## Device
# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

## Function
# def encode(text):
#     result = tokenizer(input_texts, max_length=150, padding=True, truncation=True, return_tensors='pt')
#     return result['input_ids'], result['attention_mask']

def encode(text_list):
    """Tokenize a batch of texts with the notebook-level ``tokenizer``.

    The tokenizer is called with padding enabled, truncation at 150 tokens
    and PyTorch tensors as the return format.

    Args:
        text_list: Texts to tokenize; passed to the tokenizer unchanged.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    tokenizer_options = dict(max_length=150, padding=True, truncation=True, return_tensors='pt')
    batch = tokenizer(text_list, **tokenizer_options)
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    return input_ids, attention_mask



def normalized(text):
    """Return ``text`` as a lower-cased string without surrounding whitespace.

    Any value is accepted: it is converted with ``str()`` first, so non-string
    inputs such as numbers or ``None`` become their string representation.
    """
    as_string = str(text)
    lowered = as_string.lower()
    return lowered.strip()


# Normalize both text columns (string conversion, lower-casing, trimming).
for text_column in ('product_name', 'brand_clean'):
    df[text_column] = df[text_column].apply(normalized)


## Label
# Fit the encoder on the brand column, then turn the brands into integer class ids.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['brand_clean'])
num_classes = len(label_encoder.classes_)
y_encode = torch.tensor(label_encoder.transform(df['brand_clean'])).long()


## Train/test split: 80/20 with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(df['product_name'], y_encode, test_size=0.2, random_state=42)

## Tokenize the training and test texts (once each; the repeated pass was redundant work)
train_input_ids, train_attention_mask = encode(X_train.to_list())
test_input_ids, test_attention_mask = encode(X_test.to_list())

## Build TensorDatasets from the tokenized tensors and their matching labels
train_data = TensorDataset(train_input_ids, train_attention_mask, y_train)
test_data = TensorDataset(test_input_ids, test_attention_mask, y_test)

## DataLoader
# Only the training data is shuffled; accuracy on the test set does not depend
# on batch order, so the evaluation loader keeps a deterministic order.
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False)


## Custom model

class Custom_Model(nn.Module):
    """Classification head on top of a transformer backbone.

    The backbone's first-token hidden state is passed through two
    ReLU + dropout hidden layers (512 and 256 units) and a final linear
    layer that produces one logit per class.

    Args:
        model: Backbone module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        num_classes: Number of output classes (size of the logits vector).
    """

    def __init__(self, model, num_classes):
        super().__init__()
        self.model = model
        # Size the first layer from the backbone's config when it advertises a
        # hidden size; fall back to 768 (the previously hard-coded width).
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask):
        """Return raw, unnormalized class logits for a batch of token ids."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence.
        first_token_state = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(self.relu(self.fc1(first_token_state)))
        x = self.dropout(self.relu(self.fc2(x)))
        # No softmax here: nn.CrossEntropyLoss expects raw logits.
        return self.fc3(x)

## Define model
model = Custom_Model(model=model_alibaba, num_classes=num_classes)
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)


best_accuracy = 0
EPOCHS = 20
for epoch in range(EPOCHS):
    # Same progress-bar caption for the training and the evaluation pass.
    progress_label = f"Epoch{epoch + 1}/{EPOCHS}"

    ## Train
    model.train()
    total_loss = 0
    total_acc_train = 0
    for batch in tqdm(train_dataloader, desc=progress_label):
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        predictions = outputs.argmax(dim=-1)
        total_acc_train += (predictions == labels).sum().item()
    # Accuracy is per sample, loss is averaged per batch.
    train_accuracy = total_acc_train / len(train_data)
    average_loss = total_loss / len(train_dataloader)
    print(f"Average loss: {average_loss:.4f}")
    print(f"Train accuracy: {train_accuracy}")

    ## Evaluate
    model.eval()
    total_acc = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc=progress_label):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            outputs = model(input_ids, attention_mask)
            predictions = outputs.argmax(dim=-1)
            total_acc += (predictions == labels).sum().item()
    accuracy = total_acc / len(test_data)
    print(f"Test accuracy: {accuracy}")

    # Persist the label encoder and the weights whenever test accuracy improves.
    if accuracy > best_accuracy:
        joblib.dump(label_encoder, 'big_label_encoder.pkl')
        torch.save(model.state_dict(), 'best_model.pth')
        best_accuracy = accuracy






ModuleNotFoundError: No module named 'pandas'

In [None]:
## Predict files

## Load model
model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
model_alibaba = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

## Load the label encoder that was saved during training
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load encoder files produced by a trusted training run.
label_encoder = joblib.load('/content/big_label_encoder.pkl')
num_labels = len(label_encoder.classes_)

## Rebuild the classifier and restore the saved weights
checkpoint_path = '/content/best_model.pth'
model = Custom_Model(model = model_alibaba,num_classes = num_labels)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)
print(f"Model loaded from {checkpoint_path}")



# tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
# model = AutoModel.from_pretrained("xlm-roberta-large")

def encode_predict(texts):
    """Tokenize texts for inference with the notebook-level ``tokenizer``.

    The tokenizer is called with padding enabled, truncation at 256 tokens
    and PyTorch tensors as the return format.

    NOTE(review): the training-side ``encode`` truncates at 150 tokens while
    this helper uses 256 -- confirm the difference is intentional.

    Args:
        texts: Texts to tokenize; passed to the tokenizer unchanged.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    tokenizer_options = dict(padding=True, truncation=True, max_length=256, return_tensors="pt")
    batch = tokenizer(texts, **tokenizer_options)
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    return input_ids, attention_mask

## Custom model 

class Custom_Model(nn.Module):
    """Classification head on top of a transformer backbone (inference copy).

    Same architecture as the class defined in the training cell, so the saved
    state dict loads into it: the backbone's first-token hidden state goes
    through two ReLU + dropout hidden layers (512 and 256 units) and a final
    linear layer that produces one logit per class.

    Args:
        model: Backbone module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        num_classes: Number of output classes (size of the logits vector).
    """

    def __init__(self, model, num_classes):
        super().__init__()
        self.model = model
        # Size the first layer from the backbone's config when it advertises a
        # hidden size; fall back to 768 (the previously hard-coded width).
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask):
        """Return raw, unnormalized class logits for a batch of token ids."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence.
        first_token_state = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(self.relu(self.fc1(first_token_state)))
        x = self.dropout(self.relu(self.fc2(x)))
        # Logits are returned as-is; callers apply softmax/argmax themselves.
        return self.fc3(x)

def predict(file_path, file_output, batch_size=16):
    """Classify every row of an Excel file and write the result to a new file.

    Reads ``file_path``, normalizes the ``product_name`` column, runs the
    notebook-level ``model`` over it in batches and adds two columns:
    ``predicted_label`` (decoded with ``label_encoder``) and ``scores``
    (the highest softmax probability of each row). The augmented frame is
    saved to ``file_output`` without the index.

    Args:
        file_path: Path of the input Excel file; it must contain a
            ``product_name`` column.
        file_output: Path of the Excel file to write.
        batch_size: Number of rows sent through the model per forward pass.
            Defaults to 16, the value that used to be hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if the number of
            collected predictions/scores differs from the number of rows.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    df = pd.read_excel(file_path)
    df['product_name'] = df['product_name'].apply(normalized)

    predictions = []
    scores = []

    # An empty sheet skips tokenization and inference entirely and is written
    # back with empty result columns.
    if len(df) > 0:
        input_ids, attention_mask = encode_predict(df['product_name'].tolist())

        # Evaluation mode disables dropout so predictions are deterministic.
        model.eval()

        with torch.no_grad():
            for start in tqdm(range(0, len(input_ids), batch_size)):
                stop = start + batch_size
                batch_input_ids = input_ids[start:stop].to(device)
                batch_attention_mask = attention_mask[start:stop].to(device)

                outputs = model(batch_input_ids, batch_attention_mask)
                batch_scores = torch.softmax(outputs, dim=-1)
                batch_predictions = torch.argmax(outputs, dim=-1)

                # Accumulate the predicted class ids and their top probability.
                predictions.extend(batch_predictions.cpu().numpy())
                scores.extend(batch_scores.max(dim=-1).values.cpu().numpy())

    # Guard against silently misaligned columns.
    if len(predictions) != len(df) or len(scores) != len(df):
        raise ValueError(f"Mismatch between predictions/scores and DataFrame size: {len(predictions)} predictions, {len(df)} rows")

    # Convert class ids back to their original label strings.
    if predictions:
        df['predicted_label'] = label_encoder.inverse_transform(predictions)
    else:
        df['predicted_label'] = []
    df['scores'] = scores

    # Save the DataFrame to an Excel file
    df.to_excel(file_output, index=False)

    print(f"Export done: Data saved at {file_output}")

  

# Run inference on the input workbook and write the labelled rows to the output workbook.
predict(r"/content/mat_na_clean.xlsx",r"/content/mat_na_predict.xlsx")













