In [53]:
import os
import re
import string
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import emoji
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

import nltk
nltk.download('punkt')
nltk.download('wordnet')
import nltk
import subprocess

# Download and unzip wordnet into the writable Kaggle working directory when
# NLTK cannot locate the archive on its search path.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find reports a missing resource with LookupError; catching only
    # that keeps KeyboardInterrupt and unrelated failures visible.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # -o overwrites existing files without asking. A notebook cell has no stdin,
    # so the interactive "replace ...?" prompt would read EOF and skip the files.
    subprocess.run([
        "unzip", "-o",
        "/kaggle/working/corpora/wordnet.zip",
        "-d", "/kaggle/working/corpora",
    ])
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [54]:
# 1. Lowercasing
def lowercase_text(text):
    """Return ``text`` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

# 2. Remove HTML Tags
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return only its text.

    Text fragments from separate nodes are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

# 3. Remove URLs
def remove_urls(text):
    """Delete every ``http://``/``https://`` URL and every ``www.`` address.

    A match runs up to the next whitespace character; the surrounding
    whitespace itself is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# 4. Remove Punctuations
def remove_punctuations(text):
    """Strip every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a code point to None tells str.translate to delete it.
    deletion_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletion_table)

# 5. Handling ChatWords
# Abbreviation -> expansion table. Keys are upper case, so lookups go through
# word.upper() and therefore match regardless of the input's casing.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "B4": "Before",
    "LOL": "Laughing Out Loud",
    "BRB": "Be Right Back",
    "FYI": "For Your Information",
    "IMO": "In My Opinion",
    "IMHO": "In My Humble Opinion",
    "LMAO": "Laughing My Ass Off",
    "GR8": "Great",
    "IRL": "In Real Life",
    "ILY": "I Love You",
    "BTW": "By The Way",
    "THX": "Thanks",
    "PLS": "Please",
    "WTF": "What The Fuck"
}

def chatword_conversion(text):
    """Replace whitespace-separated chat abbreviations with their expansions.

    Words not present in ``chat_words`` are kept as they are. Because the text
    is split on whitespace and re-joined with single spaces, runs of
    whitespace are collapsed and leading/trailing whitespace is dropped.
    """
    expanded = (chat_words.get(token.upper(), token) for token in text.split())
    return " ".join(expanded)

# 6. Spelling Correction
def correct_spelling(text):
    """Return ``text`` after TextBlob's ``correct()`` pass, as a plain ``str``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# 8. Remove Emojis
# Compiled once at import time instead of on every call.
# The previous pattern contained the single range U+24C2..U+1F251, which also
# covers CJK, Hangul, full-width forms and many other scripts, so ordinary
# non-Latin text was deleted. The ranges below are all subsets of what the old
# pattern matched, restricted to symbol/pictograph blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"      # emoticons
    "\U0001F300-\U0001F5FF"      # symbols & pictographs
    "\U0001F680-\U0001F6FF"      # transport & map symbols
    "\U0001F000-\U0001F251"      # tiles, cards, enclosed alphanumerics, flags
    "\u24C2"                     # circled M
    "\u25AA-\u25FE"              # geometric shapes
    "\u2600-\u26FF"              # miscellaneous symbols
    "\u2702-\u27B0"              # dingbats
    "\u2B00-\u2BFF"              # miscellaneous symbols and arrows
    "\u3030\u303D\u3297\u3299"   # wavy dash, part alternation mark, circled ideographs
    "\uFE0F"                     # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Delete emoji and pictographic symbols from ``text``.

    Only characters in the symbol blocks listed in ``_EMOJI_PATTERN`` are
    removed; letters of any script (including CJK) are preserved. Whitespace
    around a removed emoji is left in place.
    """
    return _EMOJI_PATTERN.sub('', text)

# 9. Tokenization
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# 10. Stemming
stemmer = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word with the shared Porter stemmer.

    The stems are re-joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

# 11. Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Tokenize ``text`` and lemmatize every token as a verb (``pos='v'``).

    The lemmas are joined back into one space-separated string.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return " ".join(lemmas)

# Full text-cleaning pipeline
def clean_text(text):
    """Run ``text`` through the cleaning steps in a fixed order and return it.

    Order: lowercase -> strip HTML -> strip URLs -> strip emojis -> expand
    chat words -> strip punctuation -> stem -> lemmatize.
    """
    # Spelling correction (correct_spelling) is intentionally not part of the
    # pipeline because it can be resource-intensive; if needed, add it right
    # after remove_punctuations.
    steps = (
        lowercase_text,
        remove_html_tags,
        remove_urls,
        remove_emojis,
        chatword_conversion,
        remove_punctuations,
        stem_words,
        lemmatize_words,
    )
    for step in steps:
        text = step(text)
    return text

In [None]:
import nltk
nltk.download('wordnet')
def _append_reviews(data, folder, label, desc, parse_rating):
    """Clean every ``.txt`` file in ``folder`` and append it to ``data`` in place.

    Args:
        data (dict): Column lists keyed by 'review', 'sentiment' and 'rating'.
        folder (str): Directory whose ``.txt`` files are read as UTF-8.
        label: Value appended to ``data['sentiment']`` for every file.
        desc (str): Progress-bar caption.
        parse_rating (bool): When True, the rating is parsed from the file
            name; otherwise ``None`` is stored.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the resulting frame reproducible across runs and machines.
    for filename in tqdm(sorted(os.listdir(folder)), desc=desc):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        data['review'].append(clean_text(content))
        data['sentiment'].append(label)
        # The rating is the second '_'-separated field of the file name, up to
        # the first dot.
        rating = int(filename.split('_')[1].split('.')[0]) if parse_rating else None
        data['rating'].append(rating)


def load_data_from_folders(base_path, dataset_type='train'):
    """Load and clean reviews from ``<base_path>/<dataset_type>/{pos,neg[,unsup]}``.

    Args:
        base_path (str): Directory that contains the split folders.
        dataset_type (str): Split folder name; 'unsup' is read only for 'train'.

    Returns:
        pandas.DataFrame: Columns 'review' (cleaned text), 'sentiment'
        (1 for pos, 0 for neg, None for unlabeled) and 'rating' (int parsed
        from the file name, None for unlabeled).
    """
    data = {'review': [], 'sentiment': [], 'rating': []}
    sentiment_labels = {'pos': 1, 'neg': 0}
    split_dir = os.path.join(base_path, dataset_type)

    # Labeled data
    for sentiment in ['pos', 'neg']:
        _append_reviews(
            data,
            os.path.join(split_dir, sentiment),
            sentiment_labels[sentiment],
            f'Loading {dataset_type}/{sentiment}',
            parse_rating=True,
        )

    # Unlabeled data (unsup) exists only for the training split
    unsup_dir = os.path.join(split_dir, 'unsup')
    if dataset_type == 'train' and os.path.exists(unsup_dir):
        _append_reviews(data, unsup_dir, None, 'Loading train/unsup', parse_rating=False)

    return pd.DataFrame(data)


# Root folder handed to load_data_from_folders, which reads the
# <base_path>/train and <base_path>/test subfolders.
base_path = '/kaggle/input/imdb-dataset/aclImdb'  

# Read and clean both splits (each call runs clean_text on every review file).
train_df = load_data_from_folders(base_path, 'train')
test_df = load_data_from_folders(base_path, 'test')

# Quick look at the first rows of each split.
print(train_df.head())
print(test_df.head())

In [None]:
# Show the loaded training DataFrame.
print(train_df)

In [55]:
import pandas as pd

# Load the train/test frames from CSV; this rebinds train_df and test_df.
# NOTE(review): presumably these files hold the output of
# load_data_from_folders saved earlier — confirm against the dataset.
data_path = "/kaggle/input/imdb-train-test-processed-data"

train_df = pd.read_csv(f"{data_path}/train_data.csv")


test_df = pd.read_csv(f"{data_path}/test_data.csv")


In [56]:
# Split the training frame into labeled and unlabeled rows.
# labeled_df keeps rows where both 'sentiment' and 'rating' are present;
# unlabeled_df keeps rows whose 'sentiment' is missing.
labeled_df = train_df.dropna(subset=['sentiment', 'rating'])
unlabeled_df = train_df[train_df['sentiment'].isna()]


In [None]:
# Show the labeled and unlabeled partitions of the training data.
print(labeled_df)
print(unlabeled_df)

In [57]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertPreTrainedModel, BertConfig, BertModel
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm import tqdm

# Dataset definition
class IMDBDataset(Dataset):
    """Map-style dataset pairing each review with its sentiment and rating.

    Each item is tokenized on access with ``tokenizer.encode_plus`` and padded
    or truncated to ``max_len``.
    """

    def __init__(self, texts, sentiments, ratings, tokenizer, max_len):
        self.texts, self.sentiments, self.ratings = texts, sentiments, ratings
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict with the raw text, flattened token tensors and labels."""
        # str() guards against non-string entries in the text list.
        review = str(self.texts[item])
        sentiment_label = self.sentiments[item]
        rating_label = self.ratings[item]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(review, **encode_options)

        sample = {'text': review}
        # flatten() collapses the tokenizer's tensors to 1-D so the DataLoader
        # can stack them into a batch.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['sentiment'] = torch.tensor(sentiment_label, dtype=torch.long)
        sample['rating'] = torch.tensor(rating_label, dtype=torch.float)
        return sample

# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Parameters
MAX_LEN = 256      # sequence length every review is padded/truncated to
BATCH_SIZE = 16    # batch size for both data loaders

# Build the datasets
# Training uses only the labeled rows; unlabeled rows have no targets.
train_dataset = IMDBDataset(
    texts=labeled_df['review'].tolist(),
    sentiments=labeled_df['sentiment'].tolist(),
    ratings=labeled_df['rating'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Shuffle the training batches each epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = IMDBDataset(
    texts=test_df['review'].tolist(),
    sentiments=test_df['sentiment'].tolist(),
    ratings=test_df['rating'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Keep the test order fixed so predictions line up with test_df rows.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
from nltk.corpus import stopwords
from wordcloud import WordCloud 
import matplotlib.pyplot as plt

def create_word_cloud(text_series, title):
    """Render a word cloud of all texts in ``text_series`` with English stop words removed.

    Args:
        text_series: Iterable of review strings (e.g. a pandas Series).
            Non-string entries such as NaN are skipped.
        title (str): Title shown above the figure.
    """
    stop_words = set(stopwords.words('english'))

    # Join all texts. Non-string entries (NaN from empty CSV cells) are
    # skipped because str.join raises TypeError on them.
    text = ' '.join(entry for entry in text_series if isinstance(entry, str))

    wordcloud = WordCloud(width=800, height=400, background_color ='white',
                          stopwords = stop_words, min_font_size = 10).generate(text)

    plt.figure(figsize = (10, 5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout so the layout accounts for it.
    plt.title(title)
    plt.tight_layout(pad = 0)
    plt.show()

# Draw word clouds for the labeled and the unlabeled data
create_word_cloud(labeled_df['review'], 'Облако слов для размеченных данных')
create_word_cloud(unlabeled_df['review'], 'Облако слов для неразмеченных данных')


In [None]:
from nltk.corpus import stopwords
from collections import Counter

# Print the most frequent words
def print_most_frequent_words(text_series, num_words=20):
    """Print the ``num_words`` most common non-stop-words in ``text_series``.

    Args:
        text_series: Iterable of review strings (e.g. a pandas Series).
            Non-string entries such as NaN are skipped.
        num_words (int): How many of the top words to print.
    """
    stop_words = set(stopwords.words('english'))

    # Join all texts and split into lower-cased words. Non-string entries
    # (NaN from empty CSV cells) are skipped because str.join raises
    # TypeError on them.
    all_words = ' '.join(
        entry for entry in text_series if isinstance(entry, str)
    ).lower().split()

    # Count every word that is not an English stop word
    word_counts = Counter(word for word in all_words if word not in stop_words)

    # Print the most frequent words with their counts
    print(f"Top {num_words} most frequent words:")
    for word, count in word_counts.most_common(num_words):
        print(f"{word}: {count}")

# Print the most frequent words for the labeled data
print_most_frequent_words(labeled_df['review'])

# Print the most frequent words for the unlabeled data
print_most_frequent_words(unlabeled_df['review']) 

In [58]:
class BertForSentimentAndRating(BertPreTrainedModel):
    """BERT encoder with two linear heads sharing one pooled representation.

    ``classifier`` emits 2 sentiment logits and ``regressor`` emits a single
    rating value per input sequence.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        hidden_size = config.hidden_size
        # Sentiment classification head
        self.classifier = nn.Linear(hidden_size, 2)
        # Rating regression head
        self.regressor = nn.Linear(hidden_size, 1)

        self.init_weights()

    def forward(self, input_ids, attention_mask):
        """Return ``(sentiment_logits, rating_output)`` for a batch of token ids."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Element 1 of the BertModel output is used as the sequence-level
        # feature (presumably the pooled [CLS] output; confirm against the
        # installed transformers version).
        features = self.dropout(encoder_outputs[1])

        return self.classifier(features), self.regressor(features)

In [59]:
# Define the target device before its first use so this cell also works on a
# fresh kernel (Restart & Run All) instead of relying on leftover state.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained encoder; the classifier/regressor heads are newly initialized.
config = BertConfig.from_pretrained('bert-base-uncased')
model = BertForSentimentAndRating.from_pretrained('bert-base-uncased', config=config)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSentimentAndRating were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [61]:
# Loss functions and optimizer
# Cross-entropy for the 2-way sentiment logits, mean squared error for the rating.
criterion_sentiment = nn.CrossEntropyLoss()
criterion_rating = nn.MSELoss()

# A single AdamW optimizer updates all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

In [62]:
def train_epoch(model, data_loader, criterion_sentiment, criterion_rating, optimizer, device, scheduler=None):
    """Train ``model`` for one pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Callable returning ``(sentiment_logits, rating_output)`` for
            ``input_ids`` and ``attention_mask``.
        data_loader: Yields dict batches with 'input_ids', 'attention_mask',
            'sentiment' and 'rating' tensors.
        criterion_sentiment: Loss applied to the sentiment logits.
        criterion_rating: Loss applied to the rating output.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch
            right after the optimizer. Ignored when ``None``.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0

    for batch in tqdm(data_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        sentiments = batch['sentiment'].to(device)
        ratings = batch['rating'].to(device)

        optimizer.zero_grad()

        logits, outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss_sentiment = criterion_sentiment(logits, sentiments)
        # squeeze(-1) drops only the trailing size-1 dimension, so a batch
        # with a single sample keeps its batch axis and matches `ratings`.
        # NOTE(review): assumes the rating output is (batch, 1) — confirm for
        # models other than BertForSentimentAndRating.
        loss_rating = criterion_rating(outputs.squeeze(-1), ratings)
        # Both task losses are summed with equal weight.
        loss = loss_sentiment + loss_rating

        loss.backward()
        optimizer.step()
        # Previously the scheduler argument was accepted but never used.
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

def eval_model(model, data_loader, criterion_sentiment, criterion_rating, device):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Callable returning ``(sentiment_logits, rating_output)``.
        data_loader: Yields dict batches with 'input_ids', 'attention_mask',
            'sentiment' and 'rating' tensors.
        criterion_sentiment: Loss applied to the sentiment logits.
        criterion_rating: Loss applied to the rating output.
        device: Device every batch tensor is moved to.

    Returns:
        tuple: ``(avg_loss, accuracy, mse)`` — mean batch loss, sentiment
        accuracy and rating mean squared error over all samples.
    """
    model.eval()
    total_loss = 0
    all_preds_sent = []
    all_true_sent = []
    all_preds_rating = []
    all_true_rating = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sentiments = batch['sentiment'].to(device)
            ratings = batch['rating'].to(device)

            logits, outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # squeeze(-1) keeps the batch axis even for a one-sample batch. A
            # bare squeeze() would yield a 0-d array there, and list.extend
            # cannot iterate over it.
            # NOTE(review): assumes the rating output is (batch, 1) — confirm.
            rating_values = outputs.squeeze(-1)

            loss_sentiment = criterion_sentiment(logits, sentiments)
            loss_rating = criterion_rating(rating_values, ratings)
            loss = loss_sentiment + loss_rating

            total_loss += loss.item()

            # Sentiment predictions: index of the larger logit
            preds_sent = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds_sent.extend(preds_sent)
            all_true_sent.extend(sentiments.cpu().numpy())

            # Rating predictions
            preds_rating = rating_values.cpu().numpy()
            all_preds_rating.extend(preds_rating)
            all_true_rating.extend(ratings.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_true_sent, all_preds_sent)
    mse = mean_squared_error(all_true_rating, all_preds_rating)

    return avg_loss, accuracy, mse

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import warnings

# Silence warnings; this filter is process-wide and stays active for all later cells.
warnings.filterwarnings("ignore")

# Load the tokenizer and the masked-language-model variant of BERT
tokenizer_mlm = BertTokenizer.from_pretrained('bert-base-uncased')
model_mlm = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Dataset for MLM pre-training
class MLMDataset(Dataset):
    """Map-style dataset yielding tokenized texts without labels.

    Each item is tokenized on access with ``tokenizer.encode_plus`` and padded
    or truncated to ``max_len``.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return flattened 'input_ids' and 'attention_mask' tensors for one text."""
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # str() guards against non-string entries in the text list.
        encoded = self.tokenizer.encode_plus(str(self.texts[item]), **encode_options)
        return {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }

# MLM pre-training uses only the unlabeled reviews.
mlm_dataset = MLMDataset(
    texts=unlabeled_df['review'].tolist(),
    tokenizer=tokenizer_mlm,
    max_len=MAX_LEN
)

# Collator configured for masked-language modeling with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_mlm,
    mlm=True,
    mlm_probability=0.15
)

# Training parameters
training_args = TrainingArguments(
    output_dir='./mlm_results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=10000,
    save_total_limit=2,
    prediction_loss_only=True,
    report_to="none",  # disable reporting integrations such as wandb
)

trainer = Trainer(
    model=model_mlm,
    args=training_args,
    data_collator=data_collator,
    train_dataset=mlm_dataset,
)

# Train the MLM model
trainer.train()

# Save the further pre-trained MLM model and its tokenizer to one folder
mlm_model_save_path = 'bert_mlm_pretrained'
model_mlm.save_pretrained(mlm_model_save_path)
tokenizer_mlm.save_pretrained(mlm_model_save_path)

print("MLM pretraining completed and model saved.")

In [76]:
# Load the additionally pre-trained (MLM) checkpoint into the two-head model
model = BertForSentimentAndRating.from_pretrained(mlm_model_save_path, config=config)
model = model.to(device)

# Continue training on the classification + regression task.
# A fresh optimizer is created because `model` was just rebound to a new object.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
EPOCHS = 7
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS} - Post-MLM Pretraining Training')
    train_loss = train_epoch(
        model,
        train_loader,
        criterion_sentiment,
        criterion_rating,
        optimizer,
        device
    )
    print(f'Train loss: {train_loss}')
    
    
    # Evaluate the model on the test data after every epoch
    test_loss, test_accuracy, test_mse = eval_model(
        model,
        test_loader,
        criterion_sentiment,
        criterion_rating,
        device
    )

    print(f'Тестовая потеря: {test_loss}')
    print(f'Точность сентимента: {test_accuracy}')
    print(f'Среднеквадратичная ошибка рейтинга: {test_mse}')
    # Save a separate checkpoint per epoch; the suffix is the zero-based epoch index.
    model_save_path = f'bert_sentiment_rating_model_after_mlm{epoch}'
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

Some weights of the model checkpoint at /kaggle/input/bert_mlm/pytorch/default/1 were not used when initializing BertForSentimentAndRating: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSentimentAndRating from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSentimentAndRating from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSentimentAndRating were not initialized from the model checkpoint at /kaggle/input/bert_mlm/pytorch/default/1 and are newly initialized: ['bert.pooler.dens

Epoch 1/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:16<00:00,  1.96it/s]


Train loss: 4.374041260318426


Evaluating: 100%|██████████| 1563/1563 [06:07<00:00,  4.25it/s]


Тестовая потеря: 3.2651459065790904
Точность сентимента: 0.91336
Среднеквадратичная ошибка рейтинга: 3.045119285583496
Epoch 2/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:16<00:00,  1.96it/s]


Train loss: 2.385647070766334


Evaluating: 100%|██████████| 1563/1563 [06:05<00:00,  4.28it/s]


Тестовая потеря: 3.43493384658642
Точность сентимента: 0.90728
Среднеквадратичная ошибка рейтинга: 3.1907355785369873
Epoch 3/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:15<00:00,  1.96it/s]


Train loss: 1.6297956999303131


Evaluating: 100%|██████████| 1563/1563 [06:04<00:00,  4.29it/s]


Тестовая потеря: 3.380218611568956
Точность сентимента: 0.91872
Среднеквадратичная ошибка рейтинга: 3.1406233310699463
Epoch 4/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:15<00:00,  1.97it/s]


Train loss: 1.348862527818994


Evaluating: 100%|██████████| 1563/1563 [06:05<00:00,  4.28it/s]


Тестовая потеря: 3.2782909617504856
Точность сентимента: 0.91564
Среднеквадратичная ошибка рейтинга: 3.010814905166626
Epoch 5/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:15<00:00,  1.97it/s]


Train loss: 1.1214933928319146


Evaluating: 100%|██████████| 1563/1563 [06:04<00:00,  4.29it/s]


Тестовая потеря: 3.406170955794176
Точность сентимента: 0.91536
Среднеквадратичная ошибка рейтинга: 3.1161348819732666
Epoch 6/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:15<00:00,  1.97it/s]


Train loss: 0.954977834093136


Evaluating: 100%|██████████| 1563/1563 [06:05<00:00,  4.28it/s]


Тестовая потеря: 3.2709136634054508
Точность сентимента: 0.91696
Среднеквадратичная ошибка рейтинга: 2.9695193767547607
Epoch 7/7 - Post-MLM Pretraining Training


Training: 100%|██████████| 1563/1563 [13:15<00:00,  1.97it/s]


Train loss: 0.8158995519158021


Evaluating: 100%|██████████| 1563/1563 [06:04<00:00,  4.29it/s]


Тестовая потеря: 3.270464709551763
Точность сентимента: 0.91744
Среднеквадратичная ошибка рейтинга: 2.9524283409118652


In [None]:
# Save the current model and tokenizer to one folder so they can be reloaded together.
model_save_path = 'bert_sentiment_rating_model_after_mlm'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

In [None]:
# EPOCHS = 5

# for epoch in range(EPOCHS):
#     print(f'Epoch {epoch + 1}/{EPOCHS}')
#     train_loss = train_epoch(
#         model,
#         train_loader,
#         criterion_sentiment,
#         criterion_rating,
#         optimizer,
#         device
#     )
#     print(f'Train loss: {train_loss}')
    
   
#     # Оценка модели на тестовых данных
#     test_loss, test_accuracy, test_mse = eval_model(
#         model,
#         test_loader,
#         criterion_sentiment,
#         criterion_rating,
#         device
#     )

#     print(f'Тестовая потеря: {test_loss}')
#     print(f'Точность сентимента: {test_accuracy}')
#     print(f'Среднеквадратичная ошибка рейтинга: {test_mse}')

In [None]:
import matplotlib.pyplot as plt

# Data visualization
def plot_data_distribution(dataframe):
    """Show a bar chart of how many rows carry each value of the 'rating' column.

    Ratings are ordered along the x-axis by value; missing ratings are not
    counted (``value_counts`` drops them by default).
    """
    counts_by_rating = dataframe['rating'].value_counts().sort_index()
    _, ax = plt.subplots(figsize=(10, 6))
    ax.bar(counts_by_rating.index, counts_by_rating.values, color='skyblue')
    ax.set_xlabel('Рейтинг')
    ax.set_ylabel('Количество')
    ax.set_title('Распределение данных по рейтингам')
    plt.show()

# Rating distribution of the test split, then of the training split.
plot_data_distribution(test_df)
plot_data_distribution(train_df)

In [63]:
from transformers import BertTokenizer

# Folder the tokenizer, config and weights are all loaded from.
# NOTE(review): presumably a checkpoint saved by the fine-tuning loop — confirm.
model_path = "/kaggle/input/bert_fine_tuned_91_8/pytorch/default/1"

# This rebinds the global tokenizer, config and model used by later cells.
tokenizer = BertTokenizer.from_pretrained(model_path)

config = BertConfig.from_pretrained(model_path)

model = BertForSentimentAndRating.from_pretrained(model_path, config=config)


# Move the model to the GPU when CUDA is available, otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



In [66]:
def predict_review(text, tokenizer, model, device, max_len=256):
    """
    Predict the sentiment and rating of a single review.

    Args:
        text (str): Review text.
        tokenizer (BertTokenizer): BERT tokenizer.
        model (BertForSentimentAndRating): Trained model. It is switched to
            eval mode by this call.
        device (torch.device): Device (CPU or GPU) the inputs are moved to.
        max_len (int): Maximum sequence length; longer inputs are truncated.

    Returns:
        tuple: (sentiment label string, integer rating clamped to 1..10)
    """
    # Clean the text with the same pipeline used when loading the dataset
    cleaned_text = clean_text(text)

    # Tokenize
    encoding = tokenizer.encode_plus(
        cleaned_text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Move the inputs to the target device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference must not depend on whatever mode the caller left the model in:
    # in train mode dropout stays active and predictions become random.
    model.eval()

    # Predict without tracking gradients
    with torch.no_grad():
        logits, output = model(input_ids, attention_mask)
        sentiment = torch.argmax(logits, dim=1).item()
        rating = output.squeeze().item()

    # Map the class index to a label (index 1 is the positive class)
    sentiment_label = 'Положительный' if sentiment == 1 else 'Отрицательный'

    # Round the rating and clamp it to the 1..10 range
    rating = round(rating)
    rating = max(1, min(10, rating))

    return sentiment_label, rating


In [70]:
# Example review
sample_text = "This film is awesome!!! One of the my favorite movie"

# Prediction
sentiment, rating = predict_review(sample_text, tokenizer, model, device)

print(f"Тональность: {sentiment}, Рейтинг: {rating}/10")

Тональность: Положительный, Рейтинг: 10/10


In [71]:
# Evaluate the model on the test data
test_loss, test_accuracy, test_mse = eval_model(
    model,
    test_loader,
    criterion_sentiment,
    criterion_rating,
    device
)

# Report mean test loss, sentiment accuracy and rating MSE.
print(f'Тестовая потеря: {test_loss}')
print(f'Точность сентимента: {test_accuracy}')
print(f'Среднеквадратичная ошибка рейтинга: {test_mse}')

Evaluating: 100%|██████████| 1563/1563 [06:05<00:00,  4.27it/s]

Тестовая потеря: 3.2248652610577455
Точность сентимента: 0.91672
Среднеквадратичная ошибка рейтинга: 2.9626684188842773





In [None]:


# # # Очистка тестовых данных
# # test_df['review'] = test_df['review'].apply(clean_text)

# # Создание датасета
# test_dataset = IMDBDataset(
#     texts=test_df['review'].tolist(),
#     sentiments=test_df['sentiment'].tolist(),
#     ratings=test_df['rating'].tolist(),
#     tokenizer=tokenizer,
#     max_len=MAX_LEN
# )

# # Создание DataLoader для тестовых данных
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# print("Тестовый DataLoader создан.")

In [None]:
# def generate_pseudo_labels(model, data_loader, device, threshold=0.9):
#     model.eval()
#     pseudo_data = []

#     with torch.no_grad():
#         for batch in tqdm(data_loader, desc='Generating Pseudo Labels'):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)

#             logits, outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask
#             )

#             # Для тональности
#             probs_sent = torch.softmax(logits, dim=1)
#             max_probs, preds_sent = torch.max(probs_sent, dim=1)

#             # Для рейтинга 
#             preds_rating = outputs.squeeze()  # Предполагается, что outputs - это рейтинг от 0 до 1
#             preds_rating = (preds_rating * 9) + 1  # Масштабируем рейтинг до 1-10 и округляем
#             preds_rating = torch.round(preds_rating).long()  # Округляем до ближайшего целого числа

#             for i in range(len(preds_sent)):
#                 if max_probs[i] >= threshold:
#                     pseudo_data.append({
#                         'review': clean_text(batch['text'][i]),  # Применяем clean_text
#                         'sentiment': preds_sent[i].item(),
#                         'rating': preds_rating[i].item()
#                     })

#     return pd.DataFrame(pseudo_data)

In [None]:
# # Создание датасета для неразмеченных данных
# class UnlabeledIMDBDataset(Dataset):
#     def __init__(self, texts, tokenizer, max_len):
#         self.texts = texts
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.texts)
    
#     def __getitem__(self, item):
#         text = str(self.texts[item])
        
#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             return_token_type_ids=False,
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='pt',
#         )
        
#         return {
#             'text': text,
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#         }

# # Создание DataLoader для неразмеченных данных
# unlabeled_dataset = UnlabeledIMDBDataset(
#     texts=unlabeled_df['review'].tolist(),
#     tokenizer=tokenizer,
#     max_len=MAX_LEN
# )

# unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# # Генерация псевдо-маркировок
# pseudo_labels_df = generate_pseudo_labels(model, unlabeled_loader, device, threshold=0.95)
# print(f"Псевдо-размеченных данных: {len(pseudo_labels_df)}")

In [None]:
# print(pseudo_labels_df)
# plot_data_distribution(pseudo_labels_df)

In [None]:
# # Объединение размеченных и псевдо-размеченных данных
# augmented_train_df = pd.concat([labeled_df, pseudo_labels_df], ignore_index=True)

# # Создание новых обучающих и валидационных выборок
# X_aug_train, X_aug_val, y_aug_train_sent, y_aug_val_sent, y_aug_train_rating, y_aug_val_rating = train_test_split(
#     augmented_train_df['review'],
#     augmented_train_df['sentiment'],
#     augmented_train_df['rating'],
#     test_size=0.2,
#     random_state=42
# )

# # Создание новых датасетов
# aug_train_dataset = IMDBDataset(
#     texts=X_aug_train.to_list(),
#     sentiments=y_aug_train_sent.tolist(),
#     ratings=y_aug_train_rating.tolist(),
#     tokenizer=tokenizer,
#     max_len=MAX_LEN
# )

# aug_val_dataset = IMDBDataset(
#     texts=X_aug_val.to_list(),
#     sentiments=y_aug_val_sent.tolist(),
#     ratings=y_aug_val_rating.tolist(),
#     tokenizer=tokenizer,
#     max_len=MAX_LEN
# )

# aug_train_loader = DataLoader(aug_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# aug_val_loader = DataLoader(aug_val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# EPOCHS = 3
# # Переинициализация модели для избежания переобучения
# model = BertForSentimentAndRating.from_pretrained('bert-base-uncased', config=config)
# model = model.to(device)

# # Переопределение оптимизатора
# optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# for epoch in range(EPOCHS):
#     print(f'Epoch {epoch + 1}/{EPOCHS} - Augmented Training')
#     train_loss = train_epoch(
#         model,
#         aug_train_loader,
#         criterion_sentiment,
#         criterion_rating,
#         optimizer,
#         device
#     )
#     print(f'Train loss: {train_loss}')
    
#     val_loss, val_accuracy, val_mse = eval_model(
#         model,
#         aug_val_loader,
#         criterion_sentiment,
#         criterion_rating,
#         device
#     )
#     print(f'Validation loss: {val_loss}')
#     print(f'Validation Sentiment Accuracy: {val_accuracy}')
#     print(f'Validation Rating MSE: {val_mse}')
#     print('-' * 30)

In [None]:
import os

# Create the output folder if it does not exist yet. exist_ok=True replaces
# the separate exists() check, so there is no gap between checking and creating.
model_save_path = 'bert_sentiment_rating_model2' 
os.makedirs(model_save_path, exist_ok=True)

# Save the model and the tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Модель сохранена в {model_save_path}")

In [72]:

all_preds_sent = []
all_true_sent = []
all_preds_rating = []
all_true_rating = []
all_texts = []  # review texts, kept for the error analysis

# Put the model in eval mode explicitly so dropout is disabled regardless of
# which cells ran before this one.
model.eval()

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Получение предсказаний'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        sentiments = batch['sentiment'].to(device)
        ratings = batch['rating'].to(device)
        texts = batch['text']

        logits, outputs = model(input_ids, attention_mask)

        # Sentiment predictions: index of the larger logit
        preds_sent = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds_sent.extend(preds_sent)
        all_true_sent.extend(sentiments.cpu().numpy())

        # Rating predictions. squeeze(-1) keeps the batch axis even for a
        # one-sample batch; a bare squeeze() would give a 0-d array that
        # list.extend cannot iterate over.
        preds_rating = outputs.squeeze(-1).cpu().numpy()
        all_preds_rating.extend(preds_rating)
        all_true_rating.extend(ratings.cpu().numpy())

        all_texts.extend(texts)

Получение предсказаний: 100%|██████████| 1563/1563 [06:06<00:00,  4.27it/s]


In [73]:
# Absolute per-example error of each head (prediction vs. ground truth)
errors_sent = [
    abs(predicted - actual)
    for predicted, actual in zip(all_preds_sent, all_true_sent)
]
errors_rating = [
    abs(predicted - actual)
    for predicted, actual in zip(all_preds_rating, all_true_rating)
]

# Collect everything in one DataFrame and order it by rating error,
# largest first, for the error analysis.
results_columns = {
    'text': all_texts,
    'true_sentiment': all_true_sent,
    'pred_sentiment': all_preds_sent,
    'error_sentiment': errors_sent,
    'true_rating': all_true_rating,
    'pred_rating': all_preds_rating,
    'error_rating': errors_rating,
}
results_df = pd.DataFrame(results_columns).sort_values('error_rating', ascending=False)

In [74]:
# Print the 10 examples with the largest rating error
print("Примеры с наибольшей ошибкой рейтинга:")
pd.set_option('display.max_colwidth', None)  # unlimited column width so full review texts are printed
print(results_df[['text', 'true_rating', 'pred_rating', 'error_rating']].head(10))

# Print examples where the sentiment was misclassified (non-zero sentiment error)
wrong_sentiment_df = results_df[results_df['error_sentiment'] > 0]
print("\nПримеры с ошибкой сентимента:")
print(wrong_sentiment_df[['text', 'true_sentiment', 'pred_sentiment', 'error_sentiment']].head(10))

Примеры с наибольшей ошибкой рейтинга:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [75]:
# Print the 10 examples with the largest rating error (all columns)
print(results_df.head(10))

# Print examples where the sentiment was misclassified (all columns)
wrong_sentiment_df = results_df[results_df['error_sentiment'] > 0]
print(wrong_sentiment_df.head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        