In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, recall_score
import pandas as pd


def load_imdb_data(data_file):
    """Read the labelled CSV and return its texts and binary labels.

    Despite the function's name, the columns read are ``total_no_drug``
    (the text, cast to ``str``) and ``match`` (the label).  A ``match``
    value equal to ``1`` becomes label 1; every other value becomes 0.

    Args:
        data_file: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length Python lists.
    """
    frame = pd.read_csv(data_file)
    text_column = frame['total_no_drug'].astype(str)
    # Element-wise comparison yields booleans; casting to int gives 1/0 labels.
    is_match = frame['match'] == 1
    return text_column.tolist(), is_match.astype(int).tolist()

class TextClassificationDataset(Dataset):
    """Dataset that tokenizes all texts once, when it is constructed.

    The tokenizer is called a single time on the full list of texts with
    padding and truncation to ``max_length``; individual samples are then
    served by indexing into the stored tensors.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class labels, one per text.
        tokenizer: Callable accepting ``(texts, return_tensors=, max_length=,
            padding=, truncation=)`` and returning a mapping that contains
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        tokenizer_options = dict(
            return_tensors='pt',
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )
        self.encodings = tokenizer(texts, **tokenizer_options)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, i.e. number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` at ``idx``."""
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = self.labels[idx]
        return sample

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        # Submodule names (bert / dropout / fc) are the state_dict keys, so
        # they must stay stable for previously saved checkpoints to load.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and report the mean training loss.

    For every batch the gradients are reset, the batch tensors are moved to
    ``device``, cross-entropy loss is back-propagated, and both the optimizer
    and the learning-rate scheduler are stepped (the scheduler advances once
    per batch, not once per epoch).

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer holding ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped after each optimizer step.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean cross-entropy loss over the batches of this epoch, or NaN
        when ``data_loader`` yields no batches.
    """
    model.train()
    # The criterion is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else float('nan')

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    The predicted class of each sample is the index of its largest logit.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(recall, report)`` tuple: the value of ``recall_score`` and the
        text produced by ``classification_report``, both called with their
        default arguments on all collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.inference_mode():
        for batch in data_loader:
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += targets.cpu().tolist()
    recall = recall_score(expected, predicted)
    report = classification_report(expected, predicted)
    return recall, report

def predict_sentiment(text, model, tokenizer, device, max_length=5000):
    """Classify a single text and return ``"positive"`` or ``"negative"``.

    Args:
        text: Input string to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        tokenizer: Tokenizer callable returning ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Requested padded/truncated sequence length.  It is capped
            at ``tokenizer.model_max_length`` when the tokenizer exposes one.

    Returns:
        ``"positive"`` when the arg-max class is 1, otherwise ``"negative"``.
    """
    model.eval()
    # Inputs are padded to exactly `max_length`, so the default of 5000 would
    # produce sequences longer than a BERT checkpoint can embed (bert-base has
    # 512 positions) and fail inside the model.  Cap it at the tokenizer's limit.
    tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
    if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit < max_length:
        max_length = tokenizer_limit
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.inference_mode():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"


In [None]:
# Mount Google Drive at /content/drive/ (Colab-only); the data and model
# paths used by the other cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:

# --- Hyperparameters ---------------------------------------------------------
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128        # every text is padded/truncated to this many tokens
batch_size = 64
num_epochs = 4
learning_rate = 2e-5

# --- Data: read the labelled CSV and hold out 20% for validation -------------
data_file = '/content/drive/MyDrive/reddit/reddit clean/balance_sample_comm.csv'
texts, labels = load_imdb_data(data_file)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# --- Tokenizer, datasets and loaders (only the training loader shuffles) -----
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# --- Device and model --------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

# --- Optimizer and linear-decay schedule (no warm-up) ------------------------
# The scheduler is stepped once per batch, so it needs the total batch count.
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# --- Train, validating after every epoch -------------------------------------
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    recall, report = evaluate(model, val_dataloader, device)
    print(f"Validation recall: {recall:.4f}")
    print(report)

# --- Persist the weights of the final epoch ----------------------------------
final_weights = model.state_dict()
torch.save(final_weights, "/content/drive/MyDrive/reddit/bert_classifier.pth")



tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Epoch 1/4


In [None]:
import os

# Location of the fine-tuned weights on Google Drive.  Defined here so this
# check also works on a fresh kernel, instead of relying on a later cell
# having been run first to create the variable.
saved_model_path = '/content/drive/MyDrive/bert_classifier.pth'

# Report (without raising) when the weights file is missing.
if not os.path.exists(saved_model_path):
    print(f"Error: Model file not found at {saved_model_path}")


In [None]:
# Save the uploaded file to Google Drive
file_name = 'bert_classifier.pth'

# Define the path in Google Drive where you want to save the file
drive_path = '/content/drive/MyDrive/' + file_name

# Resolve the bytes to write BEFORE opening the destination.  Opening with
# 'wb' truncates the file immediately, so if the payload were missing (fresh
# kernel, nothing uploaded in this session) the existing model on Drive would
# be wiped before the failure surfaced.
file_content = globals().get('file_content')
if file_content is None:
    # Fall back to a Colab upload result, if one exists in this session
    # (presumably the dict returned by files.upload() — verify against the
    # cell that performed the upload).
    file_content = globals().get('uploaded', {}).get(file_name)

if file_content is None:
    print(f"Nothing to save: no uploaded content for {file_name} in this session; {drive_path} was left untouched")
else:
    # Save the file
    with open(drive_path, 'wb') as f:
        f.write(file_content)

    print(f"{file_name} has been saved to Google Drive at {drive_path}")


bert_classifier.pth has been saved to Google Drive at /content/drive/MyDrive/bert_classifier.pth


In [None]:
import torch
from transformers import BertTokenizer

# Location of the saved state_dict on Google Drive.
saved_model_path = '/content/drive/MyDrive/bert_classifier.pth'

# Rebuild the architecture; the arguments must match the ones used for training
# so that the saved parameter names and shapes line up.
loaded_model = BERTClassifier(bert_model_name='bert-base-uncased', num_classes=2)

# Restore the weights, mapping every tensor onto the CPU regardless of the
# device it was saved from.
loaded_model.load_state_dict(
    torch.load(saved_model_path, map_location=torch.device('cpu'))
)

# Evaluation mode disables dropout.
loaded_model.eval()

# Tokenizer matching the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode one example sentence, padded/truncated to 128 tokens.
example_text = "This is an example text for sentiment analysis."
encoding = tokenizer(
    example_text,
    return_tensors='pt',
    max_length=128,
    padding='max_length',
    truncation=True,
)
input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


In [None]:
import pandas as pd

# Submissions CSV to be scored by the BERT classifier; the prediction cell
# reads its 'total_text' column.
submissions_csv = '/content/drive/MyDrive/reddit/balanced_submission/no_include_sample_sub_nb_xg.csv'
df = pd.read_csv(submissions_csv)



  df = pd.read_csv('/content/drive/MyDrive/reddit/balanced_submission/no_include_sample_sub_nb_xg.csv')  # Replace with the actual path to your data file


In [None]:
import torch
from transformers import BertTokenizer

# Location of the saved state_dict on Google Drive.
saved_model_path = '/content/drive/MyDrive/bert_classifier.pth'

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Rebuild the architecture (arguments must match training) and restore the
# weights directly onto the chosen device.
loaded_model = BERTClassifier(bert_model_name='bert-base-uncased', num_classes=2).to(device)
loaded_model.load_state_dict(torch.load(saved_model_path, map_location=device))

# Evaluation mode disables dropout.
loaded_model.eval()

# Tokenizer matching the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example text for prediction (kept as a quick tokenizer sanity check).
example_text = "This is an example text for sentiment analysis."
encoding = tokenizer(example_text, return_tensors='pt', max_length=128, padding='max_length', truncation=True).to(device)

# Inference parameters
batch_size = 1000  # Adjust based on your GPU memory
progress_every = 1000  # print a progress line roughly every this many rows
prediction_list = []
length = 0  # number of rows actually processed so far

# Cast to str as the training loader does, so a missing or numeric cell is
# tokenized as text instead of crashing the tokenizer mid-run.
# NOTE(review): training read the 'total_no_drug' column while this cell scores
# 'total_text' — confirm that the difference is intentional.
inference_texts = df['total_text'].astype(str)
total_rows = len(inference_texts)
next_report = progress_every

# Process data in batches
with torch.no_grad():
    for i in range(0, total_rows, batch_size):
        batch_texts = inference_texts.iloc[i:i + batch_size].tolist()

        # Tokenize and move data to the device
        encodings = tokenizer(batch_texts, return_tensors='pt', max_length=128, padding='max_length', truncation=True).to(device)
        input_ids = encodings['input_ids']
        attention_mask = encodings['attention_mask']

        # Model inference; the predicted class is the arg-max logit
        output = loaded_model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted_class = torch.max(output, dim=1)

        # Move predictions back to CPU and collect them
        prediction_list.extend(predicted_class.cpu().tolist())

        # Count the rows really processed: the last batch is usually shorter
        # than batch_size, so adding batch_size would over-report.
        length += len(batch_texts)
        if length >= next_report or length == total_rows:
            print(f"Processed {length} instances")
            next_report = (length // progress_every + 1) * progress_every

# Every row must have received exactly one prediction before it is attached to df.
assert len(prediction_list) == total_rows, "prediction count does not match number of rows"

print("Processing complete.")


Using device: cuda
Processed 1000 instances
Processed 2000 instances
Processed 3000 instances
Processed 4000 instances
Processed 5000 instances
Processed 6000 instances
Processed 7000 instances
Processed 8000 instances
Processed 9000 instances
Processed 10000 instances
Processed 11000 instances
Processed 12000 instances
Processed 13000 instances
Processed 14000 instances
Processed 15000 instances
Processed 16000 instances
Processed 17000 instances
Processed 18000 instances
Processed 19000 instances
Processed 20000 instances
Processed 21000 instances
Processed 22000 instances
Processed 23000 instances
Processed 24000 instances
Processed 25000 instances
Processed 26000 instances
Processed 27000 instances
Processed 28000 instances
Processed 29000 instances
Processed 30000 instances
Processed 31000 instances
Processed 32000 instances
Processed 33000 instances
Processed 34000 instances
Processed 35000 instances
Processed 36000 instances
Processed 37000 instances
Processed 38000 instances
Pr

In [None]:
# Store the BERT predictions as a column aligned row-for-row with df
# (a length mismatch raises ValueError), then display the frame.
df['prediction_list'] = pd.Series(prediction_list, index=df.index)
df


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,author,id,selftext,subreddit,subreddit_id,title,Country,...,total_flags,match,total_text_no_punc,total_text,concat,total_no_drug,year,prediction_nb,xg_boost,prediction_list
0,0,0,1,WonderfulDog69,eiazwx,,portugal,t5_2qmie,"Primeiro vídeo do ano, bora. Um enorme 2020 a ...",Portugal,...,0,0,Primeiro vídeo do ano bora Um enorme 2020 a ...,"Primeiro vídeo do ano, bora. Um enorme 2020 a ...",[],Primeiro vídeo do ano bora Um enorme 2020 a to...,2020,0,0,0
1,1,1,3,meteoritee,eiazxp,Happy New Year 2020!Not only do we celebrate t...,CasualUK,t5_3j2jr,Happy New Year 2020!,UK,...,0,0,Happy New Year 2020 Happy New Year 2020 Not on...,Happy New Year 2020! Happy New Year 2020!Not o...,[],Happy New Year 2020 Happy New Year 2020 Not on...,2020,1,0,1
2,2,2,5,Medical-Outcome,eib00z,,CasualUK,t5_3j2jr,Starting the next decade in a spoons 👍,UK,...,0,0,Starting the next decade in a spoons 👍,Starting the next decade in a spoons 👍,[],Starting the next decade in a spoons 👍,2020,0,0,1
3,3,3,9,Rizlem,eib099,"I hope you have a happy new year, and your nex...",unitedkingdom,t5_2qhqb,HAPPY NEW YEAR!,UK,...,0,0,HAPPY NEW YEAR I hope you have a happy new yea...,HAPPY NEW YEAR! I hope you have a happy new ye...,[],HAPPY NEW YEAR I hope you have a happy new yea...,2020,1,0,0
4,4,4,10,Arculae,eib0d0,,uktrees,t5_2si9d,Happy New Year,uk,...,0,0,Happy New Year,Happy New Year,[],Happy New Year,2020,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1326814,1326814,1326814,46443,Julial12,16wlxmr,,CasualUK,t5_3j2jr,Home alone here 😪,UK,...,0,0,Home alone here 😪,Home alone here 😪,[],Home alone here 😪,2023,1,0,0
1326815,1326815,1326815,46444,Allmychickenbois,16wlxow,"(SFW stuff only, you animals!)You know those n...",CasualUK,t5_3j2jr,What do you do to help yourself fall asleep?,UK,...,0,0,What do you do to help yourself fall asleep SF...,What do you do to help yourself fall asleep? (...,[],What do you do to help yourself fall asleep SF...,2023,1,0,0
1326816,1326816,1326816,46445,Free_Price3574,16wlyso,"Hej!Jag har en situation som precis hände, och...",sweden,t5_2qofe,Granne (Ordförande för BRFen) gick in i min lä...,Sweden,...,0,0,Granne Ordförande för BRFen gick in i min lä...,Granne (Ordförande för BRFen) gick in i min lä...,[],Granne Ordförande för BRFen gick in i min läge...,2023,0,0,0
1326817,1326817,1326817,46447,Remarkable_Put_7952,16wm2d8,,thenetherlands,t5_30hrx,Den Haag (1929),netherlands,...,0,0,Den Haag 1929,Den Haag (1929),[],Den Haag 1929,2023,0,0,0


In [None]:
# Write the frame, including the BERT prediction column, back to Drive
# without the row index.
bert_output_csv = '/content/drive/MyDrive/reddit/balanced_submission/no_include_sample_sub_nb_xg_bert.csv'
df.to_csv(bert_output_csv, index=False)


