In [1]:
import pandas as pd

df = pd.read_csv('/content/combined_sentiment.csv')
df

Unnamed: 0,Text,Sentiment
0,I need a loan to start a food delivery busines...,Positive
1,Seeking funds to purchase a new gaming console...,Neutral
2,Want to borrow money to cover gambling debts a...,Negative
3,Looking for financial assistance to pay for my...,Positive
4,Need a loan to renovate my bathroom and kitche...,Positive
...,...,...
28010,wish we could come see u on Denver husband l...,Negative
28011,I`ve wondered about rake to. The client has ...,Negative
28012,Yay good for both of you. Enjoy the break - y...,Positive
28013,But it was worth it ****.,Positive


In [2]:
nan_values = df.isna().sum()
print(nan_values)


Text         1
Sentiment    0
dtype: int64


In [3]:
df = df.dropna()
nan_values = df.isna().sum()
print(nan_values)

Text         0
Sentiment    0
dtype: int64


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df['Processed_Text'] = df['Text'].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_Text'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("Processed Texts:\n", df[['Text', 'Processed_Text']].head())
print("\nTF-IDF Matrix:\n", tfidf_df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Processed_Text'] = df['Text'].apply(preprocess_text)


Processed Texts:
                                                 Text  \
0  I need a loan to start a food delivery busines...   
1  Seeking funds to purchase a new gaming console...   
2  Want to borrow money to cover gambling debts a...   
3  Looking for financial assistance to pay for my...   
4  Need a loan to renovate my bathroom and kitche...   

                                      Processed_Text  
0  need loan start food delivery business serving...  
1  seeking fund purchase new gaming console acces...  
2  want borrow money cover gambling debt try luck...  
3  looking financial assistance pay child emergen...  
4  need loan renovate bathroom kitchen since they...  

TF-IDF Matrix:
     00  000  0003   01  024   03   04  0405  0430   05  ...  ï½stupidityï½  \
0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0  ...            0.0   
1  0.0  0.0   0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0  ...            0.0   
2  0.0  0.0   0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0  ...     

In [5]:
df

Unnamed: 0,Text,Sentiment,Processed_Text
0,I need a loan to start a food delivery busines...,Positive,need loan start food delivery business serving...
1,Seeking funds to purchase a new gaming console...,Neutral,seeking fund purchase new gaming console acces...
2,Want to borrow money to cover gambling debts a...,Negative,want borrow money cover gambling debt try luck...
3,Looking for financial assistance to pay for my...,Positive,looking financial assistance pay child emergen...
4,Need a loan to renovate my bathroom and kitche...,Positive,need loan renovate bathroom kitchen since they...
...,...,...,...
28010,wish we could come see u on Denver husband l...,Negative,wish could come see u denver husband lost job ...
28011,I`ve wondered about rake to. The client has ...,Negative,ive wondered rake client made clear net dont f...
28012,Yay good for both of you. Enjoy the break - y...,Positive,yay good enjoy break probably need hectic week...
28013,But it was worth it ****.,Positive,worth


In [6]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score
import torch.nn as nn
from torch.optim import AdamW

text_column = "Processed_Text"
label_column = "Sentiment"

In [7]:
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, text_column, label_column, max_length=128):
        # Initialize data
        self.texts = df[text_column].tolist()  # List of processed strings
        self.labels = df[label_column].map({"Positive": 0, "Negative": 1, "Neutral": 2}).values

        # Convert labels to PyTorch tensors
        self.labels = torch.tensor(self.labels, dtype=torch.long)

        # Initialize tokenizer and other settings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]

        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0).long()
        attention_mask = encoding['attention_mask'].squeeze(0).long()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': self.labels[idx]
        }

In [8]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score
import torch.nn as nn
from torch.optim import AdamW

In [9]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Split dataset into training and validation sets
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train_dataset = SentimentDataset(train_df, tokenizer, text_column, label_column)
val_dataset = SentimentDataset(val_df, tokenizer, text_column, label_column)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [12]:
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training and validation loop with early stopping
epochs = 3
patience = 2
min_delta = 0.001

train_losses, val_losses = [], []
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

In [13]:
# for epoch in range(epochs):
#     if early_stop:
#         print(f"Early stopping triggered at epoch {epoch}")
#         break

#     # Training
#     model.train()
#     total_train_loss = 0
#     train_true, train_pred = [], []

#     for batch in train_loader:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device)

#         optimizer.zero_grad()

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         total_train_loss += loss.item()
#         train_true.extend(labels.cpu().numpy())
#         train_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

#     # Training metrics
#     train_loss = total_train_loss / len(train_loader)
#     train_precision = precision_score(train_true, train_pred, average='weighted')
#     train_recall = recall_score(train_true, train_pred, average='weighted')
#     train_f1 = f1_score(train_true, train_pred, average='weighted')

#     # Validation
#     model.eval()
#     total_val_loss = 0
#     val_true, val_pred = [], []

#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['label'].to(device)

#             outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             total_val_loss += loss.item()

#             val_true.extend(labels.cpu().numpy())
#             val_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

#     # Validation metrics
#     val_loss = total_val_loss / len(val_loader)
#     val_precision = precision_score(val_true, val_pred, average='weighted')
#     val_recall = recall_score(val_true, val_pred, average='weighted')
#     val_f1 = f1_score(val_true, val_pred, average='weighted')

#     # Early stopping check
#     if val_loss < best_val_loss - min_delta:
#         best_val_loss = val_loss
#         epochs_no_improve = 0
#         # torch.save(model.state_dict(), 'best_sentiment_model.pth')
#     else:
#         epochs_no_improve += 1
#         if epochs_no_improve >= patience:
#             early_stop = True
#             print(f"Early stopping after {epoch + 1} epochs")

#     print(f"Epoch {epoch+1}/{epochs}")
#     print(f"Train Loss: {train_loss:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}")
#     print(f"Val Loss: {val_loss:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

In [14]:
df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,11256
Positive,8892
Negative,7866


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

for epoch in range(epochs):
    if early_stop:
        print(f"Early stopping triggered at epoch {epoch}")
        break

    # Training
    model.train()
    total_train_loss = 0
    train_true, train_pred = [], []

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        train_true.extend(labels.cpu().numpy())
        train_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

    # Training metrics
    train_loss = total_train_loss / len(train_loader)
    train_precision = precision_score(train_true, train_pred, average='weighted')
    train_recall = recall_score(train_true, train_pred, average='weighted')
    train_f1 = f1_score(train_true, train_pred, average='weighted')
    train_accuracy = accuracy_score(train_true, train_pred)  # Calculate accuracy

    # Validation
    model.eval()
    total_val_loss = 0
    val_true, val_pred = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            val_true.extend(labels.cpu().numpy())
            val_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

    # Validation metrics
    val_loss = total_val_loss / len(val_loader)
    val_precision = precision_score(val_true, val_pred, average='weighted')
    val_recall = recall_score(val_true, val_pred, average='weighted')
    val_f1 = f1_score(val_true, val_pred, average='weighted')
    val_accuracy = accuracy_score(val_true, val_pred)  # Calculate accuracy

    # Early stopping check
    if val_loss < best_val_loss - min_delta:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # torch.save(model.state_dict(), 'best_sentiment_model.pth')
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            early_stop = True
            print(f"Early stopping after {epoch + 1} epochs")

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}, Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}, Accuracy: {val_accuracy:.4f}")


Epoch 1/3
Train Loss: 0.6908, Precision: 0.7136, Recall: 0.7092, F1: 0.7095, Accuracy: 0.7092
Val Loss: 0.6169, Precision: 0.7619, Recall: 0.7505, F1: 0.7494, Accuracy: 0.7505
Epoch 2/3
Train Loss: 0.5475, Precision: 0.7858, Recall: 0.7840, F1: 0.7841, Accuracy: 0.7840
Val Loss: 0.6106, Precision: 0.7656, Recall: 0.7623, F1: 0.7625, Accuracy: 0.7623
Epoch 3/3
Train Loss: 0.4450, Precision: 0.8316, Recall: 0.8304, F1: 0.8304, Accuracy: 0.8304
Val Loss: 0.6278, Precision: 0.7659, Recall: 0.7626, F1: 0.7633, Accuracy: 0.7626


In [16]:
torch.save(model.state_dict(), 'best_sentiment_model.pth')

In [19]:
df['Text'].iloc[0], df['Text'].iloc[2], df['Text'].iloc[1]

('I need a loan to start a food delivery business serving healthy meals to busy professionals.',
 'Want to borrow money to cover gambling debts and try my luck again at the casino.',
 'Seeking funds to purchase a new gaming console and accessories for entertainment purposes.')

In [21]:
from transformers import AutoTokenizer
import torch

# Define a function to preprocess and predict new text
def predict_sentiment(new_texts, model, tokenizer, device):
    model.eval()  # Set the model to evaluation mode

    # Tokenize the input text
    inputs = tokenizer(new_texts, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)

    # Perform the prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Get the logits (output before softmax)
        predictions = torch.argmax(logits, dim=1)  # Get the class with the highest score

    return predictions

new_texts = ["I need a loan to buy a house.", "Seeking funds to purchase a new gaming console and accessories for entertainment purposes.", "Want to borrow money to cover gambling debts and try my luck again at the casino."]

predictions = predict_sentiment(new_texts, model, tokenizer, device)

sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
predicted_labels = [sentiment_map[label.item()] for label in predictions]

for text, label in zip(new_texts, predicted_labels):
    print(f"Text: {text} -> Predicted Sentiment: {label}")


Text: I need a loan to buy a house. -> Predicted Sentiment: Positive
Text: Seeking funds to purchase a new gaming console and accessories for entertainment purposes. -> Predicted Sentiment: Negative
Text: Want to borrow money to cover gambling debts and try my luck again at the casino. -> Predicted Sentiment: Positive
