In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used below: the Punkt models back word_tokenize()
# and the averaged-perceptron models back pos_tag().
# NLTK 3.8.2+ looks these resources up under new identifiers ('punkt_tab',
# 'averaged_perceptron_tagger_eng'), while older releases use the original
# names, so both generations are requested. With its default arguments
# nltk.download() reports a failed or unknown id and returns False rather than
# raising, so the extra requests are harmless.
for _resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

# Hand-written training samples for the classifier.
# Each entry is a (sentence, label) tuple; the labels used in this list are
# "Happy", "Anxious", "Stressed" and "Calm". The list is unzipped into parallel
# texts/labels sequences further down in this file.
data = [
    ("I'm feeling great!", "Happy"),
    ("I'm so anxious about the exam.", "Anxious"),
    ("I'm feeling really stressed out at work.", "Stressed"),
    ("I'm calm and relaxed today.", "Calm"),
    ("Life is beautiful and I'm very happy!", "Happy"),
    ("I have a lot on my mind and it's making me anxious.", "Anxious"),
    ("Work is piling up, and I'm feeling overwhelmed.", "Stressed"),
    ("Everything is okay, and I feel calm.", "Calm"),
    ("I am very excited about the upcoming event!", "Happy"),
    ("This is the worst day ever.", "Stressed"),
    ("I don't know what to do anymore.", "Anxious"),
    ("I'm at peace with myself today.", "Calm"),
    # NOTE(review): sadness is filed under "Anxious" because there is no
    # dedicated label for it -- confirm this is the intended mapping.
    ("I feel sad and depressed.", "Anxious"),
    ("I am so happy to see my friends!", "Happy"),
    ("I can't take the stress of this situation.", "Stressed"),
    ("The world feels heavy on my shoulders.", "Stressed"),
    ("I'm just enjoying a peaceful moment.", "Calm"),
    # Add more sample data as needed
]

# Dataset wrapper pairing each raw sentence with its label
class TextDataset(Dataset):
    """Map-style dataset that yields ``(text, label)`` pairs.

    Args:
        texts: Indexable, sized collection of input sentences.
        labels: Indexable, sized collection of labels, aligned with ``texts``
            position by position.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        # Fail at construction time: a mismatch would otherwise surface as an
        # IndexError in the middle of an epoch (labels too short) or silently
        # ignore the surplus labels (labels too long).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Select the compute device: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and encoder, then move the encoder onto the
# selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

# Pick an n-gram order for a text from how many verbs and nouns it contains
def choose_ngram(text):
    """Return the n-gram order (1, 2 or 3) to use for ``text``.

    The text is tokenized and POS-tagged, and every token whose tag starts
    with 'VB' or 'NN' is counted. Up to 3 such tokens gives 1, 4 to 6 gives 2,
    and anything above 6 gives 3.
    """
    tagged_tokens = pos_tag(word_tokenize(text))

    # A tag cannot start with both prefixes, so one pass over the tags yields
    # the same total as counting verbs and nouns separately.
    content_words = 0
    for _token, tag in tagged_tokens:
        if tag.startswith(('VB', 'NN')):
            content_words += 1

    if content_words > 6:  # Complex query
        return 3
    if content_words > 3:  # Moderately complex
        return 2
    return 1  # Simple query

class AdaptiveNGramVectorizer:
    """Bag-of-n-grams encoder whose n-gram order adapts to each text.

    ``fit`` trains three ``CountVectorizer`` instances covering the n-gram
    ranges (1, 1), (1, 2) and (1, 3). ``transform`` asks ``choose_ngram`` which
    one to use for each text and writes that vectorizer's counts into a
    fixed-width row of ``feature_size`` columns.

    Every vectorizer owns its own contiguous, non-overlapping slice of the row
    (in the order n=1, n=2, n=3). This guarantees that a given column always
    denotes the same n-gram regardless of which vectorizer produced the row;
    writing all of them from column 0 would make one column mean different
    n-grams for different samples.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, max_features=1000):
        # Vocabulary cap handed to every underlying CountVectorizer.
        self.max_features = max_features
        # Maps n (1..3) to the CountVectorizer fitted with ngram_range=(1, n).
        self.vectorizers = {}
        # Total row width: sum of the three vocabulary sizes. None until fit().
        self.feature_size = None
        # Maps n to the first output column of that vectorizer's slice.
        self._offsets = {}

    def fit(self, X, y=None):
        """Fit the three vectorizers on ``X`` and return ``self``.

        Args:
            X: Iterable of text documents.
            y: Ignored; accepted for scikit-learn style call compatibility.
        """
        # Materialize once: X is consumed three times below, which would
        # starve the 2nd and 3rd fit if X were a one-shot iterator.
        documents = list(X)

        offset = 0
        for n in range(1, 4):
            vectorizer = CountVectorizer(ngram_range=(1, n), max_features=self.max_features)
            vectorizer.fit(documents)
            self.vectorizers[n] = vectorizer
            self._offsets[n] = offset
            offset += len(vectorizer.get_feature_names_out())

        self.feature_size = offset
        return self

    def transform(self, X):
        """Encode ``X`` as an array of shape ``(len(X), feature_size)``.

        Each row is zero everywhere except in the slice belonging to the
        vectorizer selected by ``choose_ngram`` for that text.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.vectorizers or self.feature_size is None:
            raise RuntimeError(
                "AdaptiveNGramVectorizer must be fitted before calling transform()"
            )

        documents = list(X)
        # Pre-allocating keeps the result 2-D even when there are no documents.
        result = np.zeros((len(documents), self.feature_size), dtype=np.int64)
        for row, text in enumerate(documents):
            n = choose_ngram(text)
            features = self.vectorizers[n].transform([text]).toarray()[0]
            start = self._offsets[n]
            result[row, start:start + len(features)] = features
        return result

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoding (see ``fit`` and ``transform``)."""
        documents = list(X)
        self.fit(documents)
        return self.transform(documents)

# Prepare the dataset: split the (text, label) pairs into parallel sequences
# and fit the n-gram encoder on all sentences.
texts, labels = zip(*data)
vectorizer = AdaptiveNGramVectorizer()
ngram_features = vectorizer.fit_transform(texts)

# Convert labels to numeric format.
# The distinct labels are sorted before numbering: iterating a bare set of
# strings follows hash order, which Python randomizes per process, so the
# label -> index mapping would otherwise change from one run to the next.
label_to_index = {label: index for index, label in enumerate(sorted(set(labels)))}
index_to_label = {index: label for label, index in label_to_index.items()}
numeric_labels = [label_to_index[label] for label in labels]

# Create a DataLoader for training (shuffled mini-batches of two samples)
dataset = TextDataset(texts, numeric_labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

class BERTWithNGramsClassifier(torch.nn.Module):
    """Linear classifier over an encoder's pooled output joined with n-gram features.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an
            object with a ``pooler_output`` attribute when called with
            ``input_ids`` and ``attention_mask`` keywords.
        num_labels: Number of output classes.
        ngram_size: Width of the n-gram feature vector appended to the pooled
            encoder output.
    """

    def __init__(self, bert_model, num_labels, ngram_size):
        super().__init__()
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(p=0.1)
        # The linear head sees the pooled encoder vector followed by the
        # n-gram counts.
        fused_width = bert_model.config.hidden_size + ngram_size
        self.classifier = torch.nn.Linear(fused_width, num_labels)

    def forward(self, input_ids, attention_mask, ngrams):
        """Return the class logits for a batch.

        Dropout is applied to the pooled encoder output only; ``ngrams`` is
        concatenated along dim 1 afterwards and fed to the linear head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_vector = self.dropout(encoded.pooler_output)
        fused = torch.cat([sentence_vector, ngrams], dim=1)
        return self.classifier(fused)

# Build the classifier on top of the pretrained encoder and move it to the
# selected device.
model = BERTWithNGramsClassifier(bert_model, len(label_to_index), vectorizer.feature_size).to(device)

# Optimizer and loss used for training
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Run 5 passes over the training data
for epoch in range(5):
    model.train()
    for batch_texts, batch_labels in dataloader:
        # The default collate function hands strings over as a sequence;
        # normalise it to a list, which is what the tokenizer expects for a
        # batch of sentences.
        batch_texts = list(batch_texts)

        optimizer.zero_grad()
        encoding = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=128)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        ngram_tensor = torch.tensor(vectorizer.transform(batch_texts), dtype=torch.float).to(device)
        outputs = model(input_ids, attention_mask, ngram_tensor)
        # batch_labels is already a tensor (collated from the integer labels);
        # wrapping it in torch.tensor() again copy-constructs it and triggers a
        # UserWarning, so it is only moved to the device.
        loss = criterion(outputs, batch_labels.to(device))
        loss.backward()
        optimizer.step()

# Function to classify user input
def classify_query(query):
    """Return the predicted label name for a single sentence.

    Puts the module-level ``model`` into eval mode, encodes ``query`` with the
    module-level ``tokenizer`` and ``vectorizer``, and maps the highest-scoring
    logit back to its label through ``index_to_label``.

    Args:
        query: The sentence to classify.

    Returns:
        The label string associated with the arg-max class index.
    """
    model.eval()

    ngram_tensor = torch.tensor(vectorizer.transform([query]), dtype=torch.float).to(device)

    # Same tokenizer call as the training loop. padding=True pads only up to
    # the longest sequence in the batch (here: the query itself) instead of
    # always running the encoder over 128 positions, and calling the tokenizer
    # directly replaces the deprecated encode_plus().
    encoding = tokenizer(
        [query],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask, ngrams=ngram_tensor)

    predicted_index = int(torch.argmax(logits, dim=1).item())
    return index_to_label[predicted_index]

# User input for classification: keep prompting until the user types 'exit'
# or the input stream ends.
while True:
    try:
        user_input = input("Enter a sentence to classify (or 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted: stop cleanly instead
        # of ending the session with a traceback.
        print()
        break

    # Ignore surrounding whitespace so that e.g. " exit " still quits.
    user_input = user_input.strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing to classify; ask again rather than labelling an empty string.
        continue

    classification = classify_query(user_input)
    print(f"Classification: {classification}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  loss = criterion(outputs, torch.tensor(batch_labels).to(device))


Enter a sentence to classify (or 'exit' to quit): I feel sad and depressed
Classification: Anxious

Enter a sentence to classify (or 'exit' to quit): I'm feeling really stressed out at work
Classification: Stressed

Enter a sentence to classify (or 'exit' to quit): Work is piling up, and I'm feeling overwhelmed
Classification: Stressed

Enter a sentence to classify (or 'exit' to quit): i m so nervous that i have give presentation next
Classification: Anxious

Enter a sentence to classify (or 'exit' to quit): i feel like crying
Classification: Stressed

Enter a sentence to classify (or 'exit' to quit): the day is really going good
Classification: Happy

Enter a sentence to classify (or 'exit' to quit): i m so composed
Classification: Stressed

Enter a sentence to classify (or 'exit' to quit): I'm just enjoying a peaceful moment.
Classification: Calm

Enter a sentence to classify (or 'exit' to quit): I have a lot on my mind
Classification: Stressed

Enter a sentence to classify (or 'exit