In [None]:
from IPython.display import HTML, display
def set_css():
 display(HTML('''
 <style>
 pre {
 white-space: pre-wrap;
 }
 </style>
 '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
!pip install transformers torch nltk



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('dialogs.txt', sep='\t', names=['intent', 'text'])
# https://stackoverflow.com/questions/42320834/sklearn-changing-string-class-label-to-int
# df.intent = pd.factorize(df.intent)[0]
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,intent,text
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [None]:
# Create a dictionary to map intent strings to numeric labels
intent_labels = {intent: i for i, intent in enumerate(df['intent'].unique())}

# Convert the intent column to numeric labels
df['intent'] = df['intent'].map(intent_labels)
print(df.head())

   intent                                      text
0       0             i'm fine. how about yourself?
1       1       i'm pretty good. thanks for asking.
2       2         no problem. so how have you been?
3       3          i've been great. what about you?
4       4  i've been good. i'm in school right now.


In [None]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Create a custom dataset class for our conversational dataset
class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        utterance = self.df.iloc[idx, 1]
        intent = self.df.iloc[idx, 0]

        encoding = self.tokenizer.encode_plus(
            utterance,
            max_length=50,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(intent)
        }

# Create a data loader for our dataset
dataset = IntentDataset(df, tokenizer)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Fine-tune the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)

        print(dir(outputs))
        print(outputs)

        loss = criterion(outputs.last_hidden_state, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')

model.eval()

['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'cross_attentions', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'last_hidden_state', 'move_to_end', 'past_key_values', 'pooler_output', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 9.4836e-02,  9.4297e-02, -2.6712e-01,  ..., -2.1028e-01,
           3.6087e-01,  5.5132e-01],
       

RuntimeError: Expected target size [32, 768], got [32]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class IntentDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx, 0]
        intent = self.data.iloc[idx, 1]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'intent': torch.tensor(intent)
        }

dataset = IntentDataset(data, tokenizer)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
import torch.nn as nn

class IntentClassifier(nn.Module):
    def __init__(self, model):
        super(IntentClassifier, self).__init__()
        self.model = model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.model.config.hidden_size, 8)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        return outputs

model = IntentClassifier(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        intent = batch['intent'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, intent)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')

model.eval()

In [None]:
from transformers import pipeline

ner_model = pipeline('ner', model='bert-base-uncased')

def extract_entities(text):
    outputs = ner_model(text)
    entities = [(entity['word'], entity['score']) for entity in outputs]
    return entities

user_input = 'Book a flight to New York'
entities = extract_entities(user_input)
print(entities)

In [3]:
# ATTEMPT 2
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Load the conversational dataset
dataset = pd.read_csv('dialogs.txt', sep='\t', names=['intent', 'text'])

# Display the dataset
print(dataset.head())

# Convert strings to numerical values using LabelEncoder
le = LabelEncoder()
dataset['intent'] = le.fit_transform(dataset['intent'])

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Fine-tune the model to classify intents
class IntentClassifier(torch.nn.Module):
    def __init__(self):
        super(IntentClassifier, self).__init__()
        self.bert = model
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, len(le.classes_))

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        return outputs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = IntentClassifier()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train the model
for epoch in range(5):
    optimizer.zero_grad()
    input_ids = torch.tensor([tokenizer.encode_plus(text,
                                                      add_special_tokens=True,
                                                      max_length=512,
                                                      return_attention_mask=True,
                                                      return_tensors='pt')['input_ids'][0] for text in dataset['text']])
    attention_mask = torch.tensor([tokenizer.encode_plus(text,
                                                          add_special_tokens=True,
                                                          max_length=512,
                                                          return_attention_mask=True,
                                                          return_tensors='pt')['attention_mask'][0] for text in dataset['text']])
    labels = torch.tensor(dataset['intent'])
    outputs = model(input_ids, attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Load pre-trained NER model
ner_model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased-ner')

# Define a function to extract entities from user input
def extract_entities(text):
    inputs = tokenizer.encode_plus(text,
                                    add_special_tokens=True,
                                    max_length=512,
                                    return_attention_mask=True,
                                    return_tensors='pt')
    outputs = ner_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    entities = []
    for entity in outputs:
        entities.append((entity['word'], entity['score'], entity['entity']))
    return entities

# Define a function to classify intent and extract entities
def chatbot_response(text):
    input_ids = torch.tensor([tokenizer.encode_plus(text,
                                                      add_special_tokens=True,
                                                      max_length=512,
                                                      return_attention_mask=True,
                                                      return_tensors='pt')['input_ids'].flatten()])
    attention_mask = torch.tensor([tokenizer.encode_plus(text,
                                                          add_special_tokens=True,
                                                          max_length=512,
                                                          return_attention_mask=True,
                                                          return_tensors='pt')['attention_mask'].flatten()])
    outputs = model(input_ids, attention_mask)
    intent = le.inverse_transform(torch.argmax(outputs))
    entities = extract_entities(text)
    return intent, entities

# Sample run
user_input = 'I want to book a flight to New York'
intent, entities = chatbot_response(user_input)
print(f'Intent: {intent}')
print(f'Entities: {entities}')

                                intent  \
0               hi, how are you doing?   
1        i'm fine. how about yourself?   
2  i'm pretty good. thanks for asking.   
3    no problem. so how have you been?   
4     i've been great. what about you?   

                                       text  
0             i'm fine. how about yourself?  
1       i'm pretty good. thanks for asking.  
2         no problem. so how have you been?  
3          i've been great. what about you?  
4  i've been good. i'm in school right now.  


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: only integer tensors of a single element can be converted to an index

In [6]:
# ATTEMPT 3
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Create a small dataset of conversational phrases and responses
dataset = {
    'text': ['Hello', 'How are you?', 'What is your name?', 'I want to book a flight'],
    'intent': ['greeting', 'query', 'query', 'booking'],
    'response': ['Hello! How can I assist you?', 'I am doing well, thank you.', 'My name is Chatbot.', 'Which airline would you like to book with?']
}
dataset = pd.DataFrame(dataset)

# Convert strings to numerical values using LabelEncoder
le = LabelEncoder()
dataset['intent'] = le.fit_transform(dataset['intent'])

# Load pre-trained GPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Fine-tune the model to classify intents
class IntentClassifier(torch.nn.Module):
    def __init__(self):
        super(IntentClassifier, self).__init__()
        self.gpt = model
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(self.gpt.config.hidden_size, len(le.classes_))

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        return outputs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = IntentClassifier()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train the model
for epoch in range(5):
    optimizer.zero_grad()
    input_ids = torch.tensor([tokenizer.encode_plus(text,
                                                  add_special_tokens=True,
                                                  max_length=512,
                                                  return_attention_mask=True,
                                                  return_tensors='pt')['input_ids'].flatten().tolist()[0] for text in dataset['text']])
    attention_mask = torch.tensor([tokenizer.encode_plus(text,
                                                        add_special_tokens=True,
                                                        max_length=512,
                                                        return_attention_mask=True,
                                                        return_tensors='pt')['attention_mask'].flatten().tolist()[0] for text in dataset['text']])
    labels = torch.tensor(dataset['intent'])
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)
    outputs = model(input_ids, attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Load pre-trained NER model
ner_model = torch.hub.load('huggingface/transformers', 'model', 'gpt2-ner')

# Define a function to extract entities from user input
def extract_entities(text):
    inputs = tokenizer.encode_plus(text,
                                    add_special_tokens=True,
                                    max_length=512,
                                    return_attention_mask=True,
                                    return_tensors='pt')
    outputs = ner_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    entities = []
    for entity in outputs:
        entities.append((entity['word'], entity['score'], entity['entity']))
    return entities

# Define a function to classify intent and extract entities
def chatbot_response(text):
    input_ids = torch.tensor([tokenizer.encode_plus(text,
                                                      add_special_tokens=True,
                                                      max_length=512,
                                                      return_attention_mask=True,
                                                      return_tensors='pt')['input_ids'][0]])
    attention_mask = torch.tensor([tokenizer.encode_plus(text,
                                                          add_special_tokens=True,
                                                          max_length=512,
                                                          return_attention_mask=True,
                                                          return_tensors='pt')['attention_mask'][0]])
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    outputs = model(input_ids, attention_mask)
    intent = le.inverse_transform(torch.argmax(outputs))
    entities = extract_entities(text)
    return intent, entities

# Sample run
user_input = 'I want to book a flight to New York'
intent, entities = chatbot_response(user_input)
print(f'Intent: {intent}')
print(f'Entities: {entities}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


IndexError: too many indices for tensor of dimension 2

In [7]:
# ATTEMPT 4
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load pre-trained NER model
ner_model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased-ner')

# Define a small dataset of conversational phrases and responses
dataset = [
    {"text": "Hello, how are you?", "intent": "greeting", "entities": []},
    {"text": "What's your name?", "intent": "question", "entities": ["name"]},
    {"text": "I'm feeling sad today.", "intent": "emotion", "entities": ["emotion", "sad"]},
    {"text": "Can you tell me a joke?", "intent": "entertainment", "entities": ["joke"]},
    # Add more examples here...
]

# Convert text data to numerical values using BERT tokenizer
input_ids = []
attention_mask = []
labels = []
entities = []
for example in dataset:
    inputs = tokenizer.encode_plus(
        example["text"],
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(inputs['input_ids'].flatten().tolist())
    attention_mask.append(inputs['attention_mask'].flatten().tolist())
    labels.append(example["intent"])
    entities.append(example["entities"])

# Convert labels to numerical values using LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(labels)

# Create tensors for input IDs, attention masks, and labels
input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)
labels = torch.tensor(labels)

# Define a custom model for intent classification
class IntentClassifier(nn.Module):
    def __init__(self):
        super(IntentClassifier, self).__init__()
        self.bert = model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, len(le.classes_))

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        return outputs

# Initialize the intent classification model and optimizer
model = IntentClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Fine-tune the model on the dataset
for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Define a function to extract entities from user input
def extract_entities(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    outputs = ner_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    entities = []
    for entity in outputs:
        entities.append(entity['word'])
    return entities

# Define a function to classify intent and extract entities from user input
def chatbot_response(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    intent = le.inverse_transform(torch.argmax(outputs))
    entities = extract_entities(text)
    return intent, entities

# Sample run
user_input = "I'm feeling happy today."
intent, entities = chatbot_response(user_input)
print(f"Intent: {intent}, Entities: {entities}")

Downloading: "https://github.com/huggingface/transformers/zipball/main" to /root/.cache/torch/hub/main.zip


RuntimeError: Missing dependencies: sacremoses

In [10]:
# ATTEMPT 5
!pip install fastBPE sacremoses subword_nmt
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

# Load pre-trained GPT model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8)

# Load pre-trained NER model
ner_model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-NER')

# Define a small dataset of conversational phrases and responses
dataset = [
    {"text": "Hello, how are you?", "intent": "greeting", "entities": []},
    {"text": "What's your name?", "intent": "question", "entities": ["name"]},
    {"text": "I'm feeling sad today.", "intent": "emotion", "entities": ["emotion", "sad"]},
    {"text": "Can you tell me a joke?", "intent": "entertainment", "entities": ["joke"]},
    # Add more examples here...
]

# Convert text data to numerical values using tokenizer
input_ids = []
attention_mask = []
labels = []
entities = []
for example in dataset:
    inputs = tokenizer.encode_plus(
        example["text"],
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(inputs['input_ids'].flatten().tolist())
    attention_mask.append(inputs['attention_mask'].flatten().tolist())
    labels.append(example["intent"])
    entities.append(example["entities"])

# Convert labels to numerical values using LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(labels)

# Create tensors for input IDs, attention masks, and labels
input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)
labels = torch.tensor(labels)

# Fine-tune the model on the dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
    loss = criterion(outputs, labels.to(device))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Define a function to extract entities from user input
def extract_entities(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    outputs = ner_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    entities = []
    for entity in outputs:
        entities.append(entity['word'])
    return entities

# Define a function to classify intent and extract entities from user input
def chatbot_response(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    outputs = model(inputs['input_ids'].to(device), attention_mask=inputs['attention_mask'].to(device))
    intent = le.inverse_transform(torch.argmax(outputs))
    entities = extract_entities(text)
    return intent, entities

# Sample run
user_input = "I'm feeling happy today."
intent, entities = chatbot_response(user_input)
print(f"Intent: {intent}, Entities: {entities}")



Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using cache found in /root/.cache/torch/hub/huggingface_transformers_main


OSError: bert-base-ner is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [12]:
# ATTEMPT 6
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)
dir(ner_results)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [14]:
# attempt 7

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import pipeline
from transformers import AutoModelForTokenClassification, AutoTokenizer
import nltk
from nltk.tokenize import word_tokenize

# Load pre-trained NER model
ner_model = pipeline("ner", model="dslim/bert-base-NER")

# Define a small dataset of conversational phrases and responses
dataset = [
    {"text": "Hello, how are you?", "intent": "greeting", "entities": []},
    {"text": "What's your name?", "intent": "question", "entities": ["name"]},
    {"text": "I'm feeling sad today.", "intent": "emotion", "entities": ["emotion", "sad"]},
    {"text": "Can you tell me a joke?", "intent": "entertainment", "entities": ["joke"]},
    # Add more examples here...
]

# Convert text data to numerical values using tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
input_ids = []
attention_mask = []
labels = []
entities = []
for example in dataset:
    inputs = tokenizer.encode_plus(
        example["text"],
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(inputs['input_ids'].flatten().tolist())
    attention_mask.append(inputs['attention_mask'].flatten().tolist())
    labels.append(example["intent"])
    entities.append(example["entities"])

# Create tensors for input IDs, attention masks, and labels
input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)

# Load pre-trained GPT model
model_name = "distilgpt2"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8)

# Fine-tune the model on the dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
    loss = criterion(outputs.logits, torch.tensor([0, 1, 2, 3]).to(device))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Define a function to extract entities from user input
def extract_entities(text):
    return ner_model(text)

# Define a function to classify intent and extract entities from user input
def chatbot_response(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    outputs = model(inputs['input_ids'].to(device), attention_mask=inputs['attention_mask'].to(device))
    intent = torch.argmax(outputs.logits)
    entities = extract_entities(text)
    return intent, entities

# Sample run
user_input = "I'm feeling happy today."
intent, entities = chatbot_response(user_input)
print(f"Intent: {intent}, Entities: {entities}")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a 

ValueError: expected sequence of length 6 at dim 1 (got 5)

In [17]:
# attempt 8
from transformers import pipeline
import random

# Define a small dataset of conversational phrases and responses
dataset = {
    "greeting": {
        "phrases": ["Hello, how are you?", "Hi, what's up?", "Hey, how's it going?"],
        "responses": ["I'm good, thanks!", "I'm doing well, thanks for asking!", "I'm great, thanks!"]
    },
    "question": {
        "phrases": ["What's your name?", "How old are you?", "Where are you from?"],
        "responses": ["My name is Chatbot.", "I'm ageless.", "I'm from the internet."]
    },
    "emotion": {
        "phrases": ["I'm feeling sad today.", "I'm feeling happy today.", "I'm feeling angry today."],
        "responses": ["Sorry to hear that. Would you like to talk about it?", "That's great to hear! What's making you happy?", "I'm here to listen. What's making you angry?"]
    }
}

# Load pre-trained NER model
ner_model = pipeline("ner", aggregation_strategy="simple")

# Define a function to extract entities from user input
def extract_entities(text):
    return ner_model(text)

# Define a function to classify intent and extract entities from user input
def chatbot_response(text):
    entities = extract_entities(text)
    for intent, data in dataset.items():
        for phrase in data["phrases"]:
            if phrase in text:
                return random.choice(data["responses"]), entities
    return "I didn't understand that. Please try again.", entities

# Sample run
user_input = input("Say something: ")
response, entities = chatbot_response(user_input)
print(f"Response: {response}, Entities: {entities}")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Say something: Hi, what's up?
Response: I'm great, thanks!, Entities: []


In [23]:
# attempt 9
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import random

# Define a small conversational dataset
dataset = [
    {"text": "Hello, how are you?", "intent": "greeting"},
    {"text": "What's your name?", "intent": "question"},
    {"text": "I'm feeling sad today.", "intent": "emotion"},
    {"text": "Can you tell me a joke?", "intent": "entertainment"},
    # Add more examples here...
]

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert text data to numerical values using tokenizer
input_ids = []
attention_mask = []
labels = []
for example in dataset:
    inputs = tokenizer.encode_plus(
        example["text"],
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt',
        # truncation=True
    )

# Create tensors for input IDs, attention masks, and labels
print(input_ids)
input_ids.append(inputs['input_ids'].flatten().tolist())
attention_mask.append(inputs['attention_mask'].flatten().tolist())
labels.append(dataset.index(example))

# Fine-tune the model on the dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
    loss = criterion(outputs.logits, labels.to(device))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Load pre-trained NER model
ner_model = pipeline("ner", aggregation_strategy="simple")

# Define a function to extract entities from user input
def extract_entities(text):
    return ner_model(text)

# Define a function to classify intent and extract entities from user input
def chatbot_response(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    outputs = model(inputs['input_ids'].to(device), attention_mask=inputs['attention_mask'].to(device))
    intent = torch.argmax(outputs.logits)
    entities = extract_entities(text)
    return intent, entities

# Sample run
user_input = "I'm feeling happy today."
intent, entities = chatbot_response(user_input)
print(f"Intent: {intent}, Entities: {entities}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[]


AttributeError: 'list' object has no attribute 'to'

In [24]:
# ATTEMPT 10
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random

# Define a small conversational dataset
dataset = [
    {"text": "Hello, how are you?", "intent": "greeting"},
    {"text": "What's your name?", "intent": "question"},
    {"text": "I'm feeling sad today.", "intent": "emotion"},
    {"text": "Can you tell me a joke?", "intent": "entertainment"},
    {"text": "Goodbye, see you later!", "intent": "greeting"},
    {"text": "How old are you?", "intent": "question"},
    {"text": "I'm feeling happy today.", "intent": "emotion"},
    {"text": "Can you recommend a movie?", "intent": "entertainment"},
]

# Define a dictionary that maps intent names to class labels
intent_to_label = {
    "greeting": 0,
    "question": 1,
    "emotion": 2,
    "entertainment": 3
}

# Create the labels tensor
labels = []
for example in dataset:
    labels.append(intent_to_label[example["intent"]])

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split([example["text"] for example in dataset], labels, test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert text data to numerical values using tokenizer
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Create tensors for input IDs, attention masks, and labels
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

val_input_ids = torch.tensor(val_encodings['input_ids'])
val_attention_mask = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(val_labels)

# Define a custom dataset class for our data
class ConversationalDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

# Create dataset instances for training and validation
train_dataset = ConversationalDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = ConversationalDataset(val_input_ids, val_attention_mask, val_labels)

# Create data loaders for training and validation
batch_size = 16
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Train the model
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    model.eval()
    with torch.no_grad():
        total_correct = 0
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, dim=1)
            total_correct += (predicted == labels).sum().item()

        accuracy = accuracy_score(val_labels.cpu().numpy(), predicted.cpu().numpy())
        print(f"Epoch {epoch+1}, Val Accuracy: {accuracy:.4f}")

# Use a pre-trained NER model for entity extraction
ner_model = pipeline("ner", model="bert-base-uncased")

def extract_entities(text):
    entities = ner_model(text)
    entity_list = [(entity["word"], entity["score"], entity["entity"]) for entity in entities]
    return entity_list

# Test the intent recognition model
def recognize_intent(text):
    encoding = tokenizer.encode_plus(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].flatten()
    attention_mask = encoding["attention_mask"].flatten()

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    outputs = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
    _, predicted = torch.max(outputs.logits, dim=1)
    intent = list(intent_to_label.keys())[list(intent_to_label.values()).index(predicted.item())]
    return intent

# Test the entity extraction model
text = "I want to book a flight from New York to Los Angeles."
entities = extract_entities(text)
print("Entities:", entities)

# Test the intent recognition model
intent = recognize_intent(text)
print("Intent:", intent)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 1.485089898109436
Epoch 1, Val Accuracy: 0.0000
Epoch 2, Loss: 1.3906131982803345
Epoch 2, Val Accuracy: 0.0000
Epoch 3, Loss: 1.3574992418289185
Epoch 3, Val Accuracy: 0.0000
Epoch 4, Loss: 1.2784881591796875
Epoch 4, Val Accuracy: 0.0000
Epoch 5, Loss: 1.2487515211105347
Epoch 5, Val Accuracy: 0.0000


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entities: [('i', 0.52504736, 'LABEL_1'), ('want', 0.5932004, 'LABEL_1'), ('to', 0.60439324, 'LABEL_0'), ('book', 0.53773963, 'LABEL_0'), ('a', 0.51646715, 'LABEL_0'), ('flight', 0.6879823, 'LABEL_0'), ('from', 0.689661, 'LABEL_0'), ('new', 0.7227008, 'LABEL_0'), ('york', 0.6387848, 'LABEL_0'), ('to', 0.65641654, 'LABEL_0'), ('los', 0.71226305, 'LABEL_0'), ('angeles', 0.72163844, 'LABEL_0'), ('.', 0.592537, 'LABEL_0')]
Intent: greeting
