# Interactive Chat Bot

## Objective
Build and fine-tune a system that leverages pre-trained transformer models to address real-world tasks. This system will help a student seeking an internship by performing various tasks involving natural language understanding and generation.

## Specifications
1. Interactive Command Line Interface
    * Create a text-based program that interacts with users through open-ended plain-text prompts.
2. Sentence Auto-Completion
    * Implement functionality that predicts and completes common sentences based on context. Use your transformer model to generate plausible continuations for user-provided text.
3. Cover Letter Analysis
    * Design a feature where the system reads a provided cover letter and extracts meaningful information about the applicant.
4. Dynamic Cover Letter Generation
    * Enable the system to generate a personalized cover letter based on user-specified constraints, such as desired role, key skills, or industry. Combine a flexible template with dynamic content generation to meet specific user needs.

In [4]:
from datasets import load_dataset
import pandas as pd

# Single source of truth for the training CSV: the label encoder is fitted on
# ``menu_df`` and later applied to rows of ``dataset``, so both must be read
# from the same file.
DATA_PATH = "input.csv"

# pandas copy of the data, used below to fit the label encoder.
menu_df = pd.read_csv(DATA_PATH)
# Hugging Face view of the same CSV; split='train' yields the Dataset itself.
dataset = load_dataset("csv", data_files={"train": DATA_PATH}, split="train")

In [5]:
# Show the loaded dataset's summary (column names and row count).
dataset

Dataset({
    features: ['Text', 'Label'],
    num_rows: 303
})

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``Text`` column of a batch of examples.

    The texts are passed to the tokenizer with ``padding="max_length"`` and
    ``truncation=True``; the tokenizer's output is returned unchanged.
    """
    texts = examples["Text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# batched=True hands tokenize_function many rows per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [7]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encoder for the ``Label`` column; ``fit`` returns the fitted encoder,
# so construction and fitting are chained.
enc = LabelBinarizer().fit(menu_df["Label"])

In [8]:
# Preview the one-hot encoding of the first ten labels.
enc.transform(menu_df['Label'])[:10]

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]])

In [10]:
import torch
def add_labels(example):
    """Attach a float one-hot ``labels`` vector built from the row's ``Label``."""
    # transform expects a sequence, so wrap the single label and unwrap row 0.
    one_hot_row = enc.transform([example['Label']])[0]
    # Multiplying by 1.0 turns the integer indicator row into floats.
    example['labels'] = torch.tensor(one_hot_row * 1.0)
    return example


# Applied row by row (no batched=True), adding the new ``labels`` column.
tokenized_datasets = tokenized_datasets.map(add_labels)

Map:   0%|          | 0/303 [00:00<?, ? examples/s]

In [11]:
# Confirm the tokenizer outputs and the new ``labels`` column are present.
tokenized_datasets

Dataset({
    features: ['Text', 'Label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 303
})

In [12]:
# Drop both raw columns in a single call. ``Label`` is removed rather than
# renamed because the one-hot ``labels`` column has already been added.
tokenized_datasets = tokenized_datasets.remove_columns(["Text", "Label"])
# Have the dataset hand back torch tensors when indexed.
tokenized_datasets.set_format("torch")

In [13]:
# Confirm that only the tokenizer outputs and ``labels`` remain.
tokenized_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 303
})

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"
model_ckpt = model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output per menu intent:
# 1. Example: Help me write a cover letter
# 2. Example: Help me read a cover letter
# 3. Example: Exit
num_labels = 3

# Load the classifier exactly once, directly with the 3-way head, so the
# 440 MB checkpoint is not read twice.
# problem_type="multi_label_classification" matters because the dataset's
# ``labels`` are float one-hot vectors rather than integer class ids.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

# Smoke test on one sentence. The classification head is newly initialised,
# so the prediction itself is meaningless until the model has been trained.
text = "This is a great example."
inputs = tokenizer(text, return_tensors="pt")

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# logits has shape (1, num_labels); argmax over dim 1 picks the top class.
logits = outputs.logits
predictions = torch.argmax(logits, dim=1)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Confirm the classifier emits one row of num_labels scores for the one input.
logits.shape

torch.Size([1, 3])

In [16]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [17]:
from torch.utils.data import TensorDataset, DataLoader, Dataset

In [18]:
# Preview of how batches will be drawn during training.

# Batches of 5 examples, reshuffled on every pass over the data.
train_dataloader = DataLoader(tokenized_datasets, batch_size=5, shuffle=True)

# Pull a single batch for inspection. next(iter(...)) fails immediately with
# StopIteration on an empty dataset instead of silently leaving `batch` unset.
batch = next(iter(train_dataloader))

In [19]:
# Inspect the sampled batch: a dict of stacked tensors keyed by column name.
batch

{'input_ids': tensor([[ 101, 9689, 3844,  ...,    0,    0,    0],
         [ 101, 2071, 2017,  ...,    0,    0,    0],
         [ 101, 2064, 2017,  ...,    0,    0,    0],
         [ 101, 2292, 1521,  ...,    0,    0,    0],
         [ 101, 2064, 2017,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[1., 0., 0.],
         [0., 0., 1.],
         [0., 0., 1.],
         [0., 0., 1.],
         [0., 1., 0.]])}

In [20]:
from torch.optim import AdamW

# AdamW over every model parameter, with a 5e-5 learning rate.
optimizer = AdamW(
    params=model.parameters(),
    lr=5e-5,
)

In [21]:
from transformers import get_scheduler

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" learning-rate schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [22]:
import torch

# Pick the GPU when PyTorch can see one, otherwise the CPU, and move the
# model there; the training loop moves each batch to the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [23]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Switch the model to training mode before fine-tuning.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch carries a 'labels' entry, so the output exposes a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/183 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Classify one sample prompt with the fine-tuned model.
text = "Stop this darn program."
inputs = tokenizer(text, return_tensors="pt").to(device)

# The training cell left the model in train mode; switch to eval so dropout is
# disabled and the same input always produces the same scores.
model.eval()

# Get model output (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**inputs)

# Process output: index of the highest-scoring label
logits = outputs.logits
predictions = torch.argmax(logits, dim=1)

In [None]:
# Show the raw scores, the predicted class index, and the original Label value
# that the fitted encoder maps those scores back to.
(
    logits,
    predictions,
    enc.inverse_transform(logits.detach().cpu().numpy()),
)