In [1]:
import data_loading_code as loader

#### **Load data loader**

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        new_row = pd.DataFrame([{
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }])
        df_ = pd.concat([df_, new_row], ignore_index=True)
    return data

# If this is the primary file that is executed (ie not an import of another file)

# get data, pre-process and split
data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()
train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()


#### **Create chatbot**

In [3]:
#Here, a very simple text classifier is created.

class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)  
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim) 

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [4]:
def train_model(model, train_loader, validation_loader, num_epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()  
        running_loss = 0.0
        for texts, labels in train_loader:
            optimizer.zero_grad()   
            outputs = model(texts)  
            loss = criterion(outputs, labels)  
            loss.backward()         
            optimizer.step()        

            running_loss += loss.item()

        model.eval()  
        validation_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        with torch.no_grad():
            for texts, labels in validation_loader:
                outputs = model(texts)
                loss = criterion(outputs, labels)
                validation_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader)}, Validation Loss: {validation_loss/len(validation_loader)}, Accuracy: {correct_predictions/total_predictions:.2f}')

    print('Finished Training')


batch_size = 64
input_dim = vocab_size  # From how many input dimensions we have
hidden_dim = 100        # 100 hidden neurons
output_dim = 2          # Binary classification. Either positive or negative response

model = TextClassifier(input_dim, hidden_dim, output_dim)
#Cross entropy loss for classification problem
criterion = nn.CrossEntropyLoss()

#Learning rate of 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

validation_dataset = TensorDataset(validation_x_tensor, validation_y_tensor)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

#Train model
train_model(model, train_loader, validation_loader, num_epochs=10)

#Save model
torch.save(model.state_dict(), 'text_classifier_model.pth')


Epoch 1/10, Training Loss: 0.6911979556083679, Validation Loss: 0.6835850775241852, Accuracy: 0.47
Epoch 2/10, Training Loss: 0.6496347268422444, Validation Loss: 0.6446263492107391, Accuracy: 0.81
Epoch 3/10, Training Loss: 0.5676612734794617, Validation Loss: 0.5835311114788055, Accuracy: 0.84
Epoch 4/10, Training Loss: 0.4428472578525543, Validation Loss: 0.5153145492076874, Accuracy: 0.85
Epoch 5/10, Training Loss: 0.31120123465855914, Validation Loss: 0.45443807542324066, Accuracy: 0.85
Epoch 6/10, Training Loss: 0.20794046719868978, Validation Loss: 0.40954481065273285, Accuracy: 0.85
Epoch 7/10, Training Loss: 0.13493663867314656, Validation Loss: 0.38315702974796295, Accuracy: 0.84
Epoch 8/10, Training Loss: 0.08983398750424385, Validation Loss: 0.3627917468547821, Accuracy: 0.86
Epoch 9/10, Training Loss: 0.06355140333374341, Validation Loss: 0.3502170890569687, Accuracy: 0.86
Epoch 10/10, Training Loss: 0.04759390217562517, Validation Loss: 0.34327663481235504, Accuracy: 0.85

In [7]:
def prepare_input(text, word_vectorizer):
    vectorized_text = word_vectorizer.transform([text]).todense()  # Vectorize the text
    input_tensor = torch.from_numpy(np.array(vectorized_text)).type(torch.FloatTensor)
    return input_tensor

def generate_response(class_id):
    responses = {
        0: "Negative response.",
        1: "Positive response."
    }
    return responses.get(class_id, "Unknown class response.")

def predict_and_respond(input_text, model, word_vectorizer):
    input_tensor = prepare_input(input_text, word_vectorizer)
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted_class = torch.max(output, 1)
        return generate_response(predicted_class.item())

def main_loop(model, word_vectorizer):
    model.eval()  # Set the model to evaluation mode
    print("Type 'quit' to exit the program.")
    while True:
        input_text = input("You: ")
        if input_text.lower() == 'quit':
            print("Exiting the program.")
            break
        response = predict_and_respond(input_text, model, word_vectorizer)
        print(f'User: {input_text}')
        print("Bot:", response)

#Load previously trained model
model.load_state_dict(torch.load('text_classifier_model.pth'))

#Create a loop which takes inputs, and generates output
main_loop(model, word_vectorizer)


Type 'quit' to exit the program.
User: heya
Bot: Negative response.
User: I think the product did not produce the expected outcome 
Bot: Negative response.
Exiting the program.
