In [1]:
import nltk
import json
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('punkt')
nltk.download('wordnet')

stemmer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Vishnu
[nltk_data]     Priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vishnu
[nltk_data]     Priya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
with open("intents_big.json", "r") as f:
    intents = json.load(f)

In [13]:
words = []
classes = []
documents = []
ignore_words = ['?']

for intent in intents['intents']:
    if 'pattern' in intent:  # Check if the key exists
        for pattern in intent['pattern']:
            # Tokenize each word in the sentence
            w = nltk.word_tokenize(pattern)
            # Add to our words list
            words.extend(w)
            # Add to documents in our corpus
            documents.append((w, intent['tag']))
            # Add to our classes list
            if intent['tag'] not in classes:
                classes.append(intent['tag'])
    else:
        print(f"Warning: 'patterns' key missing in intent: {intent}")
        
words = [stemmer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique lemmatized words", words)

14479 documents
90 classes ['Acidic Soil', 'Agriculture', 'Alkaline Soil', 'Aphid Infestation', 'Arid Climate', 'Beans', 'Beneficial Insects', 'Cabbage', 'Carrots', 'Chemical_Pest_Control', 'Clay Soil', 'Cold Climate', 'Companion Planting', 'Cover Crops', 'Crop Rotation', 'Crop_Rotation', 'Cucumbers', 'Diatomaceous Earth', 'Disease_Management', 'Fungal Disease Prevention', 'Garlic', 'Garlic and Chili Spray', 'Grains', 'Handpicking', 'Harvesting_Practices', 'Herbs', 'Identifying_Pests', 'Insecticidal Soap', 'Integrated_Pest_Management', 'Irrigation Management', 'Irrigation_Strategies', 'Legumes', 'Lettuce', 'Loamy Soil', 'Marigolds', 'Melons', 'Mulching', 'Neem Oil', 'Nutrient_Management', 'Organic Matter Addition', 'Organic_Pest_Control', 'Peppers', 'Pest_Control', 'Plant_Diseases', 'Planting_Techniques', 'Powdery Mildew', 'Preventative_Measures', 'Regular Testing', 'Root Crops', 'Row Covers', 'Sandy Soil', 'Soil Amendments', 'Specific_Pest_Issues', 'Spider Mites Control', 'Temperate C

In [32]:
import pickle
with open('words.pkl', 'wb') as f:
    pickle.dump(words, f)
with open('classes.pkl', 'wb') as f:
    pickle.dump(classes, f)

In [14]:
training = []
output = []
output_empty = [0] * len(classes)

# Training set, bag of words for each sentence
for doc in documents:
    # Initialize our bag of words
    bag = []
    # List of tokenized words for the pattern
    pattern_words = doc[0]
    # Lemmatize each word
    pattern_words = [stemmer.lemmatize(word.lower()) for word in pattern_words]
    # Create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    # Output is a '0' for each tag and '1' for the current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

random.shuffle(training)

In [15]:
def synonym_replacement(tokens, limit):
    augmented_sentences = []
    for i in range(len(tokens)):
        synonyms = []
        for syn in wordnet.synsets(tokens[i]):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if len(synonyms) > 0:
            num_augmentations = min(limit, len(synonyms))
            sampled_synonyms = random.sample(synonyms, num_augmentations)
            for synonym in sampled_synonyms:
                augmented_tokens = tokens[:i] + [synonym] + tokens[i + 1:]
                augmented_sentences.append(' '.join(augmented_tokens))
    return augmented_sentences

In [16]:
# Augment the training data using synonym replacement
augmented_data = []
limit_per_tag = 100

for i, doc in enumerate(training):
    bag, output_row = doc
    tokens = [words[j] for j in range(len(words)) if bag[j] == 1]
    augmented_sentences = synonym_replacement(tokens, limit_per_tag)
    for augmented_sentence in augmented_sentences:
        augmented_bag = [1 if augmented_sentence.find(word) >= 0 else 0 for word in words]
        augmented_data.append([augmented_bag, output_row])


combined_data = np.concatenate((training, augmented_data), axis=0)
random.shuffle(combined_data)

from sklearn.model_selection import train_test_split

In [17]:
def separate_data_by_tags(data):
    data_by_tags = {}
    for d in data:
        tag = tuple(d[1])
        if tag not in data_by_tags:
            data_by_tags[tag] = []
        data_by_tags[tag].append(d)
    return data_by_tags.values()


separated_data = separate_data_by_tags(combined_data)

In [18]:
from sklearn.model_selection import train_test_split
# Lists to store training and testing data
training_data = []
testing_data = []

# Split each tag's data into training and testing sets
combined_data = []
for tag_data in separated_data:
    combined_data.extend(tag_data)

if len(combined_data) > 1:
    train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)
    training_data.extend(train_data)
    testing_data.extend(test_data)
else:
    print("Insufficient data for splitting.")


random.shuffle(training_data)
random.shuffle(testing_data)

# Convert training and testing data back to np.array
train_x = np.array([d[0] for d in training_data])
train_y = np.array([d[1] for d in training_data])
test_x = np.array([d[0] for d in testing_data])
test_y = np.array([d[1] for d in testing_data])

In [19]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.bn1(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.bn2(x)
        x = self.dropout2(x)

        x = self.fc3(x)
        output = self.softmax(x)
        return output

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return self.softmax(x)

In [20]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [21]:
def accuracy(predictions, targets):
    predicted_labels = torch.argmax(predictions, dim=1)
    true_labels = torch.argmax(targets, dim=1)
    correct = (predicted_labels == true_labels).sum().item()
    total = targets.size(0)
    return correct / total

In [22]:
def test_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = len(test_loader)

    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * inputs.size(0)
            total_accuracy += accuracy(outputs, targets) * inputs.size(0)

    average_loss = total_loss / len(test_loader.dataset)
    average_accuracy = total_accuracy / len(test_loader.dataset)
    return average_loss, average_accuracy

In [23]:
# Create DataLoader for training and testing data
train_x = torch.tensor(train_x).float()
train_y = torch.tensor(train_y).float()
test_x = torch.tensor(test_x).float()
test_y = torch.tensor(test_y).float()

batch_size = 64
train_dataset = CustomDataset(train_x, train_y)
test_dataset = CustomDataset(test_x, test_y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the model, loss function, and optimizer
input_size = len(train_x[0])
hidden_size = 8
output_size = len(train_y[0])
model = NeuralNetwork(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

In [24]:
# Train the model and evaluate on the testing set
num_epochs = 250
for epoch in range(num_epochs):
    # Training
    model.train()
    running_loss = 0.0
    running_acc = 0.0
    for inputs, targets in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update statistics
        running_loss += loss.item() * inputs.size(0)
        running_acc += accuracy(outputs, targets) * inputs.size(0)

    # Calculate average training loss and accuracy for the epoch
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_acc / len(train_loader.dataset)

    # Print training loss and accuracy for each epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_acc:.4f}")

    # Evaluate on the testing set
    test_loss, test_accuracy = test_model(model, test_loader, criterion)
    print(f"Epoch [{epoch+1}/{num_epochs}], Testing Loss: {test_loss:.4f}, Testing Accuracy: {test_accuracy:.4f}")

Epoch [1/250], Training Loss: 0.0201, Training Accuracy: 0.7861
Epoch [1/250], Testing Loss: 0.0163, Testing Accuracy: 0.8165
Epoch [2/250], Training Loss: 0.0151, Training Accuracy: 0.8367
Epoch [2/250], Testing Loss: 0.0144, Testing Accuracy: 0.8535
Epoch [3/250], Training Loss: 0.0136, Training Accuracy: 0.8677
Epoch [3/250], Testing Loss: 0.0135, Testing Accuracy: 0.8683
Epoch [4/250], Training Loss: 0.0131, Training Accuracy: 0.8753
Epoch [4/250], Testing Loss: 0.0132, Testing Accuracy: 0.8746
Epoch [5/250], Training Loss: 0.0129, Training Accuracy: 0.8786
Epoch [5/250], Testing Loss: 0.0131, Testing Accuracy: 0.8767
Epoch [6/250], Training Loss: 0.0128, Training Accuracy: 0.8801
Epoch [6/250], Testing Loss: 0.0130, Testing Accuracy: 0.8777
Epoch [7/250], Training Loss: 0.0127, Training Accuracy: 0.8811
Epoch [7/250], Testing Loss: 0.0129, Testing Accuracy: 0.8790
Epoch [8/250], Training Loss: 0.0126, Training Accuracy: 0.8817
Epoch [8/250], Testing Loss: 0.0129, Testing Accuracy:

In [25]:
# Save the trained model
torch.save(model.state_dict(), 'data.pth')