In [2]:
import pandas as pd
import numpy as np
import pickle as pk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
import re
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Load the dataset and prepare it to the train the model

# Importing dataset and splitting into words and labels
dataset = pd.read_csv('intents.csv', names=["Query", "Intent"])

queries = dataset["Query"]

intent = list(dataset["Intent"])
unique_intent_list = list(set(intent))

# print(queries)
# print(unique_intent_list)
print("Dataset successfully loaded!")

Dataset successfully loaded!


In [13]:
queryCorpus = []
ps = PorterStemmer()

for query in queries:
    query = re.sub('[^a-zA-Z0-9]', ' ', query)

    # Tokenize sentence
    query = query.split(' ')

    # Lemmatizing
    tokenized_query = [ps.stem(word.lower()) for word in query]

    # Recreate the sentence from tokens
    tokenized_query = ' '.join(tokenized_query)

    # Add to corpus
    queryCorpus.append(tokenized_query)
# print(queryCorpus)
print("Corpus created")

Corpus created


In [4]:
intent_CV= CountVectorizer(max_features=1500)
corpus = intent_CV.fit_transform(queryCorpus).toarray()
print(corpus)
# print(len(corpus))
print("Bag of words created!")

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
Bag of words created!


In [5]:
# Save the CountVectorizer of Intent
pk.dump(intent_CV, open('saved_state/intent_cv', 'wb'))
print('Intent vector saved!')

Intent vector saved!


In [6]:
# Encode the intents
labelencoder_intent = LabelEncoder()
intent = labelencoder_intent.fit_transform(intent)
print("Encoded the classes!")
print(intent)

Encoded the classes!
[ 2  2 10  2 25 25 25 15 20  7  2  9  1  2  2  7 23 23 23 23 23 23 23 23
 23 23 23 23  3  3 19 20 20 10 21 21 21 21 20 13 13 13 17 17 17 12 11 16
  7 15  4 15 15 15 15 12 12 12 12 15  3 16  3  1 12 23 21  8  8  8  8  8
  8  8 22 22 22 22 22 24 24 24 24 24 24 24 24 24 24  5  5  5  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 14 14  6 14  6  6  6  6  6  6  6  6 14 14
 14 18 18 18 18 18 18 18]


In [7]:
# Return a dict mapping labels to their integer values
res = {}
for cl in labelencoder_intent.classes_:
    res.update({cl:labelencoder_intent.transform([cl])[0]})

intent_label_map = res
print(intent_label_map)
print("Label mapping obtained!")

{'AskingHelp': 0, 'BestPractice': 1, 'Cultivation': 2, 'CultivationSeason': 3, 'Diseases': 4, 'EnquireAboutDay': 5, 'Family': 6, 'Fertilizer': 7, 'Greeting': 8, 'HarvestTime': 9, 'Irrigation': 10, 'Location': 11, 'MarketPrice': 12, 'Maturity': 13, 'OutOfScope': 14, 'Pesticide': 15, 'Rainfall': 16, 'ReapingSeason': 17, 'Sarcasm': 18, 'SeedDensity': 19, 'Soil': 20, 'Varieties': 21, 'Wassup': 22, 'Weather': 23, 'Wellness': 24, 'Yield': 25}
Label mapping obtained!


In [8]:
# Splitting the dataset into the Training set and Test set
query_train, query_test, intent_train, intent_test = train_test_split(corpus, intent, test_size = 0.15, random_state = 19)

print("Dataset split into train and test set")
print(query_train)
print(intent_train)

Dataset split into train and test set
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]]
[11  2 21 12  6 22 18 21  8  0  6  2 22  3 23  8 18 18 17 22  3 24 25 23
 16 10 20  6  0 17  0 18  2 24 15  7 23 13 14 12  0  2 23 23 17  6 24 24
 14  1 18  2  5  3  9  0  5 15 23  5 23 12 24  1  0 15  8 15 20  8  3 12
 12 21 15 14 24  0 24 25  8 22  7  8 14 24  0  7 13  6  4 24  2  0 25 21
 23 12 21 20 22 15 23 13  0  6  6  0]


In [9]:
# Fit the classifier to dataset
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(query_train, intent_train)
print("Model trained successfully!")

Model trained successfully!


In [10]:
intent_pred = classifier.predict(query_test)
print(intent_test)
print(intent_pred)

[ 0 18 19 10 14 23  0  6 23 23 24  0 16  6 23 15  8 14 20 18]
[ 0 24 12  7 18 23  0  6 23 23 24  0 12  6 23 15  8 18 20  6]


In [11]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(intent_test, intent_pred)

# Model Performace
accuracy = (cm[1][1]+cm[0][0])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
precision = cm[1][1]/(cm[0][0]+cm[0][1])
recall = cm[1][1]/(cm[1][1]+cm[1][0])

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

Accuracy:  1.0
Precision:  0.6666666666666666
Recall:  1.0
