In [207]:
import pandas as pd

In [208]:
data = pd.read_csv("classification.csv")

In [209]:
data.head(10)

Unnamed: 0,id,question,answer,intent
0,1,order kasari garne?,kunei product lai add to cart garera checkout ...,order
1,2,product order process kasto cha?,product page maa gayera 'Add to Cart' button m...,order
2,3,online order kasari thapne?,account maa login garera chahine product selec...,order
3,4,product kasari kinne?,"product select garera add to cart maa rakhne, ...",order
4,5,kasari order garna milcha?,tapai le product select garera cart maa add ga...,order
5,6,online shopping kasari garne?,"online shopping garna, pahila product select g...",order
6,7,checkout kasari garne?,"checkout garna, add to cart gareko product che...",order
7,8,add to cart garepachi kei garne?,"add to cart garepachi, checkout page maa gayer...",order
8,9,cart maa rakhne bhandaa pachi ke garne?,"cart maa rakhne bhandaa pachi, checkout button...",order
9,10,order kasari thapne?,pahila product select garera add to cart maa r...,order


In [210]:
data.tail(10)

Unnamed: 0,id,question,answer,intent
1490,491,Yo product ajhai stock ma chha ki chhaina?,Yo product ajhai stock ma chha.,inquiry
1491,492,Yo item ajhai available chha ki chhaina?,Yo item ajhai available chha.,inquiry
1492,493,Kripaya yo saman ko stock bare janakari dinuhos.,Yo saman ajhai stock ma chha.,inquiry
1493,494,Yo product ajhai pani available chha ki chhaina?,Yo product ajhai pani available chha.,inquiry
1494,495,Yo item ko stock ajhai chha ki chhaina?,Yo item ko stock ajhai chha.,inquiry
1495,496,Yo saman ajhai available chha ki chhaina?,Yo saman ajhai available chha.,inquiry
1496,497,Yo product ajhai pani stock ma chha ki chhaina?,Yo product ajhai pani stock ma chha.,inquiry
1497,498,Yo item ajhai available chha ki chhaina?,Yo item ajhai available chha.,inquiry
1498,499,Kripaya yo saman ko stock bare janakari dinuhos.,Yo saman ajhai stock ma chha.,inquiry
1499,500,Yo product ajhai available chha ki chhaina?,Yo product ajhai available chha.,inquiry


In [211]:
labels = data["intent"]
labels.value_counts()

intent
order        500
greetings    500
inquiry      500
Name: count, dtype: int64

In [212]:
X = data["question"]
X[:5]

0                 order kasari garne?
1    product order process kasto cha?
2         online order kasari thapne?
3               product kasari kinne?
4          kasari order garna milcha?
Name: question, dtype: object

## Text Classification

In [213]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [215]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1200,), (300,), (1200,), (300,))

In [216]:
tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [217]:
X_train_tfidf.shape

(1200, 186)

In [218]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(X_train_tfidf, y_train)

In [219]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test_tfidf)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 1.0


In [220]:
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n   greetings       1.00      1.00      1.00        86\n     inquiry       1.00      1.00      1.00       110\n       order       1.00      1.00      1.00       104\n\n    accuracy                           1.00       300\n   macro avg       1.00      1.00      1.00       300\nweighted avg       1.00      1.00      1.00       300\n'

In [221]:
clf.predict(tfidf.transform(["Hello there"]))

array(['greetings'], dtype='<U9')

In [222]:
clf.predict(tfidf.transform(["Is the store open?"]))

array(['inquiry'], dtype='<U9')

In [223]:
clf.predict(tfidf.transform(["how can i buy from your store?"]))

array(['inquiry'], dtype='<U9')

In [224]:
clf.predict(tfidf.transform(["how to order?"]))

array(['order'], dtype='<U9')

In [225]:
clf.predict(tfidf.transform(["good morning"]))

array(['greetings'], dtype='<U9')

## MLP

In [226]:
data.head()

Unnamed: 0,id,question,answer,intent
0,1,order kasari garne?,kunei product lai add to cart garera checkout ...,order
1,2,product order process kasto cha?,product page maa gayera 'Add to Cart' button m...,order
2,3,online order kasari thapne?,account maa login garera chahine product selec...,order
3,4,product kasari kinne?,"product select garera add to cart maa rakhne, ...",order
4,5,kasari order garna milcha?,tapai le product select garera cart maa add ga...,order


In [227]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [228]:
label = data["intent"]

In [229]:
label.value_counts()

intent
order        500
greetings    500
inquiry      500
Name: count, dtype: int64

In [230]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(label)

In [231]:
y

array([2, 2, 2, ..., 1, 1, 1])

In [232]:
from tensorflow.keras.utils import to_categorical

y = to_categorical(y, num_classes=3)

In [233]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

In [234]:
max_words = 1000
max_len = 30
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [235]:
X_train[0]

[14, 2, 16, 6, 35, 73, 18, 139, 34, 5]

In [236]:
tokenizer.word_index.items()

dict_items([('chha', 1), ('yo', 2), ('order', 3), ('kasari', 4), ('ki', 5), ('ko', 6), ('stock', 7), ('ajhai', 8), ('chhaina', 9), ('garne', 10), ('ma', 11), ('product', 12), ('kasto', 13), ('kripaya', 14), ('tapai', 15), ('saman', 16), ('item', 17), ('garna', 18), ('pani', 19), ('janakari', 20), ('good', 21), ('thikai', 22), ('sab', 23), ('available', 24), ('malai', 25), ('dinuhos', 26), ('lai', 27), ('hello', 28), ('confirm', 29), ('bhanidinuhos', 30), ('hi', 31), ('k', 32), ('cha', 33), ('huncha', 34), ('availability', 35), ('lagi', 36), ('namaste', 37), ('nai', 38), ('update', 39), ('bhane', 40), ('din', 41), ('thap', 42), ('process', 43), ('confirmation', 44), ('bhanera', 45), ('chahincha', 46), ('garnu', 47), ('dinus', 48), ('bhayo', 49), ('online', 50), ('ke', 51), ('morning', 52), ('na', 53), ('evening', 54), ('barema', 55), ('sanchai', 56), ('checkout', 57), ('afternoon', 58), ('parchha', 59), ('hunchha', 60), ('day', 61), ('hunuhunchha', 62), ('aauxa', 63), ('maile', 64), ('a

In [237]:
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [238]:
X_train[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,  14,   2,  16,   6,  35,  73,
         18, 139,  34,   5],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         28,  15,  13,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,  14,  20,  26,   2,  12,   8,
         24,   1,   5,   9],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  21,  54,
         15,  27,  13,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   2,  16,   8,  19,   7,  11,
         24,   1,   5,   9],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,

In [239]:
mlp = Sequential([
    Dense(64, activation="relu", input_dim=X_train.shape[1]),
    Dropout(0.5),
    Dense(32, activation="relu"),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation="softmax")
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [240]:
import tensorflow

In [241]:
mlp.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])


In [242]:
history = mlp.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3418 - loss: 11.6314 - val_accuracy: 0.3778 - val_loss: 1.3820
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4071 - loss: 3.9548 - val_accuracy: 0.5333 - val_loss: 1.0804
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3636 - loss: 2.9007 - val_accuracy: 0.7259 - val_loss: 0.8446
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4686 - loss: 1.9563 - val_accuracy: 0.7556 - val_loss: 0.8441
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4908 - loss: 1.5287 - val_accuracy: 0.7704 - val_loss: 0.8407
Epoch 6/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5530 - loss: 1.2403 - val_accuracy: 0.7778 - val_loss: 0.8256
Epoch 7/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━

In [243]:
loss, accuracy = mlp.evaluate(X_test, y_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6964 - loss: 0.7989 


In [245]:
y_preds = mlp.predict(X_test[:5])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


In [251]:
pred_labels = y_preds.argmax(axis=1)

In [252]:
pred_labels

array([0, 1, 1, 0, 2], dtype=int64)

In [262]:
query = ""

transformed_question = tokenizer.texts_to_sequences([query])

padded_question = pad_sequences(transformed_question, maxlen=max_len)

intent = mlp.predict(padded_question).argmax(axis=1)

label_encoder.inverse_transform(intent)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


array(['greetings'], dtype=object)

## another: support vector machine

In [263]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [264]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data["question"])

In [272]:
X

<1500x193 sparse matrix of type '<class 'numpy.float64'>'
	with 9056 stored elements in Compressed Sparse Row format>

In [265]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data["intent"])

In [268]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

In [269]:
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train, y_train)

In [270]:
y_pred = svm_model.predict(X_test)

In [271]:
print("Classification report: ", classification_report(y_test, y_pred))

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00        51
           2       1.00      1.00      1.00        47

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

