In [1]:
import numpy as np
import pandas as pd
from rule_based import total_score

In [2]:
mapping = {0: "weather", 1: "religious time", 2: "time", 3: "date", 4: "unknown"}

In [3]:
df = pd.read_csv("mh_clean.csv", index_col=0)
df

Unnamed: 0,sentence,class
0,آیا فردا هوا ابری است,0
1,هوا فردا آلودست,0
2,شاخص آلودگی چنده,0
3,یعنی فردا بارون میباره,0
4,پس کی برف میباره,0
...,...,...
904,تقویم چه کشورایی با خورشید تنظیم نشده,4
905,ماه گرد نامزدی هم جشن میگیرن,4
906,چندم میرسی مشتی,4
907,دوست دارم یه بار روز جهانی بدون سوتین رو جشن ب...,4


In [4]:
from sklearn.model_selection import train_test_split

x = df["sentence"].values
y = df["class"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state = 42)

In [5]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

In [6]:
maxlen = max([len(s.split()) for s in x])

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

enc_docs = tokenizer.texts_to_sequences(x_train)
Xtrain = pad_sequences(enc_docs, maxlen=maxlen, padding='post')

In [8]:
enc_docs = tokenizer.texts_to_sequences(x_test)
Xtest = pad_sequences(enc_docs, maxlen=maxlen, padding='post')

In [9]:
vocab_size = len(tokenizer.word_index) + 1

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=16, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
# model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(5, activation='sigmoid'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 28, 100)           108500    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 13, 32)            51232     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 6, 32)             0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 192)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                1930      
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 55        
Total params: 161,717
Trainable params: 161,717
Non-trainable params: 0
________________________________________________

In [46]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(Xtrain, y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fd9880d3100>

In [47]:
model.evaluate(Xtest, y_test)



[0.6583083271980286, 0.8201754093170166]

In [102]:
def predict(sent: str) -> int:
    enc_docs = tokenizer.texts_to_sequences(np.array([sent]))
    s = pad_sequences(enc_docs, maxlen=maxlen, padding='post')
    pred = model.predict(s)
    ind = np.argpartition(pred, -2)[-2:].flatten().tolist()
    ind.reverse()

    print(f"The first predicted class: {mapping[ind[0]]}")
    print(f"The second predicted class: {mapping[ind[1]]}")
    
    sc = total_score(sent)
    sc[4] = 0
    print(sc)
    
    if sc[ind[0]] >= 2 and sc[ind[1]] <= 2:
        return mapping[ind[0]]
    elif abs(sc[ind[0]] - sc[ind[1]]) <= 2:
        return mapping[ind[0]]  # CHECK THIS PLEASE
    elif ind[0] == 4 and max(list(sc.values())) <= 2 and sc[ind[1]] != 2:
        return 'unknown'
    # HANDLE NN BEING RIGHT AND RULE-BASED BEING WRONG
    else:
        return mapping[ind[0]]

In [103]:
sent = "برف سالگرد رحلت امام بیشتر بود یا دیروز؟"

In [104]:
predict(sent)

The first predicted class: weather
The second predicted class: date
{0: 2, 1: 0, 2: 0, 3: 8, 4: 0}


'weather'

In [110]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
vec.fit(x_train)
x_train = vec.transform(x_train)
x_test = vec.transform(x_test)

In [111]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Dense(64, input_dim=x_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(5, activation='sigmoid'))
model.compile(loss="sparse_categorical_crossentropy",
              optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=16, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7ff58a2574f0>

In [112]:
score = model.evaluate(x_test, y_test)



In [120]:
from sklearn.feature_extraction.text import CountVectorizer

sent = np.array(["امروز چه مناسبی داریم؟"])
vec = CountVectorizer()
vec.fit(sent)
sent = vec.transform(sent)

In [121]:
sent

<1x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [123]:
model.predict(x_test)

array([[8.32831483e-06, 3.53544950e-04, 5.14000654e-04, 7.32241631e-07,
        4.29838896e-04],
       [1.60789490e-03, 9.08839356e-05, 1.76593661e-03, 2.26557255e-04,
        5.03525138e-03],
       [2.46572185e-08, 2.99583077e-02, 3.48827743e-05, 5.59064347e-07,
        9.87519888e-06],
       ...,
       [5.14230815e-05, 2.21431255e-04, 8.18222761e-04, 1.74521804e-02,
        1.21080875e-03],
       [6.27995729e-02, 1.21861277e-09, 1.24441357e-07, 1.80661210e-08,
        7.68125057e-04],
       [1.09369694e-07, 6.91767871e-01, 2.21032533e-05, 2.97554067e-07,
        1.83314085e-04]], dtype=float32)

In [81]:
from sklearn.model_selection import train_test_split

x = df["sentence"].values
y = df["class"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state = 42)

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
vec.fit(x_train)
x_train = vec.transform(x_train)
x_test = vec.transform(x_test)

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(x_train, y_train)
accuracy_score(y_test, lr.predict(x_test))

0.85