In [10]:
from string import punctuation
from nltk.corpus import stopwords
import nltk
from os import listdir 
from pickle import dump

# Make sure the NLTK stop-word corpus read by document_tokens is available locally.
nltk.download('stopwords')

# load the data
def load_document(file_name):
    """Read a text file and return its whole content as a single string.

    Args:
        file_name: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The full content of the file as a ``str``.
    """
    # The context manager closes the handle even when read() raises; the
    # handle used to be left open, leaking one descriptor per document.
    with open(file_name, 'r') as file:
        return file.read()

# preprocess the data & create tokens

def document_tokens(document):
    """Clean a raw document and return it as one space-separated string.

    Steps, in order: split on whitespace, strip punctuation characters from
    every token, keep only purely alphabetic tokens, drop English stop words
    (compared case-insensitively) and drop single-character tokens.

    Args:
        document: Raw document text.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in document.split()]

    # keep only tokens made up entirely of letters
    tokens = [w for w in tokens if w.isalpha()]

    # remove stop words from the tokens
    # The comparison is done on w.lower() so capitalised stop words such as
    # "The" or "I" are removed too (the NLTK English list is expected to be
    # lowercase), and the list is turned into a set so each lookup is O(1)
    # instead of a scan of the whole list per token.
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w.lower() not in stop_words]

    # remove words that have length less than 2
    tokens = [w for w in tokens if len(w) > 1]

    return " ".join(tokens)



def handel_documents(directory, is_train):
    """Load and clean one split of the documents stored in ``directory``.

    Files whose name starts with ``'cv9'`` form the held-out split: they are
    the only files returned when ``is_train`` is false and the only files
    skipped when ``is_train`` is true.

    Args:
        directory: Folder containing the document files.
        is_train: Truthy to collect the training files, falsy to collect the
            held-out files.

    Returns:
        A list with one cleaned, space-separated token string per selected
        file, in the order ``listdir`` yields the file names.
    """
    cleaned_docs = []

    for entry in listdir(directory):
        held_out = entry.startswith('cv9')
        # training wants everything except the held-out files,
        # testing wants only the held-out files
        wanted = (not held_out) if is_train else held_out
        if wanted:
            raw_text = load_document(directory + '/' + entry)
            cleaned_docs.append(document_tokens(raw_text))

    return cleaned_docs

# save tokens to a file
def save_dataset(dataset, filename):
    """Pickle ``dataset`` into ``filename`` and print a confirmation line.

    Args:
        dataset: Any picklable object (the notebook passes ``[x, y]`` lists).
        filename: Path of the file to create or overwrite.
    """
    # Closing the file through the context manager guarantees the pickle is
    # flushed to disk before the "Saved" message is printed; the handle used
    # to be left open.
    with open(filename, 'wb') as out_file:
        dump(dataset, out_file)
    print('Saved: %s' % filename)





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mouay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Training split: every file of each class except the held-out 'cv9*' files.
neg_docs_train, pos_docs_train = [
    handel_documents(folder, is_train=True) for folder in ('neg', 'pos')
]

# Test split: only the held-out 'cv9*' files of each class.
neg_docs_test, pos_docs_test = [
    handel_documents(folder, is_train=False) for folder in ('neg', 'pos')
]




In [14]:
# Label convention: 0 = document from the 'neg' folder, 1 = from the 'pos' folder.
# The label lists are sized from the loaded documents instead of hard-coded
# counts (900 / 100), so x and y stay aligned even when a folder does not
# contain exactly the expected number of files.
trainx = neg_docs_train + pos_docs_train
trainy = [0] * len(neg_docs_train) + [1] * len(pos_docs_train)

save_dataset([trainx, trainy], 'train.pkl')

testx = neg_docs_test + pos_docs_test
testy = [0] * len(neg_docs_test) + [1] * len(pos_docs_test)
save_dataset([testx, testy], 'test.pkl')

len(trainx)

Saved: train.pkl
Saved: test.pkl


1800

In [16]:
from pickle import load
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def load_dataset(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object (the notebook stores ``[x, y]`` lists).
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook wrote itself, never files from an untrusted source.
    # The context manager closes the handle, which used to be left open.
    with open(filename, 'rb') as in_file:
        return load(in_file)


# This function builds the table that gives every word its own number.
def build_tokenizer(words):
    """Create a new ``Tokenizer``, fit it on ``words`` and return it.

    Args:
        words: The texts to fit on (the notebook passes the list of cleaned
            training documents).

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(words)
    return fitted


def max_length(docs):
    """Return the number of whitespace-separated words in the longest document.

    Args:
        docs: Iterable of document strings.

    Returns:
        The largest word count found in ``docs``.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    word_counts = map(lambda text: len(text.split()), docs)
    return max(word_counts)

# This function takes the word table built by the tokenizer and turns the texts into numbers.
def enccodeding(tokenizer, words, length):
    """Encode texts as integer sequences of one common length.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        words: The texts to encode.
        length: Common sequence length passed to ``pad_sequences`` as
            ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    # bring every sequence to the same length
    return pad_sequences(
        tokenizer.texts_to_sequences(words),
        maxlen=length,
        padding='post',
    )

# Sanity check: show how many training documents are currently held in trainx.
len(trainx)

1800

In [17]:
# Reload the training split from disk; this rebinds trainx / trainy.
trainx, trainy = load_dataset('train.pkl')

# Debug statements to check the size of the datasets
# NOTE: testx is not loaded in this cell; it is the variable left in memory by
# the cell that built and saved the test split.
print(f'Number of training samples: {len(trainx)}')
print(f'Number of testing samples: {len(testx)}')

# Fit the word -> integer mapping on the training documents only.
t = build_tokenizer(trainx)

# word count of the longest training document; used as the padded sequence length
length = max_length(trainx)

# vocabulary size handed to the Embedding layers
# (+1 presumably because Keras word indices start at 1, leaving 0 for padding — confirm)
vocab_size = len(t.word_index) + 1

print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)

# Integer-encode and pad the training documents.
trai_datax = enccodeding(t, trainx, length)

print(trai_datax.shape)
len(trainx)

Number of training samples: 1800
Number of testing samples: 200
Max document length: 1380
Vocabulary size: 44277
(1800, 1380)


1800

In [18]:
from keras.models import Model
from keras.layers import  Input , Dense , Flatten , Dropout , Embedding
from keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D
from keras.layers import concatenate


def _build_channel(length, vocab_size, kernel_size):
    """Build one input branch: Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten.

    Args:
        length: Number of integer tokens in each input sequence.
        vocab_size: Input dimension of the embedding layer.
        kernel_size: Width of the 1-D convolution window for this branch.

    Returns:
        A ``(input_tensor, flattened_output_tensor)`` pair.
    """
    inputs = Input(shape=(length,))
    # turn the integer-encoded words into 100-dimensional vectors
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    return inputs, Flatten()(pool)


def build_model(length, vocab_size):
    """Build the (uncompiled) three-channel convolutional classifier.

    The three channels share the same layout and differ only in the
    convolution kernel size (4, 6 and 8). Their flattened outputs are
    concatenated and passed through a 10-unit ReLU layer and a single
    sigmoid output unit.

    Args:
        length: Number of integer tokens in each input sequence.
        vocab_size: Input dimension of every embedding layer.

    Returns:
        A ``Model`` with three inputs and one output. The caller is
        responsible for compiling it.
    """
    # The channels are created in kernel-size order, exactly as when the
    # three branches were written out by hand, so the model inputs keep the
    # same order.
    channels = [_build_channel(length, vocab_size, k) for k in (4, 6, 8)]
    channel_inputs = [channel_input for channel_input, _ in channels]

    # merge
    merged = concatenate([flat for _, flat in channels])

    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)

    return Model(inputs=channel_inputs, outputs=outputs)

In [None]:
from keras.utils import plot_model

# Build the three-channel network and configure it for binary classification.
model = build_model(length, vocab_size)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Each of the three input channels receives the same padded training matrix.
model.fit(
    [trai_datax] * 3,
    array(trainy),
    batch_size=16,
    epochs=10,
)

# Persist the trained model so that later cells can reload it.
model.save('mymodel.h5')


Epoch 1/10




[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 69ms/step - accuracy: 0.5275 - loss: 0.6882
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 69ms/step - accuracy: 0.6562 - loss: 0.6119
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - accuracy: 0.9202 - loss: 0.3905
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - accuracy: 0.9830 - loss: 0.3026
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - accuracy: 0.9918 - loss: 0.2655
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 72ms/step - accuracy: 0.9948 - loss: 0.2410
Epoch 7/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 125ms/step - accuracy: 0.9918 - loss: 0.2334
Epoch 8/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 184ms/step - accuracy: 0.9954 - loss: 0.2062
Epoch 9/10
[1m113/113[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x1b54a186690>

In [None]:
# Write a diagram of the architecture to my_model.jpg.
plot_model(model, "my_model.jpg")

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()


None


In [20]:
from keras.models import load_model

# Reload the saved model and the pickled test split.
my_model = load_model('mymodel.h5')
testx, testy = load_dataset('test.pkl')

# Encode the test documents with the tokenizer and length fitted on the training data.
test_datax = enccodeding(t, testx, length)

# Performance on the held-out documents (same matrix fed to all three channels).
loss_test, acc_test = my_model.evaluate(
    [test_datax] * 3, array(testy), verbose=0
)
print('Test Accuracy: %f' % (acc_test*100))
print('Test Loss: %f' % loss_test)

# Performance on the training documents, for comparison.
loss_train, acc_train = my_model.evaluate(
    [trai_datax] * 3, array(trainy), verbose=0
)
print('Train Accuracy: %f' % (acc_train*100))
print('Train Loss: %f' % loss_train)



Test Accuracy: 86.000001
Test Loss: 0.378254
Train Accuracy: 100.000000
Train Loss: 0.000831


In [None]:


# Load the trained model (the fitted tokenizer `t` is not reloaded here; the cells below reuse the one already in memory)
my_model = load_model('mymodel.h5')

# Clean a new sentence the same way the original data was cleaned
def preprocess_text(text, tokenizer, length):
    """Clean and encode one new text so it can be fed to the trained model.

    The text goes through ``document_tokens`` (the cleaning routine used to
    build the training set) and ``enccodeding`` (the routine used to encode
    the training set). Inference therefore sees exactly the preprocessing the
    model was trained on. The earlier hand-copied version skipped the
    "drop single-character tokens" step and so could diverge from training.

    Args:
        text: Raw input text.
        tokenizer: The tokenizer fitted on the training documents.
        length: Padded sequence length used during training.

    Returns:
        The padded integer encoding of the single cleaned text, as returned
        by ``enccodeding`` for a one-element list.
    """
    cleaned = document_tokens(text)

    # convert the cleaned text to numbers with the tokenizer and pad it
    return enccodeding(tokenizer, [cleaned], length)

# Example of a new sentence
new_text = " . I would not recommend "

# Clean and encode the new sentence
test_data = preprocess_text(new_text, t, length)

# Pass it to the model (same encoding on all three input channels) to get the prediction
prediction = my_model.predict([test_data] * 3)

# Print the result: a score above 0.5 is reported as a good sentence
print("Prediction Score:", prediction[0][0])
print("good sentence✅" if prediction[0][0] > 0.5 else "Bad sentence ❌")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step
Prediction Score: 0.49271375
Bad sentence ❌
