In [1]:
import os
import pandas as pd
import numpy as np
import gensim
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
def read_data_from_dir(data_dir):
    """Load one text sample per document from a directory of class folders.

    ``data_dir`` must contain one sub-directory per class. Every regular file
    inside a class sub-directory is treated as one document. The sample taken
    from a document is its second line (index 1), kept verbatim including any
    trailing newline; the label is the name of the class sub-directory.

    Entries of ``data_dir`` that are not directories, and entries of a class
    folder that are not regular files, are ignored. Documents with fewer than
    two lines are skipped (and counted in a printed notice) instead of
    raising ``IndexError``.

    Parameters
    ----------
    data_dir : str or os.PathLike
        Directory whose immediate sub-directories are the class folders.

    Returns
    -------
    x_data : list of str
        Second line of every accepted document.
    y_data : list of str
        Class-folder name for the sample at the same index in ``x_data``.

    Raises
    ------
    OSError
        If ``data_dir`` or a document cannot be listed / opened.
    UnicodeDecodeError
        If the part of a document that is read is not valid UTF-8.
    """
    x_data = []
    y_data = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and everything derived from it) reproducible across runs/machines.
    for class_dir in sorted(os.listdir(data_dir)):
        class_dir_pth = os.path.join(data_dir, class_dir)
        if not os.path.isdir(class_dir_pth):
            # Stray files next to the class folders are not classes.
            continue
        print(f"working on {class_dir} ...")
        # List the folder once and keep only regular files.
        documents = sorted(
            name
            for name in os.listdir(class_dir_pth)
            if os.path.isfile(os.path.join(class_dir_pth, name))
        )
        n_files = len(documents)
        n_skipped = 0
        for i, document in enumerate(documents):
            document_pth = os.path.join(class_dir_pth, document)
            with open(document_pth, encoding="UTF-8") as f:
                f.readline()  # first line is not used
                second_line = f.readline()
            if second_line:
                x_data.append(second_line)
                y_data.append(class_dir)
            else:
                # readline() returns "" at EOF: the file has < 2 lines.
                n_skipped += 1
            print(f"{int(100 * i / n_files)} %", end='\r')
        if n_skipped:
            print(f"skipped {n_skipped} document(s) with fewer than two lines in {class_dir}")
    print("done!")
    return x_data, y_data

In [3]:
# Root folder of the dataset subset; it holds a "Train" and a "Test" folder.
# Raw string literal: the Windows path contains backslash sequences such as
# "\T", "\E", "\S" and "\k" that are invalid escapes in a normal string
# (a warning today, slated to become an error). The resulting value is
# unchanged.
# NOTE: this is a machine-specific absolute path; adjust it to your setup.
data_dir = r"E:\Technical\Electro pi\SANAD_ Single-Label Arabic News Articles Dataset for Automatic Text Categorization\SANAD_SUBSET\khaleej"
train_data_dir = os.path.join(data_dir, "Train")
test_data_dir = os.path.join(data_dir, "Test")

# Each call returns (texts, labels) as two parallel lists.
x_train, y_train = read_data_from_dir(train_data_dir)
x_test, y_test = read_data_from_dir(test_data_dir)

working on Culture ...
working on Finance ...
working on Medical ...
working on Politics ...
working on Religion ...
working on Sports ...
working on Tech ...
done!
working on Culture ...
working on Finance ...
working on Medical ...
working on Politics ...
working on Religion ...
working on Sports ...
working on Tech ...
done!


In [4]:
# Tokenise every raw document with gensim's simple_preprocess, then store the
# per-document results in an object-dtype NumPy array (one entry per document).
x_train_cleaned = np.array(
    [gensim.utils.simple_preprocess(doc) for doc in x_train],
    dtype=object,
)
x_test_cleaned = np.array(
    [gensim.utils.simple_preprocess(doc) for doc in x_test],
    dtype=object,
)

In [5]:
# Word2Vec hyper-parameters set explicitly here; everything else is left at
# the library defaults.
w2v_params = dict(window=10, min_count=2, workers=4)
model = gensim.models.Word2Vec(**w2v_params)

# Step 1: build the vocabulary from the tokenised training documents only.
model.build_vocab(x_train_cleaned, progress_per=1000)

# Step 2: train on the same corpus, using the example count recorded by
# build_vocab and the model's own epoch setting.
model.train(
    x_train_cleaned,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Step 3: persist the trained model (path is relative to the working directory).
model.save("word2vec-SANAD-khaleej-arabic-language.model")

In [6]:
def text2vect(text, wv=None):
    """Embed a tokenised document as the sum of its word vectors.

    Tokens that are missing from the vocabulary contribute nothing. Note that
    the vectors are summed, not averaged, so the magnitude of the result grows
    with the number of in-vocabulary tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.
    wv : optional
        Word-vector lookup supporting ``word in wv``, ``wv[word]`` and a
        ``vector_size`` attribute. Defaults to ``model.wv`` of the
        notebook-level Word2Vec ``model``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``wv.vector_size``; all zeros when
        ``text`` is empty or contains no in-vocabulary token.
    """
    if wv is None:
        wv = model.wv
    # Take the dimensionality from the vectors themselves rather than
    # hard-coding it, so a model trained with another size keeps working.
    text_vect = np.zeros(wv.vector_size)
    for word in text:
        if word in wv:
            text_vect += wv[word]
    return text_vect

In [7]:
# Turn every tokenised document into one vector via text2vect and collect the
# vectors of each split into a NumPy array (one row per document).
x_train_cleaned_vec = np.array([text2vect(tokens) for tokens in x_train_cleaned])
x_test_cleaned_vec = np.array([text2vect(tokens) for tokens in x_test_cleaned])

In [8]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
encoder = LabelEncoder().fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [9]:
# Support-vector classifier with default settings, fitted on the training
# document vectors (fit returns the estimator itself).
svc = SVC().fit(x_train_cleaned_vec, y_train)

# Predict on both splits so training and test performance can be compared.
y_train_pred, y_test_pred = [
    svc.predict(features)
    for features in (x_train_cleaned_vec, x_test_cleaned_vec)
]

In [10]:
# Accuracy of the classifier's predictions on the training and test splits.
training_accuracy = accuracy_score(y_train, y_train_pred)
testing_accuracy = accuracy_score(y_test, y_test_pred)
# Backward-compatible alias for the original, misspelled variable name in case
# other cells still refer to it.
tseting_accuracy = testing_accuracy

print(f"training_accuracy: {training_accuracy}")
print(f"testing_accuracy: {testing_accuracy}")

training_accuracy: 0.9658608058608059
tseting_accuracy: 0.9586813186813187
