In [11]:
import pandas as pd
import os,sys
parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(os.path.abspath(parent_path))
from constants import HAWAJEZ,STATUS_WORDS,STATUS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.feature_selection import SelectKBest, chi2
from utils.clean_data import remove_arabic_stop_words,remove_any_numbers
import nltk
from nltk.stem import ISRIStemmer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import joblib

# Download the NLTK 'punkt' tokenizer data; nltk.word_tokenize is called
# later in this notebook. The recorded output shows the download is skipped
# when the package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nasser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Load the labelled dataset into a DataFrame (path is relative to the
# notebook's working directory)
labeled_csv_path = "../data/labeld_data_huge.csv"
df = pd.read_csv(labeled_csv_path)

In [13]:

# Basic text normalisation, applied to the text column one step at a time:
#   1. project stop-word helper (remove_arabic_stop_words)
#   2. replace every "|||" separator with a space
#   3. collapse runs of whitespace into single spaces (also trims the ends)
basic_cleanup_steps = (
    remove_arabic_stop_words,
    lambda text: text.replace("|||", " "),
    lambda text: " ".join(text.split()),
)
for basic_step in basic_cleanup_steps:
    df['full_text'] = df['full_text'].apply(basic_step)

# Remove duplicate words in text
def remove_duplicate_in_text(text):
    """Return ``text`` with repeated whitespace-separated words removed.

    The first occurrence of each word is kept and the original word order is
    preserved, so the result is deterministic from run to run (a plain
    ``set`` would yield an arbitrary, hash-dependent order).

    Args:
        text: The string to de-duplicate; it is split on any whitespace.

    Returns:
        The unique words joined by single spaces ("" for empty input).
    """
    # dict keys keep insertion order, giving an order-preserving "set".
    return " ".join(dict.fromkeys(text.split()))

# Drop repeated words first, then run the project's number-removal helper
# (remove_any_numbers) over every text
for cleanup_step in (remove_duplicate_in_text, remove_any_numbers):
    df['full_text'] = df['full_text'].apply(cleanup_step)


In [14]:

# Shared ISRI stemmer: built once and reused for every row instead of being
# re-created on each call. suf32 is used as a pure word -> word function
# (per the NLTK ISRIStemmer documentation), so sharing one instance is safe.
_ISRI_STEMMER = ISRIStemmer()


def lemmatize_arabic(text):
    """Strip common suffixes from every token of an Arabic text.

    Despite the name, this is light stemming rather than dictionary
    lemmatization: the text is tokenized with ``nltk.word_tokenize`` and each
    token is passed through ``ISRIStemmer.suf32`` (documented by NLTK as
    removing length-three and length-two suffixes).

    Args:
        text: The string to process.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    return " ".join(_ISRI_STEMMER.suf32(token) for token in tokens)


df['full_text'] = df['full_text'].apply(lemmatize_arabic)

In [15]:
# Bag-of-words features: learn the vocabulary from the cleaned text and
# encode every row as token counts in a single pass
vectorizer = CountVectorizer()
cleaned_texts = df['full_text']
X = vectorizer.fit_transform(cleaned_texts)

# Targets: every column after the first one.
# NOTE(review): assumes column 0 is the text column and all remaining
# columns are label indicators -- confirm against the CSV layout.
y = df.iloc[:, 1:]


In [16]:

# Base estimator: a linear SVM with a generous iteration budget and
# verbose solver output
base_svm = LinearSVC(max_iter=10000, verbose=100)

# Wrap it in a one-vs-rest classifier, trained sequentially (n_jobs=1)
clf = OneVsRestClassifier(base_svm, verbose=100, n_jobs=1)

# Fit on the full feature matrix and label frame
clf.fit(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[LibLinear][Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.8s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   27.4s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   31.8s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   38.4s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   49.6s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   54.9s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.0min remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.1min remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.3min remaining:    0.0s
[LibLinea

In [17]:
# Score the classifier on the same X / y it was fitted on, so this is a
# training-set figure rather than a held-out estimate
training_score = clf.score(X, y)
print("Accuracy:", training_score)

Accuracy: 0.9180222393174348


In [18]:
# Persist the fitted classifier with joblib (written to the current
# working directory)
joblib_model_path = 'model.joblib'
joblib.dump(clf, joblib_model_path)

['model.joblib']

In [19]:
import pickle
# Save the trained classifier to disk with pickle. The context manager
# guarantees the file is flushed and closed even if pickling fails, instead
# of leaving an open handle behind.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [20]:
# Predict on X -- the same matrix the classifier was fitted on -- so the
# figures below describe training-set fit, not held-out performance.
y_pred = clf.predict(X)

# Build the per-label report as a dict. zero_division=0 keeps the default
# numeric result (0.0) for labels with an undefined metric, but states that
# choice explicitly instead of triggering the undefined-metric warning.
report = classification_report(
    y,
    y_pred,
    target_names=HAWAJEZ + STATUS_WORDS,
    output_dict=True,
    zero_division=0,
)

# Convert the report to a DataFrame: one row per label / average,
# one column per metric
df_report = pd.DataFrame.from_dict(report).transpose()

# Display the DataFrame
df_report


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
حوارة,0.999941,0.995711,0.997822,102592.0
دير شرف,0.991597,0.959231,0.975146,31740.0
صرة,0.99866,0.98382,0.991184,26514.0
دوار سلمان,0.998334,0.946891,0.971932,19620.0
بيت فوريك,0.999676,0.984134,0.991844,18782.0
شافي شمرون,0.99864,0.9801,0.989283,24723.0
الحمرا,0.999346,0.989204,0.994249,29362.0
جيت,0.972033,0.986116,0.979023,24848.0
عراق بورين,0.998986,0.97268,0.985658,19253.0
يتسهار,0.999221,0.993058,0.99613,37455.0


In [21]:
import pickle
# Save the fitted vectorizer to disk with pickle. The context manager
# guarantees the file is flushed and closed even if pickling fails, instead
# of leaving an open handle behind.
filename = 'vectorizer.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [26]:
# Load the model and the vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
import pickle
with open('finalized_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.sav', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Run the sample sentence through the same cleaning steps, in the same
# order, that were applied to the training text. Skipping the separator
# replacement, whitespace collapse or de-duplication would feed the model
# token counts that differ from what it saw during training.
sentance = "حوارة سال "
sentance = remove_arabic_stop_words(sentance)
sentance = sentance.replace("|||", " ")
sentance = " ".join(sentance.split())
sentance = remove_duplicate_in_text(sentance)
sentance = remove_any_numbers(sentance)
sentance = lemmatize_arabic(sentance)

# Encode with the fitted vocabulary and predict the label indicator row
sentance = vectorizer.transform([sentance])
model.predict(sentance)


array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])