In [9]:
import pandas as pd
import os,sys
parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(os.path.abspath(parent_path))
from constants import HAWAJEZ,STATUS_WORDS,STATUS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.feature_selection import SelectKBest, chi2
from utils.clean_data import remove_arabic_stop_words,remove_any_numbers
import nltk
from nltk.stem import ISRIStemmer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Fetch NLTK's 'punkt' data package (the call reports "already up-to-date" when it is installed).
# nltk.word_tokenize, used by lemmatize_arabic further down, presumably depends on it — TODO confirm
# against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nasser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the labelled dataset; the path is relative to the notebook's working directory.
# Later cells read the 'full_text' column as model input and use every column from
# position 1 onward as prediction targets.
df = pd.read_csv("../data/labeld_data_huge.csv")

In [4]:

# Text clean-up, pass 1:
#   1. run the project's Arabic stop-word remover over each message,
#   2. turn every "|||" separator into a space,
#   3. collapse any run of whitespace into a single space (also trims the ends).
df['full_text'] = (
    df['full_text']
    .apply(remove_arabic_stop_words)
    .apply(lambda raw: " ".join(raw.replace("|||", " ").split()))
)

# Remove duplicate words in text
def remove_duplicate_in_text(text):
    """Return `text` with repeated whitespace-separated words removed.

    The first occurrence of each word is kept and the remaining words stay in
    their original order. Deduplicating through a plain `set` would scramble
    the word order, and because string hashing is randomised per process the
    scrambled order would differ between kernel restarts, making the cleaned
    text non-reproducible.

    Parameters
    ----------
    text : str
        Input string; words are whatever `str.split()` yields.

    Returns
    -------
    str
        The unique words joined by single spaces ("" for empty input).
    """
    # dict preserves insertion order, so fromkeys() is an ordered de-duplication.
    return " ".join(dict.fromkeys(text.split()))

# Text clean-up, pass 2: keep a single copy of each word per message, then
# hand the result to the project's remove_any_numbers helper.
df['full_text'] = (
    df['full_text']
    .apply(remove_duplicate_in_text)
    .apply(remove_any_numbers)
)

# Normalise Arabic word forms
def lemmatize_arabic(text):
    """Tokenize `text` and rebuild it from suffix-stripped tokens.

    Every token produced by `nltk.word_tokenize` is passed through
    `ISRIStemmer.suf32`, and the results are joined with single spaces.
    Only that single stemmer step is applied, not `ISRIStemmer.stem`.

    NOTE(review): per the NLTK docs `suf32` strips length-3 then length-2
    suffixes, so this is partial stemming rather than lemmatization —
    confirm that is the intended normalisation.
    """
    isri = ISRIStemmer()
    return " ".join(isri.suf32(token) for token in nltk.word_tokenize(text))

df['full_text'] = df['full_text'].apply(lemmatize_arabic)

In [5]:
# Model input is the cleaned text; targets are every column from position 1
# onward (assumes those are all label columns — TODO confirm against the CSV).
text_column = df['full_text']
label_columns = df.iloc[:, 1:]

# Hold out 20% of the rows for evaluation, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    text_column,
    label_columns,
    test_size=0.2,
    random_state=12,
)

# Bag-of-words features with binary=True. The vocabulary is learned from the
# training texts only, and the test texts are encoded with that same vocabulary.
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [6]:

# One-vs-rest linear SVM: OneVsRestClassifier wraps a LinearSVC base estimator
# and is fitted against the multi-column target frame.
# random_state is pinned (same seed as the train/test split above) because
# LinearSVC shuffles the data during dual coordinate descent; without a seed
# the fitted model, and the scores reported below, can change from run to run.
clf = OneVsRestClassifier(
    LinearSVC(max_iter=10000, verbose=100, random_state=12),
    verbose=100,
    n_jobs=1,
)

# Train the classifier on the training data
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[LibLinear][Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.1s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.0s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   24.0s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   32.5s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   40.6s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   45.2s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   49.3s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   52.7s remaining:    0.0s
[LibLinear][Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.0min remaining:    0.0s
[LibLinea

In [7]:

# Score on the data the model was fitted on.
# NOTE(review): with multi-column targets scikit-learn's `score` is exact-match
# (subset) accuracy, not per-label accuracy — confirm that is the metric wanted.
train_accuracy = clf.score(X_train, y_train)
print("Train accuracy:", train_accuracy)

# Score on the held-out rows
test_accuracy = clf.score(X_test, y_test)
print("Test accuracy:", test_accuracy)

Train accuracy: 0.9179715364710819
Test accuracy: 0.9034757675361913


In [8]:
# Predict the label indicators for the held-out rows
y_pred = clf.predict(X_test)

# Per-label precision / recall / F1 as a nested dict.
# zero_division=0 keeps the same values as the default ("warn" also reports 0)
# but states the choice explicitly instead of emitting an undefined-metric
# warning for labels that have no predicted or no true samples.
# target_names assumes the label columns of y_test are ordered as HAWAJEZ
# followed by STATUS_WORDS — TODO confirm against the CSV header.
report = classification_report(
    y_test,
    y_pred,
    target_names=HAWAJEZ + STATUS_WORDS,
    output_dict=True,
    zero_division=0,
)

# One row per label (plus the averaged rows), one column per metric
df_report = pd.DataFrame.from_dict(report).transpose()

# Display the table as the cell output
df_report


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
حوارة,0.999902,0.994149,0.997017,20510.0
دير شرف,0.990922,0.956801,0.973563,6389.0
صرة,0.998427,0.980691,0.98948,5179.0
دوار سلمان,0.992254,0.943852,0.967448,3936.0
بيت فوريك,0.998914,0.985274,0.992047,3735.0
شافي شمرون,0.998536,0.976873,0.987585,4886.0
الحمرا,0.99798,0.987835,0.992882,6001.0
جيت,0.956805,0.976253,0.966431,4969.0
عراق بورين,0.998637,0.972392,0.98534,3767.0
يتسهار,0.998538,0.990771,0.99464,7585.0
