In [1]:
import numpy as np 
import pandas as pd

In [5]:
df = pd.read_csv('ar_reviews_100k.tsv',sep='\t')

In [6]:
df.head()

Unnamed: 0,label,text
0,Positive,ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...
2,Positive,هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...
3,Positive,خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...
4,Positive,ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...


In [7]:
df.shape

(99999, 2)

In [8]:
df.label.value_counts()

label
Positive    33333
Mixed       33333
Negative    33333
Name: count, dtype: int64

In [9]:
text = df['text']
labels = df['label']

In [10]:
!pip install pyarabic



In [11]:
import nltk 
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords
import pyarabic.araby as pa

In [12]:
## Cleaning text data 

st = ISRIStemmer()


mystop_words = stopwords.words('arabic')
mystop_words.remove('مش')
mystop_words.remove('ليس')


def text_preprocessing(text):
    no_teshkel = pa.strip_tashkeel(text)
    hemza = pa.normalize_hamza(no_teshkel)
    txt_tokanize = pa.tokenize(hemza)
    text_no_stopwords = [i for i in txt_tokanize if i not in mystop_words]
    text_stem = [st.stem(i) for i in text_no_stopwords]
    final_text = ' '.join(text_stem)
    return final_text



In [13]:
text_cleaned = text.apply(text_preprocessing)

----

In [14]:
## Count vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
cv = CountVectorizer(max_features=3000)
words = cv.fit_transform(text_cleaned).toarray()

In [18]:
label = np.array(cv.get_feature_names_out())
ndf = pd.DataFrame(columns=label , data=words)

In [19]:
ndf.head()

Unnamed: 0,ءءة,ءءت,ءءثار,ءءخذ,ءءخر,ءءدم,ءءراء,ءءس,ءءل,ءءلاف,...,يمو,يمي,ينس,ينف,ينم,يني,يهد,يهم,يوج,يوم
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(ndf,labels,test_size=.20)

## Build Model

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


knn = KNeighborsClassifier()
knn.fit(X_train,y_train)


In [22]:
y_pred = knn.predict(X_test)

accuracy_score(y_test,y_pred)

0.52385

In [23]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train,y_train)
y_pred = nb.predict(X_test)
accuracy_score(y_test,y_pred)

0.4579

In [24]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()
lg.fit(X_train,y_train)
y_pred = lg.predict(X_test)
accuracy_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.63105

------

In [25]:
review = 'الفيلم جميل جدا'

cv_text = cv.transform([review]).toarray()

cv_text

array([[0, 0, 0, ..., 0, 0, 0]])

In [27]:
lg.predict(cv_text)[0]



'Positive'

In [28]:
review2 = '  الاكل كان بارد وطعمه مش حلو'

cv_text = cv.transform([review2]).toarray()

cv_text

array([[0, 0, 0, ..., 0, 0, 0]])

In [29]:
lg.predict(cv_text)[0]



'Mixed'