**<h3>Arabic Sentiment Twitter Corpus</h3>**
<h5>This notebook contains a simple task of text classification for Arabic tweets <br>
    Using logistic regression and support vector machine</h5>
<h5>You can find the dataset here:</h5> <a>https://www.kaggle.com/mksaad/arabic-sentiment-twitter-corpus/code </a>

In [12]:
import pandas as pd

train_positive = pd.read_csv('archive/train_Arabic_tweets_positive_20190413.tsv', sep='\t', header=None)
train_negative = pd.read_csv('archive/train_Arabic_tweets_negative_20190413.tsv', sep='\t', header=None)
train = pd.concat([train_positive, train_negative],ignore_index=True )
train.set_axis(['label', 'tweet'], axis=1, inplace=True)

test_positive = pd.read_csv('archive/test_Arabic_tweets_positive_20190413.tsv', sep='\t', header=None)
test_negative = pd.read_csv('archive/test_Arabic_tweets_negative_20190413.tsv', sep='\t', header=None)
test = pd.concat([test_positive, test_negative],ignore_index=True )
test.set_axis(['label', 'tweet'], axis=1, inplace=True)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [14]:
vectorizer = CountVectorizer()
logReg = LogisticRegression()
pipeline = make_pipeline(vectorizer, logReg)
pipeline.fit(train.tweet, train.label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())])

In [15]:
from sklearn import metrics

def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipeline, test.tweet, test.label)

              precision    recall  f1-score   support

         neg       0.77      0.82      0.80      5768
         pos       0.81      0.76      0.78      5752

    accuracy                           0.79     11520
   macro avg       0.79      0.79      0.79     11520
weighted avg       0.79      0.79      0.79     11520

accuracy: 0.792


In [16]:
# eli5 helps to explore sklearn models.
import eli5
# see weights for each feature.
eli5.show_weights(logReg, vec=vectorizer, top=20)

Weight?,Feature
+2.779,الإخونج
+2.414,وصباحك
+2.216,هالسنه
+2.093,ابريل
+2.092,السحب
+2.086,الزرقاء
+2.075,برونو
+2.029,اللوك
+1.887,الطيب
+1.880,حكمة


In [17]:
observation = test.iloc[1, :]
print(f"true label: {observation['label']}")
display(eli5.show_prediction(logReg, observation['tweet'], vec=vectorizer))

true label: pos


Contribution?,Feature
1.288,Highlighted in text (sum)
-0.334,<BIAS>


In [18]:
# try tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

Tfidf_Vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=.01, max_df=.3)
svc = LinearSVC()
pipeline = make_pipeline(Tfidf_Vectorizer, svc)
pipeline.fit(train.tweet, train.label)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.3, min_df=0.01,
                                 ngram_range=(3, 5))),
                ('linearsvc', LinearSVC())])

In [19]:
print_report(pipeline, test.tweet, test.label)

              precision    recall  f1-score   support

         neg       0.85      0.82      0.84      5768
         pos       0.83      0.86      0.84      5752

    accuracy                           0.84     11520
   macro avg       0.84      0.84      0.84     11520
weighted avg       0.84      0.84      0.84     11520

accuracy: 0.838


In [20]:
eli5.explain_weights(svc, vec=Tfidf_Vectorizer, top=20)

Weight?,Feature
+5.905,😂
+5.245,💙
+5.222,💛
+4.580,🌹
+3.866,🤣
+3.858,❤
+3.727,😍
+2.864,تويت
+2.555,💪
… 772 more positive …,… 772 more positive …


In [21]:
observation = test.iloc[1, :]
print('Actual label', observation['label'])
display(eli5.show_prediction(svc, observation['tweet'], vec=Tfidf_Vectorizer))

Actual label pos


Contribution?,Feature
0.721,Highlighted in text (sum)
0.084,<BIAS>
