In [None]:
# !pip install farasapy
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys

import pickle


%matplotlib inline

In [None]:
rand_seed = 0  # random state for reproducibility

In [None]:
np.random.seed(rand_seed)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# IMPORTANT: point experiment_folder at your own experiment folder.
# - On Google Colab, use a folder inside the Google Drive mounted above.
# - On a local machine, use any local path you like.
# The CSV inputs are read from this folder and the pickled models are written to it.
experiment_folder = '/content/drive/My Drive/PSU_sentiment_analysis_session'
# Make Python modules stored in the experiment folder importable
# (presumably where the `aranorm` helper imported later lives -- TODO confirm).
sys.path.append(experiment_folder)

In [None]:
# reading our prepared data
data = pd.read_csv(f'{experiment_folder}/balanced_data_psu.csv')
data = data.dropna()
data.head()

Unnamed: 0,Tweet_id,normalized_text,sentiment,text
0,1221776528259854336,و للمره المليون الاب والله اولا و عاشرا و اخيرا,Positive,و للمرة المليون الأب والله أولاً و عاشراً و أخ...
1,1217682671469461504,يا وحشه غيابك ويا شين فرقاه ويا كبر ذنبه لو هق...,Neutral,يا وحشة غيابك ويا شين فرقاه\nويا كبر ذنبه لو ه...
2,1146251606167556099,خيبكم الله حولتم بلاد التوحيد الي مزارات للدعا...,Negative,خيبكم الله ، حولتم بلاد التوحيد إلى مزارات للد...
3,1243483945917956097,من امس ارسل علي الخاص و اتصل علي لا يوجد اي تج...,Negative,@stccare_ksa @stc_ksa @stc من أمس ارسل على الخ...
4,1080541514470703105,اعوذ بالله من زوجه ما تبغي الطايف في شهر العسل,Neutral,أعوذ بالله من زوجة ما تبغى الطايف في شهر العسل


In [None]:
data = data[data['sentiment'] != 'Neutral']
data

Unnamed: 0,Tweet_id,normalized_text,sentiment,text
0,1221776528259854336,و للمره المليون الاب والله اولا و عاشرا و اخيرا,Positive,و للمرة المليون الأب والله أولاً و عاشراً و أخ...
2,1146251606167556099,خيبكم الله حولتم بلاد التوحيد الي مزارات للدعا...,Negative,خيبكم الله ، حولتم بلاد التوحيد إلى مزارات للد...
3,1243483945917956097,من امس ارسل علي الخاص و اتصل علي لا يوجد اي تج...,Negative,@stccare_ksa @stc_ksa @stc من أمس ارسل على الخ...
11,1223397954033213441,عنوان يوم الخميس ما ربوني اهلي وهذبت نفسي عبث,Positive,عنوان يوم الخميس ..\nما ربوني أهلي، وهذبت نفسي...
12,1223398487536152576,اخوياا اساسي انتا شريكي كده كده,Positive,@MAyymaan اخوياا اساسي انتا شريكي كده كده 😂😂😂
...,...,...,...,...
13522,1221883123367055360,مبروك يسلمونيلا ربنا يسعدك في حياتك,Positive,@Salmina97641778 مبروك يسلمونيلا ربنا يسعدك في...
13523,1250780092265115649,وزاره التعليم من زمان وهي علي نفس النهج تحمي ا...,Negative,وزاره التعليم من زمان وهي على نفس النهج تحمي ا...
13524,1080479443565662208,خطوتنا الثانيه يارب وفق دور ال الشباب يقابل ال...,Positive,خطوتنا الثانية يارب وفق ❤\n\n▪دور الــ ٣٢\n#ال...
13526,1147191534607065089,مءسف مشروع سكك السلام سيمر عبر مدينه نيوم يا ب...,Negative,"#مؤسف\nمشروع""#سكك_السلام"" سيمر عبر مدينة #نيوم..."


Now we need to split our dataset into training, validation, and testing sets. To do so, we will use the following function.
The function receives a Pandas DataFrame, a list of column names that represent our features, the name of the label column, and the
fraction of rows to place in the first part of the split. The function performs stratified sampling, i.e., it uses the label column to split the data into homogeneous groups, randomly samples the given fraction from each group, and combines the samples to form the split dataset.


In [1]:
def random_split(data, features, output, fraction, seed=0):
    """Split ``data`` into two label-stratified DataFrames.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the label column.
    features : list of str
        Names of the columns to carry into both parts.
    output : str
        Name of the label column; it is also used as the stratification key.
    fraction : float
        Passed to ``train_test_split`` as ``train_size`` (share of rows that
        goes to the first returned frame).
    seed : int, default 0
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The first and second part of the split, each with the ``features``
        columns followed by the ``output`` column.
    """
    labels = data[output]
    split_parts = train_test_split(
        data[features],
        labels,
        train_size=fraction,
        random_state=seed,
        stratify=labels,
    )
    first_features, second_features, first_labels, second_labels = split_parts

    def _with_labels(feature_rows, label_rows):
        # Rebuild a frame from the feature rows and re-attach the label column.
        frame = pd.DataFrame(data=feature_rows, columns=features)
        frame[output] = label_rows
        return frame

    train_data = _with_labels(first_features, first_labels)
    test_data = _with_labels(second_features, second_labels)
    return train_data, test_data

In [None]:
train_fraction = .80 # first split: 80% of the rows go to training, the remaining 20% to tmp
val_fraction = .50   # second split: tmp is halved into validation (50%) and testing (50%),
                     # so validation and testing each hold 10% of the original data

output = 'sentiment' # output label column
features = data.columns.tolist() # every column except the label is carried along as a "feature"
features.remove(output)
print('output:', output)
print('features:', features)

# Both splits are stratified on the label and use the same seed.
train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

# Sanity check: the three parts together must account for every row of data.
print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))

output: sentiment
features: ['Tweet_id', 'normalized_text', 'text']
7215
902
902
9019
9019


![](https://drive.google.com/uc?export=view&id=1MA1Y31ovLtG7Zia4YwfQCjGoyiZhtNz3)

In [None]:
train_data.head()

Unnamed: 0,Tweet_id,normalized_text,text,sentiment
7491,1242553660284035074,ممكن تقولون متي بيزين النت ترا ما صارت,@stc ممكن تقولون متى بيزين النت ترا ما صارت,Negative
12731,1221776751166140418,اقسم بالله افضل تعليق علي مباراه الاهلي فكرتنا...,اقسم بالله افضل تعليق علي مباراة الاهلي فكرتنا...,Positive
12978,1223397785589907462,كسوله ف كل حاجه معنديش مشكله بس مش لدرجه بكسل ...,كسوله ف كل حاجه معنديش مشكله بس مش لدرجه بكسل ...,Negative
924,1143266146319249408,فايبر ب ريال شركه عروضها ترويجيه وهميه وخدمه و...,#فايبر_STC_ب99ريال\nشركه عروضها ترويجيه وهميه ...,Negative
8657,1223398346511044609,انت خلصت الذبادي اللي بحطه ع وشي ذبادي ايه الل...,انت خلصت الذبادى اللي بحطه ع وشي:\nذبادي ايه ا...,Positive


In [None]:
# TF-IDF over word unigrams and bigrams (ngram_range=(1, 2)); max_df=0.5 is the
# document-frequency cut-off and sublinear_tf switches on sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
# vectorizer = CountVectorizer(ngram_range=(1, 2))

# Fit the vocabulary on the training split only; validation and test are merely transformed
# with it. .astype('U') casts the column values to unicode strings before vectorizing.
train_data_features = vectorizer.fit_transform(train_data['normalized_text'].values.astype('U'))
val_data_features = vectorizer.transform(val_data['normalized_text'].values.astype('U'))
test_data_features = vectorizer.transform(test_data['normalized_text'].values.astype('U'))


![](https://drive.google.com/uc?export=view&id=1wsPAz5ayzjAJVc-IpFqd-fQxn3SVDayR)

In [None]:
train_data_features.shape, val_data_features.shape, test_data_features.shape

((7215, 98258), (902, 98258), (902, 98258))


![](https://drive.google.com/uc?export=view&id=1JYQVODnJQJJ95owcQNAyZjvmRGho4WlX)

In [None]:
def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels):
    """Fit ``clf`` and print how it scores on the training and evaluation sets.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in place.
    train_features, train_labels
        Data used to fit the estimator and to report the training score.
    test_features, test_labels
        Data used to report accuracy and macro-averaged F1.

    Returns
    -------
    None
        All results are printed.
    """
    clf.fit(train_features, train_labels)

    print("score on training data:")
    print(clf.score(train_features, train_labels))
    print('_'*100)

    print("score on testing data:")
    predictions = clf.predict(test_features)

    # Each metric is evaluated only after its heading has been printed.
    reports = (
        ('accuracy_score: ', lambda truth, guess: accuracy_score(truth, guess)),
        ('f1_score: ', lambda truth, guess: f1_score(truth, guess, average='macro')),
    )
    for heading, metric in reports:
        print(heading)
        print(metric(test_labels, predictions))

In [None]:
logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9756063756063756
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7682926829268293
f1_score: 
0.767982129560685


In [None]:
mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9902979902979903
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7815964523281597
f1_score: 
0.7815639663029221


In [None]:
svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.995010395010395
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7671840354767184
f1_score: 
0.7669594744223813


In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
1.0
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7195121951219512
f1_score: 
0.7148775767480157


In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output])

Iteration 1, loss = 0.72957201
Iteration 2, loss = 0.67294843
Iteration 3, loss = 0.51706793
Iteration 4, loss = 0.23443257
Iteration 5, loss = 0.04008542
Iteration 6, loss = 0.00828595
Iteration 7, loss = 0.00373170
Iteration 8, loss = 0.00206441
Iteration 9, loss = 0.00131020
Iteration 10, loss = 0.00097879
Iteration 11, loss = 0.00079916
Iteration 12, loss = 0.00066818
Iteration 13, loss = 0.00059498
Iteration 14, loss = 0.00054414
Iteration 15, loss = 0.00050645
Iteration 16, loss = 0.00048056
Iteration 17, loss = 0.00045583
Iteration 18, loss = 0.00043734
Iteration 19, loss = 0.00042202
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
score on training data:
1.0
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7649667405764967
f1_score: 
0.7649101066131633


In [None]:
# adaboost = AdaBoostClassifier(n_estimators=100)
# train_n_test_classifier(adaboost, train_data_features, train_data[output],
#                         val_data_features, val_data[output])

In [None]:
# gboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# train_n_test_classifier(gboost, train_data_features, train_data[output],
#                         val_data_features, val_data[output])

In [None]:
# ensemble_model = VotingClassifier(estimators=[
#                                     ('rf',  rf),
#                                     ('mnb', mnb),
#                                     ('logistic_reg', logistic_reg),
#                                     ('svm', svm),
#                                     ('mlp', mlp)
#                                     ],
#                                   weights=[1, 2.5, 1.5, 2, 3],
#                                   voting='soft')


# train_n_test_classifier(ensemble_model, train_data_features,
#                         train_data[output], val_data_features, val_data[output])

    

In [None]:
# estimators = [
#                 ('rf',  rf),
#                 ('mnb', mnb),
#                 ('logistic_reg', logistic_reg),
#                 ('svm', svm),
#                 ('mlp', mlp)
#              ]

In [None]:
# def get_features_from_estimators(features, estimators):
#     clfs_outputs = []
#     for name, clf in estimators:
#         clfs_outputs.append(clf.predict_proba(features)[:, 0].reshape(-1, 1))
#     return np.concatenate(clfs_outputs, axis=1)

In [None]:
# train_data_estimators_features = get_features_from_estimators(train_data_features, estimators)
# val_data_estimators_features = get_features_from_estimators(val_data_features, estimators)
# test_data_estimators_features = get_features_from_estimators(test_data_features, estimators)

# train_data_estimators_features.shape, val_data_estimators_features.shape, test_data_estimators_features.shape

In [None]:
# en_clf = LogisticRegression(fit_intercept=True)
# en_clf = MLPClassifier(hidden_layer_sizes=(10,10), verbose=True, tol=0.001, random_state=0)
# en_clf = RandomForestClassifier()
# train_n_test_classifier(en_clf, train_data_estimators_features, train_data[output],
#                         val_data_estimators_features, val_data[output])

In [None]:
data = pd.read_csv(f'{experiment_folder}/unbalanced_data_psu.csv')
data.head()

Unnamed: 0,Tweet_id,normalized_text,sentiment,text
0,1221875106206638080,والله حسب الارقام سيكون مخيب للامال ولكن الاهل...,Positive,@nas_alharbi8 والله حسب الأرقام سيكون مخيب للآ...
1,1226422627436310528,الحب الحقيقي هو اقتسام بعض نفسك مع شخص اخر اقر...,Positive,الحب الحقيقي هو اقتسام بعض نفسك مع شخص أخر أقر...
2,1221880820815798277,النهضه في فتيل,Positive,@Mo_Fat7 النهضة في فتيل 😂
3,1221884400377499651,ليس حبا في ايران بقدر ماهو نكايه بترامب وحزبه,Neutral,@halgawi @DmfMohe ليس حباً في ايران بقدر ماهو ...
4,1221881406168731649,ابي اعرف الحاكم العربي المسلم اشلون ينام مايخا...,Neutral,@adalfahadduwail أبي أعرف الحاكم العربي المسلم...


In [None]:
data.groupby('sentiment').count()

Unnamed: 0_level_0,Tweet_id,normalized_text,text
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,4510,4510,4510
Neutral,20808,20808,20808
Positive,4607,4606,4607


In [None]:
positive_data = data[data['sentiment'] == 'Positive'].dropna()
negative_data = data[data['sentiment'] == 'Negative'].dropna()
neutral_data = data[data['sentiment'] == 'Neutral'].dropna()
len(positive_data), len(negative_data), len(neutral_data)

(4606, 4510, 20808)

In [None]:
# Merge the positive and negative tweets into a single "NonNeutral" class for the
# first-stage (Neutral vs. NonNeutral) classifier, then shuffle the rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
non_neutral_data = (
    pd.concat([positive_data, negative_data])
    .sample(frac=1)
    .reset_index(drop=True)
)
non_neutral_data['sentiment'] = 'NonNeutral'

In [None]:
# min_class = min([len(neutral_data), len(non_neutral_data)])

In [None]:
# neutral_data = neutral_data.sample(n=min_class).reset_index(drop=True)
# non_neutral_data = non_neutral_data.sample(n=min_class).reset_index(drop=True)
# neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)
# neu_data.head()

In [None]:
# Full (unbalanced) Neutral vs. NonNeutral dataset: stack both groups, drop rows with
# missing values, shuffle, and renumber the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
neu_data = (
    pd.concat([neutral_data, non_neutral_data])
    .dropna()
    .sample(frac=1)
    .reset_index(drop=True)
)
neu_data

Unnamed: 0,Tweet_id,normalized_text,sentiment,text
0,1251895249313697793,عندي استفسار علي الخاص ولم يرد علي احد معامله ...,Neutral,@mhrsd_care عندي استفسار على الخاص ولم يرد علي...
1,1221777689834590208,لا يلتقي الراءعون في بدايه العمر ابدا لا يلتقي...,NonNeutral,لاَ يلتقي الرائعونَ في بداية العمر أبداً ، لا ...
2,1221562400496934914,زيدان في الشوط الثاني لعبنا بشكل افضل وكنا اسر...,Neutral,"🎙زيدان:""في الشوط الثاني لعبنا بشكل أفضل وكنا أ..."
3,1252906650446581760,السلام عليكم لدي مدرسه اهليه ولدي معلمين يرغبو...,Neutral,@mhrsd_care السلام عليكم لدي مدرسة أهلية ولدي ...
4,1221883877884682246,انا اذا جعت احس بخمول ما اعصب الانتظار يقهر ما...,NonNeutral,@AdelAliBinAli أنا إذا جعت أحس بخمول ما أعصب \...
...,...,...,...,...
29919,1113864293760499713,رخصه القياده انتهت ولها الان اكثر من شهور منته...,Neutral,@eMoroor \n\nرخصة القيادة انتهت ولها الآن اكثر...
29920,1254769290974134275,السلام عليكم هل فعلا قسم الولاده بمستشفي الملك...,Neutral,@SaudiMOH \nالسلام عليكم \n\nهل فعلاً قسم الول...
29921,1242839689280851969,يعني انا ادفع علي نت الشريحه مبلغ وقدره عشان ي...,NonNeutral,@Mobily1100 يعني انا ادفع على نت الشريحه مبلغ ...
29922,1221904575722029056,كل شوق يمر بسلام الا الشوق للميت اللهم ارحم اب...,Neutral,كُل شوق يمر بسلام إلا الشوق للميت، اللهُم أرحم...


In [None]:
train_fraction = .80 # first split: 80% of the rows go to training, the remaining 20% to neu_tmp
val_fraction = .50   # second split: neu_tmp is halved into validation (50%) and testing (50%),
                     # so validation and testing each hold 10% of the original data

# seed = 0  # random state for reproducibility
output = 'sentiment' # output label column (Neutral / NonNeutral in neu_data)
features = neu_data.columns.tolist() # every column except the label is carried along as a "feature"
features.remove(output)
print('output:', output)
print('features:', features)

# Both splits are stratified on the label and use the same seed.
neu_train_data, neu_tmp = random_split(neu_data, features, output, train_fraction, rand_seed)
neu_val_data, neu_test_data = random_split(neu_tmp, features, output, val_fraction, rand_seed)

# Sanity check: the three parts together must account for every row of neu_data.
print(len(neu_train_data))
print(len(neu_val_data))
print(len(neu_test_data))
print(len(neu_train_data)+len(neu_val_data)+len(neu_test_data))
print(len(neu_data))

output: sentiment
features: ['Tweet_id', 'normalized_text', 'text']
23939
2992
2993
29924
29924


In [None]:
# Separate TF-IDF vectorizer for the Neutral vs. NonNeutral stage, configured like the
# polarity one (word unigrams + bigrams, max_df=0.5, sublinear term-frequency scaling).
neu_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
# neu_vectorizer = CountVectorizer(ngram_range=(1, 2))
# Fit on the training split only; validation and test are transformed with that vocabulary.
neu_train_data_features = neu_vectorizer.fit_transform(neu_train_data['normalized_text'].values.astype('U'))
neu_val_data_features = neu_vectorizer.transform(neu_val_data['normalized_text'].values.astype('U'))
neu_test_data_features = neu_vectorizer.transform(neu_test_data['normalized_text'].values.astype('U'))


In [None]:
neu_logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(neu_logistic_reg, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.8607711266134759
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7486631016042781
f1_score: 
0.6599835581991392


In [None]:
neu_mlp = MLPClassifier(hidden_layer_sizes=(100,100), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(neu_mlp, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

Iteration 1, loss = 0.54422918
Iteration 2, loss = 0.13787891
Iteration 3, loss = 0.01841150
Iteration 4, loss = 0.00853890
Iteration 5, loss = 0.00647483
Iteration 6, loss = 0.00536670
Iteration 7, loss = 0.00368105
Iteration 8, loss = 0.00313344
Iteration 9, loss = 0.00297656
Iteration 10, loss = 0.00233240
Iteration 11, loss = 0.00234145
Iteration 12, loss = 0.00220499
Iteration 13, loss = 0.00213740
Iteration 14, loss = 0.00197370
Iteration 15, loss = 0.00171805
Iteration 16, loss = 0.00153091
Iteration 17, loss = 0.00131570
Iteration 18, loss = 0.00131365
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
score on training data:
0.9999164543214002
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7332887700534759
f1_score: 
0.6796817982776238


In [None]:
neu_mnb = MultinomialNB()
train_n_test_classifier(neu_mnb, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.8512469192531016
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7169117647058824
f1_score: 
0.49354972476440856


In [None]:
neu_svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(neu_svm, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.9791971260286562
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7563502673796791
f1_score: 
0.6941828981909388


In [None]:
neu_rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(neu_rf, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.9999164543214002
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7479946524064172
f1_score: 
0.6587638727461609


In [None]:
# Persist the Positive/Negative vectorizer and classifiers to the experiment folder.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (the bare open(...) calls used before were never closed).
for artifact_name, artifact in (
    ('vectorizer', vectorizer),
    ('logistic_reg', logistic_reg),
    ('mnb', mnb),
    ('svm', svm),
    ('rf', rf),
    ('mlp', mlp),
):
    with open(f'{experiment_folder}/{artifact_name}.pkl', 'wb') as pkl_file:
        pickle.dump(artifact, pkl_file)

In [None]:
# Persist the Neutral/NonNeutral vectorizer and classifiers to the experiment folder.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (the bare open(...) calls used before were never closed).
for artifact_name, artifact in (
    ('neu_vectorizer', neu_vectorizer),
    ('neu_logistic_reg', neu_logistic_reg),
    ('neu_mnb', neu_mnb),
    ('neu_svm', neu_svm),
    ('neu_rf', neu_rf),
    ('neu_mlp', neu_mlp),
):
    with open(f'{experiment_folder}/{artifact_name}.pkl', 'wb') as pkl_file:
        pickle.dump(artifact, pkl_file)

# How To Use in Production

In [None]:
experiment_folder = '/content/drive/My Drive/PSU_sentiment_analysis_session'
sys.path.append(experiment_folder)

In [None]:
def _load_artifact(name):
    """Unpickle ``<experiment_folder>/<name>.pkl`` and close the file afterwards.

    SECURITY: pickle.load can execute arbitrary code while loading. Only use it on
    files you produced yourself (here: the artefacts dumped by this notebook).
    """
    with open(f'{experiment_folder}/{name}.pkl', 'rb') as pkl_file:
        return pickle.load(pkl_file)


# Stage 2 artefacts: Positive vs. Negative.
vectorizer = _load_artifact('vectorizer')
logistic_reg = _load_artifact('logistic_reg')
mnb = _load_artifact('mnb')
svm = _load_artifact('svm')
rf = _load_artifact('rf')
mlp = _load_artifact('mlp')

# Stage 1 artefacts: Neutral vs. NonNeutral.
neu_vectorizer = _load_artifact('neu_vectorizer')
neu_logistic_reg = _load_artifact('neu_logistic_reg')
neu_mnb = _load_artifact('neu_mnb')
neu_svm = _load_artifact('neu_svm')
neu_rf = _load_artifact('neu_rf')
neu_mlp = _load_artifact('neu_mlp')

In [None]:
def predict_multi_level(X, neu_vectorizer, neu_clf, vectorizer, clf):
    """Two-stage prediction: Neutral vs. NonNeutral first, then polarity.

    Every text is first classified by ``neu_clf``. Texts labelled
    ``'NonNeutral'`` are then re-classified by ``clf`` and the first-stage label
    is replaced by the second-stage one.

    Parameters
    ----------
    X : array-like of str
        Texts to classify. Lists and other sequences are accepted; they are
        converted with ``np.asarray`` so they can be filtered with a boolean mask.
    neu_vectorizer, neu_clf
        Fitted vectorizer (``transform``) and classifier (``predict``) for the
        first stage. ``neu_clf`` is expected to emit the label ``'NonNeutral'``
        for texts that should go through the second stage.
    vectorizer, clf
        Fitted vectorizer and classifier for the second stage.

    Returns
    -------
    numpy.ndarray
        One label per input text, dtype ``object``: the first-stage label, or the
        second-stage label where the first stage answered ``'NonNeutral'``.
    """
    X = np.asarray(X)
    neu_y_pred = np.asarray(neu_clf.predict(neu_vectorizer.transform(X)))
    non_neutral_mask = neu_y_pred == 'NonNeutral'

    # Work on an object-dtype copy: a fixed-width string array (e.g. '<U10') would
    # silently truncate any second-stage label longer than the first-stage labels.
    final_y_pred = neu_y_pred.astype(object)
    if non_neutral_mask.any():
        # classify non neutral into positive or negative
        final_y_pred[non_neutral_mask] = clf.predict(vectorizer.transform(X[non_neutral_mask]))

    return final_y_pred

In [None]:
# Evaluate the two-stage pipeline (neu_mlp for Neutral/NonNeutral, mnb for polarity)
# on the held-out split of the balanced data.
# NOTE(review): test_data was split from the frame after the Neutral rows had been filtered
# out, so y only contains Positive/Negative, while predict_multi_level may still answer
# 'Neutral'. That is likely why the macro-F1 reported below is far lower than the accuracy:
# it also averages over a class with no true samples. The commented-out "global_test_data"
# cells further down sketch an evaluation set that includes Neutral rows.
X = test_data.dropna()['normalized_text'].values
y = test_data.dropna()['sentiment'].values
pred_y = predict_multi_level(X, neu_vectorizer, neu_mlp, vectorizer, mnb)

In [None]:
print('accuracy_score: ')
print(accuracy_score(y, pred_y))

print('f1_score: ')
print(f1_score(y, pred_y, average='macro'))

accuracy_score: 
0.8569844789356984
f1_score: 
0.5985305491105954


In [None]:
# sampled_test1 = neu_test_data.sort_values(by='normalized_text')[neu_test_data['sentiment'] != 'NonNeutral'].reset_index(drop=True)
# sampled_test2 = test_data.sort_values(by='normalized_text').reset_index(drop=True)

In [None]:
# global_test_data = sampled_test1.append(sampled_test2).sample(frac=1).reset_index(drop=True)
# global_test_data

In [None]:
# X = global_test_data.dropna()['normalized_text'].values
# y = global_test_data.dropna()['sentiment'].values
# pred_y = predict_multi_level(X, neu_vectorizer, neu_mlp, vectorizer, mnb)

In [None]:
# print('accuracy_score: ')
# print(accuracy_score(y, pred_y))

# print('f1_score: ')
# print(f1_score(y, pred_y, average='macro'))

# Flask App To Serve Our Models

In [None]:
import aranorm as aranorm

from flask import Flask, flash, request, redirect, url_for
from werkzeug.utils import secure_filename
from flask_ngrok import run_with_ngrok
app = Flask(__name__)
run_with_ngrok(app)   


@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Serve the demo page (GET) or classify the submitted text (POST).

    POST expects a form field ``input_text``. The text is passed through
    ``aranorm.normalize_arabic_text`` and then through the two-stage pipeline
    (``neu_svm`` followed by ``mnb``, both notebook globals); the predicted label
    is returned as the plain-text response body.

    Returns
    -------
    str or tuple
        * GET: the HTML page with the input/output text boxes.
        * POST without ``input_text``: ``('No text found!', 400)``.
        * POST whose normalized text is empty: a plain-text hint for the user.
        * POST otherwise: the predicted sentiment label.
    """
    if request.method == 'POST':
        # Reject requests that do not carry the expected form field. The previous
        # flash()+redirect needed a session, and no app.secret_key is configured,
        # so that branch raised RuntimeError instead of reporting the problem.
        if 'input_text' not in request.form:
            return 'No text found!', 400

        text = request.form['input_text']
        # text = stemmer.stem(text)
        text = aranorm.normalize_arabic_text(text)
        if text == '':
            return 'Please, write an Arabic sentance. Symbols and non-Arabic characters will be removed from the text....'
        print(f'text: {text}')
        # The pipeline works on arrays, so wrap the single text and unwrap the single label.
        predicted_sentiment = predict_multi_level(np.array([text]), neu_vectorizer, neu_svm, vectorizer, mnb)
        predicted_sentiment = str(predicted_sentiment.squeeze())
        print("Predicted Sentiment:", predicted_sentiment)
        return predicted_sentiment
    
    return '''<!doctype html>
<title>تحليل المشاعر من التغريدات</title>
<script>
function myFunction()
{
    // clear the output text box from the text
    output_text_box = document.getElementById('output_text');
    output_text_box.innerHTML = '';
   
    var elements = document.getElementsByClassName("formVal");
    var formData = new FormData(); 
    
    for(var i=0; i<elements.length; i++)
    {
        formData.append(elements[i].name, elements[i].value);
    }
    var xmlHttp = new XMLHttpRequest();
        xmlHttp.onreadystatechange = function()
        {
            if(xmlHttp.readyState == 4 && xmlHttp.status == 200)
            {
                response = xmlHttp.responseText;
                output_text_box = document.getElementById('output_text');
                console.log(response);
                output_text_box.innerHTML = response;
            }
        }
        xmlHttp.open("post", "/"); 
        xmlHttp.send(formData); 
}
</script>
<h1>تحليل المشاعر من التغريدات</h1>
<p>مثال: أحب أمي و أبي</p>
<form method=post enctype=multipart/form-data>
  <textarea id="input_text"class='formVal' rows="5" cols="50" type="text" name="input_text" placeholder="التغريدة"></textarea> <br>
  <textarea id="output_text" class='formVal' rows="5" cols="50" type="text" name="output_text" placeholder="المشاعر المتوقعة"></textarea>
  <input type="submit" value="submit_now" onclick="myFunction(); return false;">
</form>

</html>
    '''
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://e6dd-35-237-164-154.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [19/Oct/2021 10:41:42] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Oct/2021 10:41:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [19/Oct/2021 10:42:08] "[37mPOST / HTTP/1.1[0m" 200 -


text: احب امي و ابي
Predicted Sentiment: Positive


127.0.0.1 - - [19/Oct/2021 10:42:11] "[37mPOST / HTTP/1.1[0m" 200 -


text: احب امي و ابي
Predicted Sentiment: Positive


127.0.0.1 - - [19/Oct/2021 10:42:41] "[37mPOST / HTTP/1.1[0m" 200 -


text: احب امي و ابي
Predicted Sentiment: Positive


127.0.0.1 - - [19/Oct/2021 10:42:59] "[37mPOST / HTTP/1.1[0m" 200 -


text: اكره البيتزا
Predicted Sentiment: Negative


127.0.0.1 - - [19/Oct/2021 10:43:21] "[37mPOST / HTTP/1.1[0m" 200 -


text: احب صديقي يوسف
Predicted Sentiment: Positive


127.0.0.1 - - [19/Oct/2021 10:43:38] "[37mPOST / HTTP/1.1[0m" 200 -


text: ضرب احمد محمد
Predicted Sentiment: Positive


127.0.0.1 - - [19/Oct/2021 10:43:57] "[37mPOST / HTTP/1.1[0m" 200 -


text: قتل محمد احمد
Predicted Sentiment: Positive


In [None]:
!pip freeze > requirements.txt

# Testing the Heroku Sentiment Analysis API That We've Created

In [None]:
api_link = 'https://psu-sentiment-session-1.herokuapp.com/'

In [None]:
import requests

In [None]:
sampled_test_set_api = test_data.head(10).reset_index(drop=True)
sampled_test_set_api.head(10)

Unnamed: 0,Tweet_id,normalized_text,text,sentiment
0,1223395720927023104,انا معكم من امس وانا الهم بندول والحين ودي بمو...,@2013_zahraaa @nada9sa أنا معكم من أمس وأنا أل...,Negative
1,1171483323111096320,شي مخجل منتخب فضيحه ياحسافه الملايين اتمني الت...,شي مخجل 👎🏼 منتخب فضيحة .ياحسافة الملايين .اتمن...,Negative
2,1245162434228346888,اسوء بنك محد بيخدمك كل الموظفين يرمون العملاء ...,@AlAhliNCB اسوء بنك محد بيخدمك كل الموظفين يرم...,Negative
3,1223167134915801090,احبكمممم تكفون تعالو في مدينه اسمه القويعيه تك...,@McDonaldsKSA احبكمممم تكفون تعالو في مدينه اس...,Positive
4,1245164841486221312,اني متاكد كورونا من صناعه النساء لاسباب وقفت د...,أني متأكد كورونا من صناعة النساء لأسباب\n1_وقف...,Positive
5,1221087434131365893,تخسون اقاطع الحب سوق كوم وربي انه جبر خاطر عند...,تخسون اقاطع الحب سوق كوم وربي انه جبر خاطر عند...,Positive
6,1221786977508433920,كلللل شي في هالحياه ضدي كل مره اقول بيتعدل كل ...,كلللل شي في هالحياه ضدي كل مره اقول بيتعدل كل ...,Negative
7,1132185940519211008,وش الفايده من هذا العرض العرض يفيدك باول اشهر ...,وش الفايدة من هذا العرض العرض يفيدك بأول 6 اشه...,Negative
8,1080191970134450177,هي مصر كلها كانت ف امبارح,هي مصر كلها كانت ف water way امبارح😂,Positive
9,1153131254423924736,ادمنت افلام و مسلسلات و رياضه مش عشان بستمتع ف...,ادمنت افلام و مسلسلات و رياضة مش عشان بستمتع ف...,Negative


In [None]:
# Send each sampled raw tweet to the deployed API and print its answer next to the
# ground-truth label.
for i in range(len(sampled_test_set_api)):
  tweet = sampled_test_set_api.iloc[i]['text']
  sentiment = sampled_test_set_api.iloc[i]['sentiment']
  # Always pass a timeout: without one, requests waits forever if the remote app
  # never answers, which would hang this cell.
  api_response = requests.post(url=api_link, data={'input_text': tweet}, timeout=60)
  predicted_sentiment = api_response.text
  print('Tweet:', tweet)
  print('Real Sentiment:', sentiment)
  print('Predicted Sentiment:', predicted_sentiment)
  print('-'*85)

Tweet: @2013_zahraaa @nada9sa أنا معكم من أمس وأنا ألهم بندول وألحين ودي بموية وموب قادرة أنزل من التعب 🤒
Real Sentiment: Negative
Predicted Sentiment: Negative
-------------------------------------------------------------------------------------
Tweet: شي مخجل 👎🏼 منتخب فضيحة .ياحسافة الملايين .اتمنى التركيز على رياضة اخرى بدل هالرياضة اللي فتحت رزق للحوج المزاجيين
 #السعوديه_اليمن
Real Sentiment: Negative
Predicted Sentiment: Negative
-------------------------------------------------------------------------------------
Tweet: @AlAhliNCB اسوء بنك محد بيخدمك كل الموظفين يرمون العملاء على بعض
Real Sentiment: Negative
Predicted Sentiment: Negative
-------------------------------------------------------------------------------------
Tweet: @McDonaldsKSA احبكمممم تكفون تعالو في مدينه اسمه القويعيه تكفون افتحو ماك في مدينه القويعيه او في القويعيه مول
Real Sentiment: Positive
Predicted Sentiment: Positive
-------------------------------------------------------------------------------------
Tw