## How do sentiments towards COVID-19 vaccines compare before and after the emergence of the Omicron variant in the Arab world?
### A comparative Twitter sentiment analysis of the pre-Omicron and post-Omicron phases among Arabic-speaking users.

### Dependency - The analysis leverages CAMeL Tools
### CAMeL Tools is a suite of Arabic natural language processing tools developed by the CAMeL Lab at New York University Abu Dhabi.
#### pip3 install camel_tools

In [None]:
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.dediac import dediac_ar
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
import os
os.environ["CAMELTOOLS_DATA"] = "~/.camel_tools"
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

# Single-character sample string; it is not used by the steps below.
# NOTE(review): presumably meant as an input for normalize_unicode — confirm.
text = 'ﷺ'

# Placeholder input: replace with the text of a real tweet.
sentence = "sentence from tweet"

# Step 1: Unicode normalization of the raw sentence.
sent_norm = normalize_unicode(sentence)


# Step 2: Normalize alef variants to 'ا'.
# Fix: continue from `sent_norm`. The original passed `sentence` here, which
# silently discarded the result of normalize_unicode() above.
sent_norm = normalize_alef_ar(sent_norm)

# Step 3: Normalize alef maksura 'ى' to yeh 'ي'.
sent_norm = normalize_alef_maksura_ar(sent_norm)

# Step 4: Normalize teh marbuta 'ة' to heh 'ه'.
sent_norm = normalize_teh_marbuta_ar(sent_norm)


In [None]:
# Load the morphological database.
# The MorphologyDB database is used for analyzing Modern Standard Arabic.
db = MorphologyDB.builtin_db()

# Build an analyzer on top of the built-in database.
analyzer = Analyzer(db)

# Analyze a single sample word; the result is kept in `analyses`.
analyses = analyzer.analyze('موظف')

In [None]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
import arabicstopwords.arabicstopwords as stp
# The tokenizer expects pre-tokenized text
# ("sentence from tweet" is a placeholder: replace with a real tweet).
sentence = simple_word_tokenize("sentence from tweet")

# Load a pretrained disambiguator to use with a tokenizer
mle = MLEDisambiguator.pretrained('calima-msa-r13')

# Without providing additional arguments, the tokenizer will output undiacritized
# morphological tokens for each input word delimited by an underscore.
tokenizer = MorphologicalTokenizer(mle, scheme='d3tok')
tokens = tokenizer.tokenize(sentence)

# By specifying `split=True`, the morphological tokens are output as separate
# strings.
# Note: `tokenizer` and `tokens` are rebound here, replacing the previous result.
tokenizer = MorphologicalTokenizer(mle, scheme='d3tok', split=True)
tokens = tokenizer.tokenize(sentence)

# We can output diacritized tokens by setting `diac=True`
# (again rebinding `tokenizer` and `tokens`; only this last result is kept).
tokenizer = MorphologicalTokenizer(mle, scheme='d3tok', split=True, diac=True)
tokens = tokenizer.tokenize(sentence)

## Import raw data
### Please see Data_by_day notebook for the method by which the pre and post files were created
#### Tweets using the keywords “لقاحات” or “تطعيم” or “لقاح” or “تطعيمات”  - ”vaccines”, “inoculation”, ”vaccine”, “inoculations” 

In [None]:
import pandas as pd

# Pre-Omicron tweets: load the CSV (first column is the index) and keep only
# the rows whose text is known not to start with 'RT'.
df1_tweets = pd.read_csv('put_your_file_location_for_pre_omicron.csv', index_col=0)
df1_tweets = df1_tweets[df1_tweets['text'].str.startswith('RT').eq(False)]

# Post-Omicron tweets: same loading and filtering.
df2_tweets = pd.read_csv('put_your_file_location_for_post_omicron.csv', index_col=0)
df2_tweets = df2_tweets[df2_tweets['text'].str.startswith('RT').eq(False)]

In [35]:
# Read the Arabic stopword file into one string (the whole file content).
# Fix: decode explicitly as UTF-8 so that reading Arabic text does not depend
# on the platform's default encoding.
# NOTE(review): assumes ar_stopwords.txt is UTF-8 encoded — confirm.
with open('ar_stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read()

### Processing

In [None]:
from camel_tools.utils.normalize import normalize_unicode
# Remove stopwords and every character that is not a letter (Unicode category Lo), a decimal digit or a space, then strip common suffixes from each remaining word
def preprocess_ar(text):
    """Clean a collection of Arabic tweets for word-level analysis.

    For each entry the function:
      1. keeps only characters of Unicode category ``Lo`` (letters without
         case, which includes Arabic letters) or ``Nd`` (decimal digits);
         whitespace is kept as a word separator and everything else is dropped;
      2. drops stopwords (the ``arabicstopwords`` list plus two extra words)
         and one-character words;
      3. strips suffixes from the remaining words with ``ISRIStemmer.suf32``
         (suffix stripping, not full lemmatization);
      4. applies ``normalize_unicode`` to the rebuilt text.

    Args:
        text: Iterable of tweet strings (for example a pandas Series).

    Returns:
        list of str: one cleaned text per input entry, in input order. Every
        kept word is followed by a single space, so a non-empty result ends
        with a trailing space (callers that concatenate the results rely on
        it). Entries that are not strings produce ``''``.
    """
    # Imported locally because the notebook only imports these names in a
    # later cell than the one defining this function.
    import unicodedata as ud

    import arabicstopwords.arabicstopwords as stp
    from nltk.stem.isri import ISRIStemmer

    # Coerce to a set: the union then works whether the library hands back a
    # list or a set, and membership tests in the loop are O(1).
    my_stp = set(stp.stopwords_list()) | {u'كورونا', u'ان'}

    st = ISRIStemmer()
    kept_categories = ('Lo', 'Nd')

    processedText = []
    for t in text:
        if not isinstance(t, str):
            # Missing values (e.g. NaN) cannot be iterated; keep the output
            # aligned with the input by emitting an empty text.
            processedText.append('')
            continue

        # Map any whitespace (newlines, tabs, ...) to a plain space so that
        # words on either side are not glued together when it is removed.
        cleaned = ''.join(
            ' ' if c.isspace() else c
            for c in t
            if c.isspace() or ud.category(c) in kept_categories
        )

        # The stopword check is done on the original word, before stemming.
        words = [
            st.suf32(word)
            for word in cleaned.split()
            if len(word) > 1 and word not in my_stp
        ]
        commentwords = ''.join(word + ' ' for word in words)
        processedText.append(normalize_unicode(commentwords))

    return processedText

In [None]:
import time
from nltk.stem.isri import ISRIStemmer
import unicodedata as ud
import arabicstopwords.arabicstopwords as stp

# Reload the tweet data (first column is the index) and keep only the rows
# whose text is known not to start with 'RT'.
df1_tweets = pd.read_csv('put_your_file_location_for_tweet_data.csv', index_col=0)
df1_tweets = df1_tweets[df1_tweets['text'].str.startswith('RT').eq(False)]

# Timestamp taken right before preprocessing starts.
t = time.time()

# Clean every tweet text; the result is a list of processed strings.
processedtext_ar1 = preprocess_ar(df1_tweets['text'])

In [None]:
import csv

# Save the processed pre-Omicron texts, one text per CSV row.
# Fixes:
#  * writerows() treats each row as an iterable of fields, so passing bare
#    strings wrote one character per column; each text is now wrapped in a
#    one-element row.
#  * newline='' is what the csv module requires for files it writes to.
#  * explicit UTF-8 so the Arabic text does not depend on the platform default.
with open('put_your_file_location_for_pre_omicron_df.csv', 'w',
          newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerows([processed] for processed in processedtext_ar1)


In [None]:
# Load the post-Omicron tweets (first column is the index) and keep only the
# rows whose text is known not to start with 'RT'.
df2_tweets = pd.read_csv('put_your_file_location_for_data_by_day_post_omicron.csv', index_col=0)
df2_tweets = df2_tweets[df2_tweets['text'].str.startswith('RT').eq(False)]

# Timestamp taken right before preprocessing starts.
t = time.time()

# Clean every tweet text; the result is a list of processed strings.
processedtext_ar2 = preprocess_ar(df2_tweets['text'])

### Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ar_wordcloud import ArabicWordCloud
import nltk

### Visualization
#### Word Cloud

In [None]:
# Start from NLTK's Arabic stopword list.
stopwords = nltk.corpus.stopwords.words('arabic')
# Placeholder entry: replace 'Arabic stopwords' with the extra words to exclude.
stopwords.extend(['Arabic stopwords'])
# Fix: removed `stopwords.append(stopwords)`, which inserted the list into
# itself (a self-referential, unhashable element) instead of adding any words.
# NOTE(review): `stopwords` is not passed to the word cloud below — confirm
# whether it was meant to be.

# Plot the Arabic word cloud for the pre-Omicron texts.
awc = ArabicWordCloud(width=2000, height=1600, max_font_size=400, max_words=10000,
                      collocations=False, background_color='skyblue', colormap="Purples")
plt.figure(figsize=(16, 16))
# The processed texts are concatenated without a separator; each one already
# ends with a space.
wc_ar = awc.from_text(u''.join(processedtext_ar1))
plt.axis("off")
plt.imshow(wc_ar)
# Save the figure.
plt.savefig("WC1.png")

In [None]:
# Plot the Arabic word cloud for the post-Omicron texts (same settings as the
# pre-Omicron cloud) and save it to WC2.png.
awc = ArabicWordCloud(width=2000,height=1600,max_font_size=400,max_words=10000,collocations=False, background_color='skyblue', colormap="Purples")
plt.figure(figsize=(16,16))
# The processed texts are concatenated without any separator.
wc_ar = awc.from_text(u''.join(processedtext_ar2))
plt.axis("off")
plt.imshow(wc_ar)
plt.savefig("WC2.png")

In [None]:
# Identify the dialect of a sentence at city, country and region level.
from camel_tools.dialectid import DialectIdentifier

# Load the pretrained dialect identification model.
did = DialectIdentifier.pretrained()

# predict() is given a list of sentences; replace the placeholder with real tweets.
sentences = [
    'Sentence from tweet'
]

# City-level prediction.
predictions = did.predict(sentences, 'city')

# Country-level prediction (rebinds `predictions`).
predictions = did.predict(sentences, 'country')

# Region-level prediction (rebinds `predictions`; only this result is kept).
predictions = did.predict(sentences, 'region')

In [None]:
# Sentiment and dialect (city / country / region) for each pre-Omicron text.
# Requires `sa` (SentimentAnalyzer) and `did` (DialectIdentifier) to be loaded
# first; in this notebook `sa` is created in a later cell.
sen1 = []
# omit bad records using the following example - will require iterative runs.
nums = [10009, 13105, 14064, 16699, 16787, 1691, 16915, 17610, 17611 ]
# NOTE(review): the start index 16914 is hard-coded (presumably resuming an
# earlier partial run); use 0 to process every record.
for i in range(16914, len(processedtext_ar1)):
    if i in nums:
        continue
    text = processedtext_ar1[i]
    # Fix: both predict() methods take a LIST of sentences. Passing the bare
    # string made them iterate over its characters, so `[0]` was the
    # prediction for the first character only (and raised IndexError on an
    # empty text).
    batch = [text]
    sen1.append([
        sa.predict(batch)[0],
        did.predict(batch, 'city')[0].top,
        did.predict(batch, 'country')[0].top,
        did.predict(batch, 'region')[0].top,
    ])
   

In [None]:
#collect sentiments by city
# One row per processed tweet: predicted sentiment plus predicted city,
# country and region labels (the four values appended to `sen1`).
sentiments1 = pd.DataFrame( sen1, columns = ['sentiment', 'city', 'country', 'region' ] )
# Display the frame as the cell output.
sentiments1

In [None]:
# Stack the freshly computed sentiments on top of those stored in a CSV file.
# NOTE(review): the original row labels are kept (no ignore_index), so index
# values may repeat — confirm this is intended.
df1_all = pd.concat( [ sentiments1, pd.read_csv('file_location_sentiments.csv') ] )

In [None]:
from camel_tools.sentiment import SentimentAnalyzer
from camel_tools.dialectid import DialectIdentifier
from random import sample
# Load the pretrained dialect identifier and sentiment analyzer that the
# per-tweet prediction loops use.
did = DialectIdentifier.pretrained()
sa = SentimentAnalyzer.pretrained()

In [None]:
# Sentiment and dialect (city / country / region) for each post-Omicron text.
# Requires `sa` (SentimentAnalyzer) and `did` (DialectIdentifier) to be loaded.
sen2 = []
# omit bad records using the following example - will require iterative runs.
nums = [ 609, 2326, 2327, 2955, 2956, 4523, 4524, 4525,7187,7858, 8795,
         10871, 12531, 12532, 16521, 18575, 24564, 33337, 36830 ]
# NOTE(review): the start index 36827 is hard-coded (presumably resuming an
# earlier partial run); use 0 to process every record.
for i in range(36827, len(processedtext_ar2)):
    if i in nums:
        continue
    text = processedtext_ar2[i]
    # Fix: both predict() methods take a LIST of sentences. Passing the bare
    # string made them iterate over its characters, so `[0]` was the
    # prediction for the first character only (and raised IndexError on an
    # empty text).
    batch = [text]
    sen2.append([
        sa.predict(batch)[0],
        did.predict(batch, 'city')[0].top,
        did.predict(batch, 'country')[0].top,
        did.predict(batch, 'region')[0].top,
    ])

In [None]:
# One row per processed tweet: predicted sentiment plus predicted city,
# country and region labels (the four values appended to `sen2`).
sentiments2 = pd.DataFrame( sen2, columns = ['sentiment', 'city', 'country', 'region' ] )
# Display the frame as the cell output.
sentiments2

In [None]:
# Stack the freshly computed sentiments on top of those stored in a CSV file.
# NOTE(review): this reads the same placeholder path as the pre-Omicron cell;
# point it at the post-Omicron sentiments file — confirm.
df2_all = pd.concat( [  sentiments2,
                        pd.read_csv('file_location_sentiments.csv') ] )

In [None]:
# Display the combined post-Omicron sentiments as the cell output.
df2_all

### Frequencies

In [None]:
#create a matrix of sentiments by city
import matplotlib.pyplot as plt
from collections import Counter
import pprint as pp
# Column names of the sentiment frames.
titles = ['sentiment', 'city', 'country', 'region'] 
# The three sentiment labels used as column names in the pie-chart cells.
sentiment_type = ['positive', 'negative', 'neutral'] 

def build_matrix(sentiments):
    """Count rows per (first column, second column) pair.

    Every value is converted to ``str`` first. For the sentiment frames in
    this notebook the first column is the sentiment label and the second is
    the city.

    Args:
        sentiments: pandas DataFrame with at least two columns.

    Returns:
        dict: ``{first_value: {second_value: row_count}}``, with keys in order
        of first appearance.
    """
    tally = {}
    for record in sentiments.values.astype(str).tolist():
        outer_key, inner_key = record[0], record[1]
        per_outer = tally.setdefault(outer_key, {})
        per_outer[inner_key] = per_outer.get(inner_key, 0) + 1
    return tally

In [None]:
# Build a DataFrame of tweet counts from build_matrix(): one column per
# sentiment label and one row per city (pre-Omicron data).
df1 = pd.DataFrame( build_matrix( sentiments1 ))

df1
# Grouped bar chart: one group of bars per city, one bar per sentiment column.
# NOTE(review): the colours are applied in column order, which follows the
# order in which sentiment labels first appear in the data; the post-Omicron
# plot lists the same colours in reverse order — confirm the intended mapping.
ax= df1.plot.bar(color = ('#FF8C00', '#228B22','#1E90FF' ), figsize=(20, 10))
ax.set_xlabel('Cities', fontsize=30)
ax.set_ylabel('Frequency', fontsize=30)
plt.xticks(fontsize=16)
plt.legend(fontsize = 20)
plt.show()

In [None]:
# Plot sentiment frequency by city for the post-Omicron data: one column per
# sentiment label and one row per city.
df2 = pd.DataFrame( build_matrix( sentiments2 ))

df2
# Grouped bar chart: one group of bars per city, one bar per sentiment column.
# NOTE(review): the colours are applied in column order, which follows the
# order in which sentiment labels first appear in the data; the pre-Omicron
# plot lists the same colours in reverse order — confirm the intended mapping.
ax= df2.plot.bar(color = ('#1E90FF','#228B22','#FF8C00'), figsize=(20, 10))
ax.set_xlabel('Cities', fontsize=30)
ax.set_ylabel('Frequency', fontsize=30)
plt.xticks(fontsize=16)
plt.legend(fontsize = 20)
plt.show()

In [None]:
# Count how often each word occurs in the processed pre-Omicron tweets.
# Keys are the words with their characters reversed.
# NOTE(review): the reversal is presumably there so right-to-left Arabic reads
# correctly in matplotlib tick labels — confirm.
counts = dict()
for processed in processedtext_ar1:
    # Fix: split() instead of split(' '). Each processed text ends with a
    # trailing space, so split(' ') produced an empty-string "word" for every
    # tweet and that token was counted too.
    for word in processed.split():
        key = word[::-1]
        counts[key] = counts.get(key, 0) + 1

In [None]:
# Plot the most common words found in the pre-Omicron tweets (in Arabic).
# Wrap the word counts in a Counter to get most_common().
new_vocab1 = Counter( counts )

fig, ax = plt.subplots(figsize=(8, 8))

# Keep the 20 most frequent words; sorting ascending by count puts the most
# frequent word at the top of the horizontal bar chart.
clean_tweets = pd.DataFrame(new_vocab1.most_common(20), columns=['words', 'count'])
clean_tweets.sort_values(by='count').plot.barh(x='words',
                      y='count',
                      ax=ax,
                      color="brown")

ax.set_title("Common Words Found in Tweets (Including All Words)")
plt.show()
# Also print the top-20 table.
print(clean_tweets)

In [None]:
# Most common pre-Omicron words, translated to English, with their counts
# (values are hard-coded here rather than computed in this cell).
my_dict1 = {
    'vaccine': 15031,
    'vaccination': 3217,
    'inoculation': 2102,
    'god': 1520,
    'dose': 1448,
    'pfizer': 1389,
    'health': 1168,
    'me': 1108,
    'doses': 1106,
    'protected': 935,
    'that': 931,
    'the dose': 913,
    'a dose': 897,
}
# Horizontal bars: one per word, bar length = count.
plt.barh(list(my_dict1.keys()), list(my_dict1.values()))
plt.savefig("most_common_words1.png")
plt.show()

In [None]:
# Count and plot the most common words found in the post-Omicron tweets.
# Keys are the words with their characters reversed.
# NOTE(review): the reversal is presumably there so right-to-left Arabic reads
# correctly in matplotlib tick labels — confirm.
counts = dict()
for processed in processedtext_ar2:
    # Fixes:
    #  * removed the leftover `print(word)`, which dumped every token of every
    #    tweet to the cell output (the pre-Omicron cell has no such print);
    #  * split() instead of split(' '): each processed text ends with a
    #    trailing space, so split(' ') counted an empty-string "word" for
    #    every tweet.
    for word in processed.split():
        key = word[::-1]
        counts[key] = counts.get(key, 0) + 1

# Wrap the word counts in a Counter to get most_common().
new_vocab2 = Counter(counts)

fig, ax = plt.subplots(figsize=(8, 8))

# Keep the 20 most frequent words; sorting ascending by count puts the most
# frequent word at the top of the horizontal bar chart.
clean_tweets = pd.DataFrame(new_vocab2.most_common(20), columns=['words', 'count'])
clean_tweets.sort_values(by='count').plot.barh(x='words',
                      y='count',
                      ax=ax,
                      color="brown")

ax.set_title("Common Words Found in Tweets (Including All Words)")
plt.show()

In [None]:
# Most common post-Omicron words, translated to English, with their counts
# (values are hard-coded here rather than computed in this cell).
my_dict2 = {
    'vaccine': 39470,
    'presence': 10241,
    'dose': 7687,
    'the dose': 7610,
    'surrender': 5067,
    'drop': 4577,
    'vaccination': 4543,
    'accomplishment': 4532,
    ' a dose': 4505,
    'uptake': 4169,
    'the second': 4035,
    'first': 3484,
}

# Horizontal bars: one per word, bar length = count.
plt.barh(list(my_dict2.keys()), list(my_dict2.values()))
plt.savefig("most_common_words2.png")
plt.show()

### Sentiment Analysis

In [None]:
# Plot the sentiment distribution for the pre-Omicron tweets.
# Each total sums one sentiment column of df1 over all cities; the order
# (positive, negative, neutral) matches `labels` below.
sentiment_counts = (df1['positive'].sum(), df1['negative'].sum(), df1['neutral'].sum())
labels=["Positive","Negative", "Neutral"]
fig1, ax1 = plt.subplots(figsize=(8, 8))
# Pie chart with one-decimal percentage labels.
ax1.pie(sentiment_counts, labels=labels, autopct='%1.1f%%', shadow=False, startangle=90 )
# Equal aspect ratio.
ax1.axis('equal')  
plt.tight_layout()
plt.title('Sentiment Distribution for Pre-Omicron Dataset')
plt.savefig("sentiment1.png")
plt.show()

In [None]:
# Plot the sentiment distribution for the post-Omicron tweets.
# Each total sums one sentiment column of df2 over all cities; the order
# (positive, negative, neutral) matches `labels` below.
sentiment_counts = (df2['positive'].sum(), df2['negative'].sum(), df2['neutral'].sum())
labels=["Positive","Negative", "Neutral"]
#colors=["#ff9999","#99ff99"]
#explode = (0, 0.1)
fig1, ax1 = plt.subplots(figsize=(8, 8))
# Pie chart with one-decimal percentage labels.
ax1.pie(sentiment_counts, labels=labels, autopct='%1.1f%%', shadow=False, startangle=90)
# Equal aspect ratio.
ax1.axis('equal')  
plt.tight_layout()
plt.title('Sentiment Distribution for Post-Omicron Dataset')
plt.savefig("sentiment2.png")
plt.show()