In [None]:
!pip3 install -r requirements.txt

In [None]:
import os,string
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score,classification_report,f1_score
from sklearn.model_selection import train_test_split
from wordEmbeddings_and_classificationModels import prepare_data_for_word_vectors,building_word_vector_model,\
classification_model,cnn_classification_model,lstm_classification_model,bilstm_classification_model,\
padding_input,Embed,ELMoEmbedding,data_prep_ELMo,Classification_model_with_ELMo

current_path=os.getcwd()

In [None]:
def json_to_dict(json_set):
    """Turn the string values "True"/"False" of a mapping into real booleans.

    The mapping is modified in place and the same object is returned.
    Only top-level values are inspected; every other value is left as it is.
    """
    # Iterate over a snapshot of the items; only existing keys are reassigned.
    for key, value in list(json_set.items()):
        if value == "True":
            json_set[key] = True
            continue
        if value == "False":
            json_set[key] = False
    return json_set

# Load the run configuration and convert its "True"/"False" strings to booleans.
with open("config.json","r") as f:
    params_set = json.load(f)
params_set = json_to_dict(params_set)


# Load the model hyper-parameters and apply the same boolean conversion.
with open("model_params.json", "r") as f:
    model_params = json.load(f)
model_params = json_to_dict(model_params)

In [None]:
# Display the loaded run configuration.
params_set

In [None]:
# Display the loaded model hyper-parameters.
model_params

In [None]:
# Inspect the second entry (index 1) of the configured "loss" setting.
model_params["loss"][1]

In [None]:
# Embedding options to run; train_model iterates over this module-level value.
options = params_set["option"]
options

#### Option Embedding Parameter:
- 0  : Word2vec, 
- 1  : gensim FastText, 
- 2  : Fasttext 2018, 
- 3  : GloVe, 
- 4  : pre-trained Word2vec, 
- 5  : word2vec + POS.
- 6  : glove + POS
- 7  : gensim fasttext + POS
- 8  : Fasttext 2018 + POS
- 9  : Elmo only 
- 10 : Elmo + POS 

- 11 : Elmo + word2vec
- 12 : Elmo + pos + word2vec
- 13 : Elmo + glove
- 14 : Elmo + fasttext gensim
- 15 : Elmo + pos + glove
- 16 : Elmo + pos + fasttext gensim

- 17 : Elmo + character embedding
- 18 : Elmo + POS + character embedding
- 19 : Elmo + Glove + POS + character embedding
- 20 : Elmo + word2vec + POS + character embedding
- 21 : Elmo + fasttext gensim + pos + character embedding


#### Neural Network Cell Parameter: (For options 0,1,2,3,4)
- 0 for simple 2 layers MLP
- 1 for lstm
- 2 for bilstm
- 3 for cnn



In [None]:
import keras
from keras import backend as K
from sklearn.metrics import roc_auc_score
from keras.preprocessing.text import text_to_word_sequence


# K.clear_session()
# tf.reset_default_graph()

class Histories(keras.callbacks.Callback):
    """Keras callback that prints the decayed learning rate after every epoch.

    All hooks other than ``on_train_begin`` and ``on_epoch_end`` are no-ops.
    ``logs`` defaults to ``None`` in every hook (instead of a shared mutable
    ``{}``); none of the hooks reads it.
    """

    def on_train_begin(self, logs=None):
        # Reset the per-run containers. Nothing in this class appends to them.
        self.aucs = []
        self.losses = []

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        # NOTE(review): relies on the optimizer exposing `lr`, `decay` and
        # `iterations` attributes - confirm for the Keras version in use.
        lr = self.model.optimizer.lr
        decay = self.model.optimizer.decay
        iterations = self.model.optimizer.iterations
        # Effective rate: lr / (1 + decay * iterations).
        lr_with_decay = lr / (1. + decay * K.cast(iterations, K.dtype(decay)))
        print(K.eval(lr_with_decay))

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        return


# Single callback instance shared by every model.fit call in this notebook.
histories = Histories()

In [None]:
def train_model(train_data,X,y,max_len) :
    """Train, save and score one classifier per configured embedding option and network cell.

    For every entry of the module-level ``options`` and every entry of
    ``model_params["nn_cell"]`` the function builds the word-vector model,
    splits and pads the data, builds the model that matches the option, fits
    it, saves it as an ``.h5`` file in the working directory and prints macro
    F1, accuracy and a classification report for the held-out split.

    Only options 0-10 (and ``nn_cell`` 0-3 for options 0-4) are implemented;
    anything else is reported and skipped instead of being scored with stale
    or undefined predictions.

    Relies on module-level state: ``options``, ``params_set``, ``model_params``
    and the ``histories`` callback. ``params_set["vocab_size"]`` is overwritten
    as a side effect.

    Args:
        train_data: DataFrame with a ``text`` column; used to size the
            vocabulary and forwarded to ``building_word_vector_model``.
        X: texts forwarded to ``prepare_data_for_word_vectors`` (and to the
            word-vector / CNN builders).
        y: labels aligned with ``X``; rows are reduced with ``np.argmax`` for
            scoring (the notebook passes one-hot labels).
        max_len: forwarded to ``data_prep_ELMo`` (options 9 and 10 only);
            padding everywhere uses ``params_set["max_len"]`` instead.

    Returns:
        None. Results are printed and models are written to disk.
    """
    # File-name stems used when saving the single-embedding models (options 0-4).
    single_embedding_names = {
        0: 'Word2vec_Model',
        1: 'Gensim_Fastext_Model',
        2: 'Fasttext_2018_Model',
        3: 'GloVe_Model',
        4: 'pre_trained_Word2vec_Model',
    }
    # option -> (file-name stem, wording of the "Building ..." message,
    #            wording of the "... is built" message) for options 5-8.
    pos_model_labels = {
        5: ('Word2vec_POS_Model', "Word2vec & POS", "Word2vec & POS"),
        6: ('Glove_POS_Model', "Glove & POS", "Glove & POS"),
        7: ('Fasttext_POS_Model', "Fasttext & POS", "Fasttext & POS"),
        8: ('Fasttext_2018_POS_Model', "Fasttext 2018 + POS", "Fasttext 2018 & POS"),
    }

    # Vocabulary size = number of distinct lower-cased whitespace tokens + 1.
    results = set()
    train_data['text'].str.lower().str.split().apply(results.update)
    words = results  # set(text_to_word_sequence(text))
    vocab_size = len(words) + 1
    params_set["vocab_size"] = vocab_size
    for option in options:
        print("Option:",option)
        if option not in range(11):
            # Options 11-21 are listed in the notebook but have no branch below.
            print("Option {} is not implemented; skipping".format(option))
            continue
        for nn_cell in  model_params["nn_cell"]:
            sentences_as_words,sentences,word_ix = prepare_data_for_word_vectors(X)
            print("Sentences loaded")
            model_wv = building_word_vector_model(option,sentences_as_words,params_set["embed_dim"],
                                               params_set["workers"],params_set["window"],train_data,X,y)
            print("Word vector model built")
            x_train, x_test, y_train, y_test = train_test_split(sentences, y, 
                                                                test_size=params_set["split_ratio"], 
                                                                random_state=42)
            print("Data split done")
            x_train_pad,x_test_pad = padding_input(x_train,x_test,params_set["max_len"])

            embed = Embed(params_set["vocab_size"],params_set["embed_dim"],
              params_set["pos_embed_dim"],params_set["max_len"],True)

            if option in [0,1,2,3,4]:
                # NOTE(review): the builders below receive params_set["option"]
                # (the whole configured value), not the current `option` - confirm
                # that this is what they expect.
                if nn_cell == 0:
                    print("Building simple MLP model")
                    model = classification_model(params_set["embed_dim"],x_train_pad,x_test_pad,
                                 y_train,y_test,params_set["vocab_size"],word_ix,model_wv,
                                 params_set["trainable_param"],params_set["option"])
                    print("Simple MLP Model is built")
                elif nn_cell == 1:
                    print("Building LSTM model")
                    model = lstm_classification_model(params_set["embed_dim"],x_train_pad,x_test_pad,
                                 x_train,y_train,y_test,
                                 params_set["vocab_size"],word_ix,model_wv,
                                 params_set["trainable_param"],params_set["option"])
                    print("LSTM Model is built")

                elif nn_cell == 2:
                    print("Building BiLSTM model")
                    model = bilstm_classification_model(params_set["embed_dim"],x_train_pad,x_test_pad,
                                 x_train,y_train,y_test,
                                 params_set["vocab_size"],word_ix,model_wv,
                                 params_set["trainable_param"],params_set["option"])
                    print("BiLSTM Model is built")

                elif nn_cell == 3:
                    print("Building CNN model")
                    model = cnn_classification_model(X,y,x_train_pad,x_test_pad,
                                 y_train,y_test,params_set["vocab_size"])

                    print("CNN Model is built")

                else:
                    # Without this guard `model` would be undefined or left over
                    # from a previous iteration.
                    print("nn_cell {} is not implemented; skipping".format(nn_cell))
                    continue

                print(model.summary())
                print("Traning Model...")
                model.fit(x_train_pad, y_train, epochs= model_params["epochs"],batch_size=model_params["batch_size"], verbose=1, 
                          validation_data=(x_test_pad, y_test),callbacks=[histories])

                #save model
                model.save(single_embedding_names[option] + '.h5')

                predictions = model.predict(x_test_pad)

                print("=======================================================================================================")

            elif option in [5,6,7,8]:
                inp_seq,sent_emb = embed.embed_sentences(word_ix,model_wv,False,x_train_pad)
                pos_enc = embed.tag_pos(sentences_as_words)
                print("POS encoded")
                # Same test_size and random_state as the text split above, so the
                # POS rows are split with identical settings.
                x_train_pos, x_test_pos, _, _ = train_test_split(pos_enc, y, 
                                                                 test_size=params_set["split_ratio"], 
                                                                 random_state=42)
                x_train_pos_pad,x_test_pos_pad = padding_input(x_train_pos,x_test_pos,params_set["max_len"])
                print("POS padded")

                inp_pos,pos_embed = embed.embed_pos(x_train_pos_pad)

                model_name, building_label, built_label = pos_model_labels[option]
                print("Building a combined {} model".format(building_label))

                combined_model = Embed.pos_model_build(inp_seq,inp_pos,sent_emb,pos_embed,x_train_pad,x_train_pos_pad,y_train,
                                    model_params["epochs"],model_params["batch_size"],x_test_pad,x_test_pos_pad,y_test)
                print("A combined {} model is built".format(built_label))
                print(combined_model.summary())

                print("Traning Combined Model...")
                combined_model.fit([x_train_pad, x_train_pos_pad], y_train, 
                                   epochs=model_params["epochs"],batch_size=model_params["batch_size"],
                                  validation_data=([x_test_pad, x_test_pos_pad], y_test),callbacks=[histories])

                #save model
                combined_model.save(model_name + '.h5')

                predictions = combined_model.predict([x_test_pad, x_test_pos_pad])

                print("=======================================================================================================")

            elif option == 9 :
                train_text,train_label,test_text,test_label = data_prep_ELMo(x_train,y_train,x_test,y_test,max_len,word_ix)
                print("Building Elmo model")
                elmo_model = Classification_model_with_ELMo(train_text,train_label,
                                       test_text,test_label,
                                        vocab_size,
                                       epochs=model_params["epochs"],
                                       batch_size=model_params["batch_size"])
                print("Elmo model is built")
                print(elmo_model.summary())

                print("Traning Elmo Model...")
                elmo_model.fit(train_text,train_label,epochs=model_params["epochs"],batch_size=model_params["batch_size"],
                          validation_data=(test_text,test_label),callbacks=[histories])

                #save model
                elmo_model.save('Elmo_Model.h5')

                predictions = elmo_model.predict(test_text)

                print("=======================================================================================================")

            elif option == 10 :
                train_text,train_label,test_text,test_label = data_prep_ELMo(x_train,y_train,x_test,y_test,max_len,word_ix)
                elmo_model = Classification_model_with_ELMo(train_text,train_label,
                                       test_text,test_label,
                                        vocab_size,
                                       epochs=model_params["epochs"],
                                       batch_size=model_params["batch_size"])

                # NOTE(review): the ELMo classifier is passed where the other
                # branches pass the word-vector model - confirm embed_sentences
                # supports this.
                inp_seq,sent_emb = embed.embed_sentences(word_ix,elmo_model,False,x_train_pad)
                pos_enc = embed.tag_pos(sentences_as_words)
                print("POS encoded")
                x_train_pos, x_test_pos, _, _ = train_test_split(pos_enc, y, 
                                                                 test_size=params_set["split_ratio"], 
                                                                 random_state=42)
                x_train_pos_pad,x_test_pos_pad = padding_input(x_train_pos,x_test_pos,params_set["max_len"])
                print("POS padded")

                inp_pos,pos_embed = embed.embed_pos(x_train_pos_pad)

                print("Building a combined Elmo & POS model")
                combined_model = Embed.pos_model_build(inp_seq,inp_pos,sent_emb,pos_embed,x_train_pad,x_train_pos_pad,y_train,
                                    model_params["epochs"],model_params["batch_size"],x_test_pad,x_test_pos_pad,y_test)

                print("A combined Elmo & POS model is built")
                print(combined_model.summary())

                print("Traning Combined Elmo & POS Model...")    
                combined_model.fit([x_train_pad, x_train_pos_pad], y_train, 
                                   epochs=model_params["epochs"],
                                   batch_size=model_params["batch_size"],
                                   validation_data=([x_test_pad, x_test_pos_pad], y_test),callbacks=[histories])

                # evaluate the model that was just trained (the two-input combined
                # model); `model` is not defined in this branch
                print("Evaluate ..")
                loss, accuracy = combined_model.evaluate([x_train_pad, x_train_pos_pad], y_train, verbose=1)
                print('Accuracy: %f' % (accuracy*100))

                # save model
                combined_model.save('Elmo_POS_Model.h5')

                # make predictions from the model
                print("Make predicttions ..")                
                predictions = combined_model.predict([x_test_pad, x_test_pos_pad])

            # Reduce predicted scores and label rows to class indices before scoring.
            predicted_class = np.argmax(predictions, axis=1).tolist()
            y_test_classes = [ np.argmax(i) for i in y_test]

            print("F1-Score: ",f1_score(y_test_classes, predicted_class, average='macro')  )
            print("Accuracy: ",accuracy_score(y_test_classes, predicted_class))
            print("Classification Report: ",classification_report(y_test_classes,predicted_class))
            print("=======================================================================================================")


# covid_tweets_with_sentiments_2021 dataset

In [None]:
# Read the tweets CSV; the first CSV column is used as the row index.
twitter_train_data_path = "covid_tweets_with_sentiments_2021-08-26.csv"
twitter_df = pd.read_csv(twitter_train_data_path, index_col = 0)
twitter_df.head()

In [None]:
# Integer id for every sentiment label, plus the reverse lookup (id -> label).
sentiments_ids = {"positive":1, "negative":-1,"neutral":0}
ids_sentiments = {sentiment_id: label for label, sentiment_id in sentiments_ids.items()}

# Keep only the rows that have both a sentiment and a text.
has_sentiment_and_text = twitter_df['sentiment'].notna() & twitter_df['text'].notna()
twitter_df = twitter_df[has_sentiment_and_text]

# Swap the label strings for their integer ids.
twitter_df['sentiment'] = twitter_df['sentiment'].replace(sentiments_ids).astype(int)
twitter_df.tail()

In [None]:
from tensorflow.keras.utils import to_categorical


# Number of leading rows used for this quick experiment.
n_samples = 100

X = np.array(list(twitter_df.text.iloc[:n_samples]))
y = np.array(twitter_df.sentiment.iloc[:n_samples])

# to_categorical expects class ids 0..num_classes-1. Passing the raw ids
# (-1, 0, 1) makes -1 index the last column, so "negative" and "positive"
# end up with the same one-hot row. Shift the ids so the smallest becomes 0
# and fix the width to the number of sentiments.
# Resulting class index = sentiment id - label_offset.
# NOTE(review): the label matrix is now 3 columns wide (it was 2) - confirm the
# model builders take their output size from y.
label_offset = min(sentiments_ids.values())
y_binary = to_categorical(y - label_offset, num_classes=len(sentiments_ids))

# Length in characters of the longest text in the sample. np.max on strings
# returns the lexicographically greatest text, not the longest one.
# NOTE(review): confirm whether data_prep_ELMo expects characters or tokens here.
max_len = int(twitter_df.text.iloc[:n_samples].str.len().max())
train_model(twitter_df,X,y_binary,max_len)

#### APIs for covid-19 cases stats. per country 

In [None]:
def all_world_population_countries_names():
    """Fetch the country-name list from the RapidAPI world-population service.

    The API key is read from the ``RAPIDAPI_KEY`` environment variable so that
    no credential is stored in the notebook.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: if ``RAPIDAPI_KEY`` is not set or empty.
    """
    api_key = os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError("Set the RAPIDAPI_KEY environment variable to call the world-population API")

    url = "https://world-population.p.rapidapi.com/allcountriesname"
    headers = {
        'x-rapidapi-key': api_key,
        'x-rapidapi-host': "world-population.p.rapidapi.com"
        }
    # Same 10 s timeout as the other fetch helpers, so a stalled server cannot hang the cell.
    response = requests.request("GET", url, headers=headers, timeout=10)
    return response.json() 


def get_world_population_per_country(country):
    """Fetch population data for one country from the RapidAPI world-population service.

    The API key is read from the ``RAPIDAPI_KEY`` environment variable so that
    no credential is stored in the notebook.

    Args:
        country: value sent as the ``country_name`` query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: if ``RAPIDAPI_KEY`` is not set or empty.
    """
    api_key = os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError("Set the RAPIDAPI_KEY environment variable to call the world-population API")

    url = "https://world-population.p.rapidapi.com/population"
    querystring = {"country_name":country}
    headers = {
        'x-rapidapi-key': api_key,
        'x-rapidapi-host': "world-population.p.rapidapi.com"
        }
    # Same 10 s timeout as the other fetch helpers, so a stalled server cannot hang the cell.
    response = requests.request("GET", url, headers=headers, params=querystring, timeout=10)
    return response.json()


def get_covid19api_all_countries():
    """Return the decoded JSON body of covid19api's ``/countries`` endpoint.

    The GET request carries no extra headers or payload and uses a 10 second timeout.
    """
    return requests.request(
        "GET",
        "https://api.covid19api.com/countries",
        headers={},
        data={},
        timeout=10,
    ).json()


def get_covid19api_stats_country(country , from_date , to_date ):
    """Request the covid19api totals for one country in a date range.

    Args:
        country: inserted into the URL path after ``/total/country/``.
        from_date: value of the ``from`` query parameter.
        to_date: value of the ``to`` query parameter.

    Returns:
        The raw response object (not the decoded JSON), as before.
    """
    url = "https://api.covid19api.com/total/country/{}".format(country)
    # Let requests build the query string so the date values are percent-encoded.
    querystring = {"from": from_date, "to": to_date}
    # Same 10 s timeout as get_covid19api_all_countries, so a stalled server cannot hang the cell.
    response = requests.request("GET", url, params=querystring, timeout=10)
    return response


def get_covid_observer_countries():
    """Scrape covid.observer and map each listed country name to its page URL.

    Returns:
        dict of link text -> "https://covid.observer" + the link's ``href``,
        built from the ``<a>`` tags of the third ``div.countries-list`` element.
    """
    page = requests.request("GET", "https://covid.observer/us/#countries", timeout=10)
    parsed_page = BeautifulSoup(page.content, "html.parser")

    # The third "countries-list" div (index 2) is the one that is read.
    country_block = parsed_page.find_all("div",{"class":"countries-list"})[2]

    return {
        link.text: "https://covid.observer" + link['href']
        for link in country_block.find_all("a")
    }

# get the single most similar sentence for a given sentence using Cosine Similarity
def most_similar_sentence(sentence, all_sentences):
    """Find the candidate in ``all_sentences`` that is most similar to ``sentence``.

    Similarity is computed with ``Cosine(2)`` profiles. A candidate is only
    accepted when its profile is non-empty and its similarity is strictly
    greater than 0.5 (and greater than the best found so far).

    Args:
        sentence: the reference string.
        all_sentences: iterable of candidate strings; may be empty.

    Returns:
        Tuple ``(most_sim_sentence, max_cosine_sim, most_sim_sentence_index)``.
        When no candidate qualifies, this is ``('', 0.5, 0)``.
    """
    # NOTE(review): `Cosine` is not imported anywhere in the visible notebook -
    # presumably strsimpy's Cosine; confirm and import it in the imports cell.
    cosine = Cosine(2)
    # The reference profile does not depend on the candidate: build it once.
    sentence_profile = cosine.get_profile(sentence)

    max_cosine_sim = 0.5
    most_sim_sentence = ''
    most_sim_sentence_index = 0

    for index, sim_sentence in enumerate(all_sentences):
        candidate_profile = cosine.get_profile(sim_sentence)
        if not candidate_profile:
            # Candidates with an empty profile are ignored.
            continue
        similarity = cosine.similarity_profiles(sentence_profile, candidate_profile)
        if similarity > max_cosine_sim:
            max_cosine_sim = similarity
            most_sim_sentence = sim_sentence
            most_sim_sentence_index = index

    return (most_sim_sentence, max_cosine_sim, most_sim_sentence_index)

#### Get covid cases dataframe for specific country

In [10]:
# get world countries
all_countries = get_covid_observer_countries()

# get most similar country
# sim_country = most_similar_sentence(country, all_countries.keys())[0]

# column names applied to every scraped per-country table
stat_columns = ['date','confirmed_cases','daily_growth','recovered_cases',
                'fatal_cases','active_cases','recovery_rate','mortality_rate',
                'affected_population','confirmed_per_1000','died_per_1000']

all_country_stat_dfs = []

for country, url in all_countries.items():

    print("Crawling covid cases for {}".format(country))

    # make a get request; the timeout keeps one stalled page from hanging the whole crawl
    page = requests.get(url, timeout=10)

    # declare a BeautifulSoup object from request content
    soup = BeautifulSoup(page.text, 'lxml')

    # text mining on BeautifulSoup object
    table_data = soup.find('table')
    if table_data is None:
        # no table on the page: report it instead of failing on None below
        print("No statistics table found for {}; skipping".format(country))
        continue

    headers = [th.text for th in table_data.find_all('th')]

    # collect all data rows first (the first <tr> is skipped) and build the
    # DataFrame once, instead of growing it one .loc assignment at a time
    rows = [[td.text for td in tr.find_all('td')]
            for tr in table_data.find_all('tr')[1:]]
    specific_country_stat_df = pd.DataFrame(rows, columns = headers)

    specific_country_stat_df.columns = stat_columns

    # keep the row position within the country's table as a regular column
    specific_country_stat_df['index'] = specific_country_stat_df.index

    specific_country_stat_df['country'] = country

    all_country_stat_dfs.append(specific_country_stat_df)

print("len(all_country_stat_dfs): ", len(all_country_stat_dfs))

Crawling covid cases for United Kingdom
Crawling covid cases for United States of America
Crawling covid cases for Uruguay
Crawling covid cases for Uzbekistan
Crawling covid cases for Vanuatu
Crawling covid cases for Venezuela
Crawling covid cases for Viet Nam
Crawling covid cases for Yemen
Crawling covid cases for Zambia
Crawling covid cases for Zimbabwe
len(all_country_stat_dfs):  216


In [16]:
# Stack the per-country frames into one table (ignore_index is not set, so each
# frame keeps its own row labels) and show its shape and first rows.
all_country_stat_dataframe = pd.concat(all_country_stat_dfs)
print(all_country_stat_dataframe.shape)
all_country_stat_dataframe.head()

(114173, 13)


Unnamed: 0,date,confirmed_cases,daily_growth,recovered_cases,fatal_cases,active_cases,recovery_rate,mortality_rate,affected_population,confirmed_per_1000,died_per_1000,index,country
0,Sep 9,153840,0.1 %,0,7157,146683,0.0 %,4.7 %,0.4 %,4.04,0.19,0,Afghanistan
1,Sep 8,153736,0.1 %,0,7151,146585,0.0 %,4.7 %,0.4 %,4.04,0.19,1,Afghanistan
2,Sep 7,153626,0.1 %,0,7144,146482,0.0 %,4.7 %,0.4 %,4.04,0.19,2,Afghanistan
3,Sep 6,153534,0.1 %,0,7141,146393,0.0 %,4.7 %,0.4 %,4.04,0.19,3,Afghanistan
4,Sep 5,153375,0.0 %,0,7127,146248,0.0 %,4.6 %,0.4 %,4.03,0.19,4,Afghanistan


In [17]:
# Write the combined table to CSV; index=False leaves the row labels out of the file.
all_country_stat_dataframe.to_csv("all_country_covid_cases_dataframe.csv", index=False)