<a href="https://colab.research.google.com/github/arashms/DL-project/blob/main/DataPreparationForTFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install and Import libraries and define parameters

### Install & Import Libraries

In [1]:
!pip install transformers
!pip install pytorch-forecasting
!pip install pytorch_lightning

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 4.1MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 30.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 38.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=9037b

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import csv 
import random
from transformers import BertTokenizer, RobertaTokenizer
from transformers import BertForNextSentencePrediction, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from scipy import stats
import time
import datetime
import pickle
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gc
import itertools
import nltk
import transformers
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
# nltk.download('punkt')

### Setting device, random seed, and runtime parameters

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Cuda available: ",torch.cuda.is_available())
if torch.cuda.is_available():
    print("Current device: ",  torch.cuda.current_device())

seed = 204920
seed2 = 293652

random.seed(seed2)
np.random.seed(seed2)
torch.manual_seed(seed2)

if device.type == 'cuda':
    torch.cuda.manual_seed_all(seed)

Cuda available:  True
Current device:  0


### Setting Parameters

In [4]:
COMMENT_DIMENSIONS = 20
first_day = datetime.datetime(2020, 1, 1)
last_day = datetime.datetime(2021, 2, 27)
number_of_days = (last_day - first_day).days + 1
print('Number of days: ', number_of_days)

Number of days:  424


# 2. Reading and Processing Datasets

### Moving datasets to Colab 

In [5]:
!cp  /content/drive/MyDrive/DL-project/Combined.xlsx Combined.xlsx
!cp  /content/drive/MyDrive/DL-project/acaps_covid19_government_measures_dataset_0.xlsx acaps_covid19_government_measures_dataset_0.xlsx
!cp /content/drive/MyDrive/DL-project/time_series_covid_19_confirmed_aggregated.csv time_series_covid_19_confirmed_aggregated.csv
!cp /content/drive/MyDrive/DL-project/comment-embeddings.json comment-embeddings.json
!cp /content/drive/MyDrive/DL-project/reduced-embeddings.json reduced-embeddings.json

### Reading and processing time-series dataset

In [6]:
time_series_country_list = []
time_series_date_list = [0]
time_series_confirmed_cases = {}

with open('time_series_covid_19_confirmed_aggregated.csv') as csvfile:    
    reader = csv.reader(csvfile, delimiter=',')
    cnt = 0
    
    for row in reader:
        if cnt == 0:
            for i in range(1, len(row)):
                datetimeObject = datetime.datetime.strptime(row[i], '%m/%d/%Y')
                time_series_date_list.append((datetimeObject - first_day).days)
        else:
            country = row[0]
            time_series_country_list.append(country)
            time_series_confirmed_cases[country] = [0 for i in range(number_of_days)]

            previous_accumulative_cases = 0
            for i in range(1, len(row)):
                time_series_confirmed_cases[country][time_series_date_list[i]] = int(row[i]) - previous_accumulative_cases
                previous_accumulative_cases = int(row[i])

        cnt += 1


print('Countries: ', len(time_series_country_list), time_series_country_list)
print('Dates: ', len(time_series_date_list), time_series_date_list)
print('\nConfirmed cases in China: \n', time_series_confirmed_cases['China'])

Countries:  201 ['Australia', 'Canada', 'China', 'Denmark', 'France', 'Netherlands', 'United Kingdom', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Central African Republic', 'Chad', 'Chile', 'Colombia', 'Comoros', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Diamond Princess', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'H

### Reading regulation dataset

In [7]:
from sklearn.preprocessing import LabelEncoder as le

xl_file = pd.ExcelFile('acaps_covid19_government_measures_dataset_0.xlsx')

sheets = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}

dataframe = sheets['Dataset']

print('Sheets in the dataset:   ', sheets.keys())
print('Number of regulations in the dataset: ', len(dataframe), '\n')

# print(dataframe.head())
print('\n', dataframe.info(), '\n')

dataset = {}
for key in dataframe:
    # dataframe[key] = le.fit_transform(dataframe[key].astype(str))
    dataframe[key]=dataframe[key].astype('str')
    dataset[key] = list(dataframe[key])


Sheets in the dataset:    dict_keys(['About', 'Dataset', 'Dictionary'])
Number of regulations in the dataset:  23923 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23923 entries, 0 to 23922
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  23923 non-null  int64         
 1   ISO                 23923 non-null  object        
 2   COUNTRY             23923 non-null  object        
 3   REGION              23923 non-null  object        
 4   ADMIN_LEVEL_NAME    3682 non-null   object        
 5   PCODE               0 non-null      float64       
 6   LOG_TYPE            23923 non-null  object        
 7   CATEGORY            23923 non-null  object        
 8   MEASURE             23923 non-null  object        
 9   TARGETED_POP_GROUP  7556 non-null   object        
 10  COMMENTS            23799 non-null  object        
 11  NON_COMPLIANCE      22764 non-null  obj

### Processing regulation dataset

In [8]:
country_list = set([])

for i in range(len(dataset['ID'])):
    country = dataset['COUNTRY'][i]    
    country_list.add(country)
    
country_list = list(country_list)
country_list.sort()
time_series_country_list.sort()

print('Countries in regulation dataset: ', len(country_list), country_list)
print('Countries in time-series dataset: ', len(time_series_country_list), time_series_country_list)

common_countries = []
for country in country_list:
    if country in time_series_country_list:
        common_countries.append(country)

print('Number of countries in intersection of both datasets: ', len(common_countries), common_countries)

regulations = {}

for country in common_countries:
    regulations[country] = {'positive':[[] for i in range(number_of_days)], 'negative':[[] for i in range(number_of_days)]}

for i in range(len(dataset['ID'])):
    
    # Ignore the regulations with unknown implementation time
    if dataset['DATE_IMPLEMENTED'][i] == 'NaT':
        continue
    
    # Ignore the regulations that we don't have time-series of their countries
    if dataset['COUNTRY'][i] not in common_countries:
        continue
    
    # fill the empty comment field with ----
    if dataset['COMMENTS'][i] == 'nan':
        dataset['COMMENTS'][i] = '----'


    country = dataset['COUNTRY'][i]
    datetimeObject = datetime.datetime.strptime(dataset['DATE_IMPLEMENTED'][i], '%Y-%m-%d')
    date = (datetimeObject - first_day).days

    if dataset['LOG_TYPE'][i] == 'Introduction / extension of measures':
        regulations[country]['positive'][date].append(i)
    else: # Phase-out measure 
        regulations[country]['negative'][date].append(i)
    
print('\nA sample of positive and negative regulation ids in Ghana: ')
print(regulations['Ghana'])


Countries in regulation dataset:  193 ['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'CAR', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', "CÃ´te d'Ivoire", 'DPRK', 'DRC', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', '

### Creating Measure and Category dictionaries

In [9]:
measure_dic = {}
category_dic = {}

for i in range(len(dataset['ID'])):
    category = dataset['CATEGORY'][i]
    measure = dataset['MEASURE'][i]

    if category not in category_dic:
        category_dic[category] = 0

    if measure not in measure_dic:
        measure_dic[measure] = 0
    
print(len(category_dic.keys()), category_dic)
print(len(measure_dic.keys()), measure_dic)

6 {'Public health measures': 0, 'Movement restrictions': 0, 'Governance and socio-economic measures': 0, 'Social distancing': 0, 'Lockdown': 0, 'Humanitarian exemption': 0}
35 {'Awareness campaigns': 0, 'Health screenings in airports and border crossings': 0, 'International flights suspension': 0, 'Border checks': 0, 'Strengthening the public health system': 0, 'Isolation and quarantine policies': 0, 'Emergency administrative structures activated or established': 0, 'Surveillance and monitoring': 0, 'Other public health measures enforced': 0, 'Border closure': 0, 'General recommendations': 0, 'State of emergency declared': 0, 'Domestic travel restrictions': 0, 'Limit public gatherings': 0, 'Limit product imports/exports': 0, 'Schools closure': 0, 'Partial lockdown': 0, 'Changes in prison-related policies': 0, 'Closure of businesses and public services': 0, 'Checkpoints within the country': 0, 'Economic measures': 0, 'Military deployment': 0, 'Curfews': 0, 'Visa restrictions': 0, 'Requi

### Creating country2region dictionary

In [18]:
country2region = {}

for i in range(len(dataset['ID'])):
    country = dataset['COUNTRY'][i]
    region = dataset['REGION'][i]

    if country not in country2region:
        country2region[country] = region

print(country2region)

{'Afghanistan': 'Asia', 'Albania': 'Europe', 'Algeria': 'Africa', 'Angola': 'Africa', 'Antigua and Barbuda': 'Americas', 'Argentina': 'Americas', 'Armenia': 'Middle east', 'Australia': 'Pacific', 'Austria': 'Europe', 'Azerbaijan': 'Middle east', 'Bahamas': 'Americas', 'Bahrain': 'Middle east', 'Bangladesh': 'Asia', 'Barbados': 'Americas', 'Belarus': 'Europe', 'Belgium': 'Europe', 'Belize': 'Americas', 'Benin': 'Africa', 'Bhutan': 'Middle east', 'Bolivia': 'Americas', 'Bosnia and Herzegovina': 'Europe', 'Botswana': 'Africa', 'Brazil': 'Americas', 'Brunei': 'Middle east', 'Bulgaria': 'Europe', 'Burkina Faso': 'Africa', 'Burundi': 'Africa', "CÃ´te d'Ivoire": 'Africa', 'Cambodia': 'Asia', 'Cameroon': 'Africa', 'Canada': 'Americas', 'Cape Verde': 'Africa', 'CAR': 'Africa', 'Chad': 'Africa', 'Chile': 'Americas', 'China': 'Asia', 'Colombia': 'Americas', 'Comoros': 'Africa', 'Congo': 'Africa', 'Costa Rica': 'Americas', 'Croatia': 'Europe', 'Cuba': 'Americas', 'Cyprus': 'Europe', 'Czech Republi

# 3. Comment embeddings from DeBerta model

### creating list of comments

In [10]:
comments = []
comment_infos = []

for country in regulations:
    for log_type in regulations[country]:
        for day in range(len(regulations[country][log_type])):

            if len(regulations[country][log_type][day]) == 0:
                continue

            comment = ''
            for index in regulations[country][log_type][day]:
                comment = comment + ' ' + dataset['COMMENTS'][index]
            
            comments.append(comment)
            comment_infos.append((country, log_type, day))

print('Number of comments: ', len(comments))
print(comments[0:5])
print(comment_infos[0:5])

Number of comments:  11048
[' MoPH begins announcements on their facebook to make public aware of coronavirus. ', ' Health teams at airports will check passengers coming from China. ', ' Flights to China are suspended.  Health screenings of all passengers at airports. ', ' All China and Iran nationals', ' the ministry has prepared 100 bed to control this virus in Kabul and 200 others in the province hospital with all the facilities needed in the country. ----']
[('Afghanistan', 'positive', 23), ('Afghanistan', 'positive', 25), ('Afghanistan', 'positive', 26), ('Afghanistan', 'positive', 31), ('Afghanistan', 'positive', 32)]


### Tokenizing

In [11]:
tokenizer = transformers.DebertaTokenizer.from_pretrained('microsoft/deberta-base') 
max_length = 256
train_encodings = tokenizer(comments, add_special_tokens=True, return_token_type_ids=False, truncation=True, padding=True, max_length=max_length)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898825.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=52.0, style=ProgressStyle(description_w…




### Creating pytorch Dataset

In [12]:
class NSPDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
#         self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         if self.labels != None:
#           item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        # return len(self.labels)
        return len(self.encodings['input_ids'])

In [13]:
train_dataset = NSPDataset(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

### Creating DeBerta Model

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = transformers.DebertaModel.from_pretrained('microsoft/deberta-base')


if torch.cuda.device_count() > 1:
#   print("Let's use", torch.cuda.device_count(), "GPUs!")
  model = torch.nn.DataParallel(model)
    
model.to(device)
model.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=474.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=558582766.0, style=ProgressStyle(descri…




DebertaModel(
  (embeddings): DebertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=0)
    (LayerNorm): DebertaLayerNorm()
    (dropout): StableDropout()
  )
  (encoder): DebertaEncoder(
    (layer): ModuleList(
      (0): DebertaLayer(
        (attention): DebertaAttention(
          (self): DisentangledSelfAttention(
            (in_proj): Linear(in_features=768, out_features=2304, bias=False)
            (pos_dropout): StableDropout()
            (pos_proj): Linear(in_features=768, out_features=768, bias=False)
            (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
            (dropout): StableDropout()
          )
          (output): DebertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): DebertaLayerNorm()
            (dropout): StableDropout()
          )
        )
        (intermediate): DebertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bia

### Generating Comment Embeddings

In [15]:
all_cls = []

for iteration, batch in tqdm(enumerate(train_loader)):
    
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    
    with torch.no_grad():
        
        last_hidden_state = model(input_ids, attention_mask=attention_mask).last_hidden_state
        cls_tokens = last_hidden_state[:,0,:].detach()
        
        all_cls.append(cls_tokens)

        
# out_cls is a matrix of size number_of_not_null_comments (23799) X size_of_hidden_state_of_BERT (768)
# In this matrix, for each comment we have an embedding vector.
# Use "ids" list to map each comment with its ids.
out_cls = torch.cat(all_cls, 0)

print("shape of output matrix :", out_cls.shape)
comment_embeddings = out_cls.tolist()

346it [09:01,  1.56s/it]


shape of output matrix : torch.Size([11048, 768])


### Save and load comment embeddings

In [28]:
import json
with open('comment-embeddings.json', 'w') as file1, open('/content/drive/MyDrive/DL-project/comment-embeddings.json', 'w') as file2:
    json.dump(comment_embeddings, file1)
    json.dump(comment_embeddings, file2)

In [27]:
comment_embeddings = []
with open('comment-embeddings.json', 'r') as file:
    comment_embeddings = json.load(file)

print(len(comment_embeddings), len(comment_embeddings[0]), comment_embeddings[0])

11048 768 [-0.0021092742681503296, -0.07381375133991241, -0.11492428183555603, -0.06525909155607224, -0.006907906383275986, 0.07458068430423737, -0.13755455613136292, -0.0265787485986948, 0.09799185395240784, -0.06330740451812744, -0.09887293726205826, 0.007764849811792374, -0.04174366965889931, -0.04749990999698639, -0.03481698036193848, -0.06963647156953812, -0.023818157613277435, -0.01967502012848854, 0.04264158383011818, -0.021032515913248062, 0.027185000479221344, 0.03490302339196205, 0.0002858266234397888, -0.07306428253650665, 0.01342175155878067, -0.01583166792988777, -0.0002553015947341919, 0.06236741691827774, -0.0389697402715683, -0.02120368182659149, -0.09736765921115875, 0.026904448866844177, -0.005332056432962418, -0.10552310943603516, 0.08812673389911652, 0.009326308965682983, -0.006019853055477142, -0.08018489927053452, 0.021022941917181015, 0.049953579902648926, -0.0428997240960598, -0.16011479496955872, -0.10330456495285034, 0.03169439360499382, -0.05761684477329254, 

### Reduce embeddings

In [16]:
from sklearn.manifold import TSNE
import numpy as np
from sklearn.decomposition import PCA

# reduced_embeddings = TSNE(verbose=COMMENT_DIMENSIONS).fit_transform(comment_embeddings)

pca = PCA(n_components = COMMENT_DIMENSIONS)
samples = np.array(comment_embeddings)
reduced_embeddings = pca.fit_transform(samples).tolist()

print(len(reduced_embeddings), len(reduced_embeddings[0]), reduced_embeddings[0])

11048 20 [7.845292310613456, 4.628665314575266, 0.8785070598723065, -1.7030493792400954, 0.7023583855166133, 0.25813270612300493, -0.18706157316442965, -0.033971972069033365, -0.049833672665746016, -0.00440830595213049, -0.0285312353973593, 0.008130980707560647, 0.009532910365475782, 0.07594895259281183, -0.0360898977566427, 0.004935803769604501, 0.044994615366270155, 0.0418614366719351, 0.007468988061556442, -0.017554697436681015]


### Save and load reduced embeddings

In [20]:
import json
with open('reduced-embeddings.json', 'w') as file1, open('/content/drive/MyDrive/DL-project/reduced-embeddings.json', 'w') as file2:
    json.dump(reduced_embeddings, file1)
    json.dump(reduced_embeddings, file2)

In [None]:
reduced_embeddings = []
with open('reduced_embeddings.json', 'r') as file:
    reduced_embeddings = json.load(file)

### Move embeddings to dictionary

In [19]:
info2embedding = {}
for i in range(len(reduced_embeddings)):
    info2embedding[comment_infos[i]] = reduced_embeddings[i]

for key in info2embedding:
    print(key, ':\n', info2embedding[key])
    break

('Afghanistan', 'positive', 23) :
 [7.845292310613456, 4.628665314575266, 0.8785070598723065, -1.7030493792400954, 0.7023583855166133, 0.25813270612300493, -0.18706157316442965, -0.033971972069033365, -0.049833672665746016, -0.00440830595213049, -0.0285312353973593, 0.008130980707560647, 0.009532910365475782, 0.07594895259281183, -0.0360898977566427, 0.004935803769604501, 0.044994615366270155, 0.0418614366719351, 0.007468988061556442, -0.017554697436681015]


# 4. Creating Pandas dataframe

In [23]:
features = {'time_idx': [], 'country': [], 'region': [], 'confirmed-cases': []}

for log_type in ['positive', 'negative']:
    for category in category_dic:
        features[log_type + '-' + category] = []
    for measure in measure_dic:
        features[log_type + '-' + measure] = []
    for i in range(COMMENT_DIMENSIONS):
        features[log_type + '-comment-embedding' + str(i)] = []

dataframe = pd.DataFrame(features)

for country in tqdm(regulations):
    for day in range(number_of_days):

        new_entry = {}
        new_entry['country'] = country
        new_entry['time_idx'] = day
        new_entry['region'] = country2region[country]
        new_entry['confirmed-cases'] = time_series_confirmed_cases[country][day]

        for log_type in ['positive', 'negative']:

            for category in category_dic:
                new_entry[log_type + '-' + category] = 0
            for index in regulations[country][log_type][day]:
                category = dataset['CATEGORY'][index]
                new_entry[log_type + '-' + category] += 1

            for measure in measure_dic:
                new_entry[log_type + '-' + measure] = 0
            for index in regulations[country][log_type][day]:
                measure = dataset['MEASURE'][index]
                new_entry[log_type + '-' + measure] += 1
            
            embedding = [0 for i in range(COMMENT_DIMENSIONS)]
            if (country, log_type, day) in info2embedding:
                embedding = info2embedding[(country, log_type, day)]
            for i in range(COMMENT_DIMENSIONS):
                new_entry[log_type + '-comment-embedding' + str(i)] = embedding[i]
        
        dataframe = dataframe.append(new_entry, ignore_index=True)

print(dataframe.info())
dataframe.to_excel('data.xlsx', sheet_name='main', index=False)
dataframe.to_excel('/content/drive/MyDrive/DL-project/data.xlsx', sheet_name='main', index=False)

100%|██████████| 175/175 [1:23:39<00:00, 28.68s/it]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74200 entries, 0 to 74199
Columns: 126 entries, time_idx to negative-comment-embedding19
dtypes: float64(124), object(2)
memory usage: 71.3+ MB
None


# 5. Testing TFT usage

### Reading data file

In [24]:
xl_file = pd.ExcelFile('data.xlsx')

sheets = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}

data = sheets['main']

print('\n', dataframe.info(), '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74200 entries, 0 to 74199
Columns: 126 entries, time_idx to negative-comment-embedding19
dtypes: float64(124), object(2)
memory usage: 71.3+ MB

 None 



### Entering training data

In [25]:
max_prediction_length = 15
max_encoder_length = 60
training_cutoff = data["time_idx"].max() - max_prediction_length

feature_list = []
for log_type in ['positive', 'negative']:
    for category in category_dic:
        feature_list.append(log_type + '-' + category)
    for measure in measure_dic:
        feature_list.append(log_type + '-' + measure) 
    for i in range(COMMENT_DIMENSIONS):
        feature_list.append(log_type + '-comment-embedding' + str(i)) 

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="confirmed-cases",
    group_ids=["country"],
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["region"],
    static_reals=[],
    time_varying_known_categoricals=[],  # We can further add information like "special_days" and "month"
    variable_groups={},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=feature_list,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)

# create dataloaders for model
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)



KeyError: ignored