# 1. Loading Tags to RDS

In [None]:
import time
import json
import datetime
import random
import math
import configparser

import boto3
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from tqdm import tqdm

from IPython.display import display, Markdown, clear_output
import ipywidgets as widgets

In [None]:
# Connection settings live in SSM Parameter Store; WithDecryption=True asks
# SSM to return the decrypted value of each parameter.
ssm = boto3.client('ssm')
dbhost, dbuser, dbpass, dbname = (
    ssm.get_parameter(Name=param_name, WithDecryption=True)['Parameter']['Value']
    for param_name in (
        '/moviestream/dbhost',
        '/moviestream/dbuser',
        '/moviestream/dbpass',
        '/moviestream/dbname',
    )
)

# Location of the CSV file that the loader reads.
data_folder, file = 'app-data', 'tags.csv'

In [None]:
class MovieLensSQLLoader:
    """Load a CSV file into a PostgreSQL table in fixed-size row windows.

    The whole CSV is read into ``self.df`` at construction time. Each call to
    ``load`` then writes consecutive slices of ``window`` rows to ``dbtable``
    and remembers where it stopped, so a later call resumes from there.

    Attributes:
        current_window: Index of the next window ``load`` will write (equal to
            the number of windows already written since the last reset).
        num_registers: Number of rows this loader has written to the table
            since the table was last dropped or replaced.
    """

    def __init__(self, data_folder, file, dbhost, dbuser, dbpass, dbname, dbtable, window=10, dtype=None, date_fields=None):
        """Read the CSV, create the engine and start from an empty table.

        Args:
            data_folder: Folder that contains ``file``.
            file: Name of the CSV file to read.
            dbhost: PostgreSQL host name (port 5432 is used).
            dbuser: Database user.
            dbpass: Database password.
            dbname: Database name.
            dbtable: Name of the target table.
            window: Number of rows written per chunk; must be at least 1.
            dtype: Optional column -> dtype mapping forwarded to
                ``pandas.read_csv``.
            date_fields: Optional list forwarded to ``pandas.read_csv`` as
                ``parse_dates``.

        Raises:
            ValueError: If ``window`` is smaller than 1.
        """
        from sqlalchemy.exc import SQLAlchemyError

        if window < 1:
            raise ValueError("window must be a positive integer")

        self.data_folder = data_folder
        self.file = file
        # NOTE: the credentials are interpolated verbatim, so characters with
        # a meaning inside a URL (e.g. '@', '/') must already be escaped.
        self.engine = create_engine(f'postgresql://{dbuser}:{dbpass}@{dbhost}:5432/{dbname}')
        self.dbtable = dbtable
        self.window = window
        self.current_window = 0
        self.num_registers = 0
        # Fresh containers per call instead of shared mutable default arguments.
        self.df = pd.read_csv(
            f"{self.data_folder}/{self.file}",
            dtype={} if dtype is None else dtype,
            parse_dates=[] if date_fields is None else date_fields,
        )
        # Dropping the old table is best effort: a database error must not
        # prevent the loader from being built, but it should not be hidden.
        try:
            self.drop_table()
        except SQLAlchemyError as exc:
            print(f"Could not drop table {self.dbtable}: {exc}")

    def _pending_windows(self, registers=None):
        """Return the range of window indexes the next ``load`` should write.

        Args:
            registers: Maximum number of rows to write, rounded up to whole
                windows; ``None`` means every remaining window.
        """
        total_windows = math.ceil(len(self.df) / self.window)
        first = min(self.current_window, total_windows)
        if registers is None:
            return range(first, total_windows)
        budget = max(0, math.ceil(registers / self.window))
        return range(first, min(total_windows, first + budget))

    def load(self, delay=0, registers=None, if_exists='append', restart=False):
        """Write the next windows of the DataFrame to the database table.

        Args:
            delay: Seconds to sleep after each window is written.
            registers: Maximum number of rows to write in this call, rounded
                up to whole windows; ``None`` writes every remaining window.
            if_exists: ``if_exists`` mode passed to ``DataFrame.to_sql`` for
                the first window written by this call. Later windows of the
                same call are always appended, so ``'replace'`` replaces the
                table once instead of once per window.
            restart: When true, start again from the first window.
        """
        if restart:
            self.current_window = 0

        pending = self._pending_windows(registers)
        total_windows = math.ceil(len(self.df) / self.window)
        # `initial` keeps the bar aligned with the windows written earlier.
        progress = tqdm(pending, total=total_windows, initial=pending.start)

        mode = if_exists
        for index in progress:
            chunk = self.df.iloc[index * self.window:(index + 1) * self.window]
            chunk.to_sql(name=self.dbtable, con=self.engine, if_exists=mode)
            if mode == 'replace':
                # The table now holds only this chunk, so restart the count.
                self.num_registers = 0
            mode = 'append'
            self.num_registers += len(chunk)
            self.current_window = index + 1
            progress.set_description("%s registers" % self.num_registers)
            time.sleep(delay)

    def status(self):
        """Return the loader's progress counters as a dictionary."""
        return {
            "CurrentWindow": self.current_window,
            "RegistersInSQL": self.num_registers,
            "Window": self.window
        }

    def drop_table(self):
        """Reset the progress counters and drop the target table if it exists.

        The counters are reset before the statement runs, so they are cleared
        even when the database raises.
        """
        from sqlalchemy import text

        self.current_window = 0
        self.num_registers = 0
        # Engine.execute() was removed in SQLAlchemy 2.0; an explicit
        # transaction works on both 1.x and 2.x and commits on exit.
        # The table name is interpolated unquoted and must be trusted.
        with self.engine.begin() as connection:
            connection.execute(text(f'DROP TABLE IF EXISTS {self.dbtable}'))
        print("Table DROPPED")

## 0. Read data from S3

In [None]:
# Build the loader for the tags file. Constructing it reads the CSV into
# memory and attempts to drop any existing target table.
mloader = MovieLensSQLLoader(
    data_folder,
    file,
    dbhost,
    dbuser,
    dbpass,
    dbname,
    window=10,
    dbtable='tags',
    dtype={
        "userId": np.int64,
        "movieId": np.int64,
        # np.str was only an alias of the builtin str and was removed in
        # NumPy 1.24, so the builtin is used directly.
        "tag": str,
        "timestamp": np.int64,
    },
    date_fields=[],
)
# Show the parsed column types and a small sample of the data.
display(mloader.df.dtypes)
mloader.df.head(3)

-------

## 1. Pre-Load

In [None]:
# Seed the table with an initial batch before the CDC steps. The row limit is
# rounded up to whole windows, so with window=10 this writes up to 200 windows.
mloader.load(registers=1999)

## 2. CDC Inserting

In [None]:
# No row limit: the loader resumes after the windows already written and
# inserts the rest of the file.
mloader.load()

## 3. CDC Updating

In [1]:
# TODO: replace the whole table with the DataFrame contents
# mloader.load(if_exists="replace",restart=True)

### Helpers

-----

## Appendix

Run the following command if you need to install **pgdb** (PostgreSQL) in the notebook's environment:
```python 
! pip install pgdb
```