In [None]:
# !pip install SQLAlchemy
# !pip install psycopg2
# !pip install pandas
# !pip install PyYAML
# !pip install tabula-py
# !pip install requests

In [1]:
from sqlalchemy import create_engine, MetaData, Table, select
from dateutil.parser import parse
import pandas as pd
import numpy as np
import psycopg2
import yaml
import re
import tabula

# using the requests library to GET the number of stores
import requests

# Task 3 -- Users Data

In [None]:
# Test code for Psycopg2
# conn = psycopg2.connect(
#     host=ENDPOINT,
#     port=PORT,
#     database=DATABASE,
#     user=USER,
#     password=PASSWORD
# )
yaml_file_path = '../db_creds.yaml'

# Read the YAML file and store its contents in a Python data structure (dictionary)
with open(yaml_file_path, 'r') as file:
    yaml_data = yaml.safe_load(file)

# credentials
DATABASE_TYPE = 'postgresql'
# DBAPI = 'psycopg2'
ENDPOINT = yaml_data['RDS_HOST']
USER = yaml_data['RDS_USER']
PASSWORD = yaml_data['RDS_PASSWORD']
PORT = yaml_data['RDS_PORT']
DATABASE = yaml_data['RDS_DATABASE']

# setup sql engine and connect
sql_engine = create_engine(f"{DATABASE_TYPE}://{USER}:{PASSWORD}@{ENDPOINT}:{PORT}/{DATABASE}")
sql_connection = sql_engine.connect()

# metadata, holds collection of table info, their data types, schema names etc., obtained from here : https://docs.sqlalchemy.org/en/20/core/metadata.html
# pros of MetaData() includes thread safety -> meaning it can handle concurrent tasks from multiple thread (computationally efficient when multiple threads need access to same resource)
metadata = MetaData()
metadata.reflect(sql_engine)
table_names = metadata.tables.keys()

# reflect allows us
names_ = list(table_names)

# read table
users_table = sql_connection.execute(select(Table('legacy_users', metadata, autoload=True, autoload_with=sql_engine)))
headers = users_table.keys()
users_df = pd.DataFrame(users_table.fetchall(), columns=headers)

In [None]:
table_name = 'legacy_users'

In [4]:
def is_alphanumeric(in_str):
    """
    @desc: function to check if the column has alphanumeric entries
    """
    return bool(re.fullmatch(r'^[a-zA-Z0-9_]*$', in_str))

def has_yyyy_mm_dd_format(in_str):
    """
    @desc: function to decide if the a column of a data has date format yyyy-mm-dd
    """
    return bool(re.fullmatch(r'\d{4}-\d{2}-\d{2}', in_str))

def convert_date_to_yyyy_mm_dd(in_column : pd.core.series.Series):
    """
    @desc: function to set the date column with date format yyyy-mm-dd
    """
    in_column = in_column.apply(parse)
    in_column = pd.to_datetime(in_column, infer_datetime_format=True, errors='coerce')
    
    return in_column

In [None]:
# check for nulls or NaNs, alternative is np.unique(users_df.isnull()), or just use df.info()
if users_df.isnull().sum().sum() and users_df.isna().sum().sum():
    raise f"The database : {table_name}, has total {users_df.isnull().sum().sum()} NULL values and {users_df.isna().sum().sum()} NaN values"
else:
    print(f"[usrmsg] No NULLs or NaNs found in {table_name}")

users_df_processed = users_df[~users_df.apply(lambda row: row.astype(str).str.contains('NULL').any(), axis=1)]

# check for data types 
#   -1) always begin with dropping duplicates and storing as a seperate file
users_df_processed = users_df_processed.drop_duplicates()
#   -2) set all columns except index to be of string format
str_convert_dict = {col: 'string' for col in users_df_processed.columns if col not in ['index']}
users_df_processed = users_df_processed.astype(str_convert_dict)
#   -3) remove all entries that are pure alphanumeric
users_df_processed = users_df_processed[~users_df_processed['email_address'].apply(is_alphanumeric)]
#   -4) DoB and join_date should be datetime format and of type yyyy-mm-dd
users_df_processed['date_of_birth'] = convert_date_to_yyyy_mm_dd(users_df_processed['date_of_birth'])
users_df_processed['join_date'] = convert_date_to_yyyy_mm_dd(users_df_processed['join_date'])
#   -5) convert all 'GGB' country code to 'GB'
users_df_processed['country_code'] = users_df_processed['country_code'].str.replace('GGB', 'GB', regex=False)


In [None]:
users_df_processed.head(5)

In [None]:
yaml_file_path = '../local_db_creds.yaml'

# Read the YAML file and store its contents in a Python data structure (dictionary)
with open(yaml_file_path, 'r') as file:
    yaml_data = yaml.safe_load(file)

# credentials
DATABASE_TYPE = 'postgresql+psycopg2'
ENDPOINT = yaml_data['LOCAL_HOST']
USER = yaml_data['LOCAL_USER']
PASSWORD = yaml_data['LOCAL_PASSWORD']
PORT = yaml_data['LOCAL_PORT']
DATABASE = yaml_data['LOCAL_DATABASE']

engine = create_engine(f"{DATABASE_TYPE}://{USER}:{PASSWORD}@{ENDPOINT}:{PORT}/{DATABASE}")
connection = engine.connect()
users_df_processed.to_sql("dim_users", engine, if_exists='replace', index=False)

# Task 4 -- Card Data

In [None]:
# read the .pdf file
pdf_path = "../card_details.pdf"
card_df_list = tabula.read_pdf(pdf_path, stream=True, pages='all')
card_df = pd.concat(card_df_list, ignore_index=True)

In [None]:
card_df.columns

In [None]:
#   -1) drop columns that are filled with missing and/or incorrect information
card_processed_df = card_df.drop(columns=['Unnamed: 0'])
 #  -2) always begin with dropping duplicates 
card_processed_df = card_processed_df.drop_duplicates()
#   -3) remove columns with "NULL"
card_processed_df = card_processed_df[~card_processed_df.apply(lambda row: row.astype(str).str.contains('NULL').any(), axis=1)]
#   -4) remove all entries that are pure alphanumeric
card_processed_df = card_processed_df[~card_processed_df['card_provider'].apply(is_alphanumeric)]
#   -5) those rows that have NaN in card_number expiry_date, fill those appropriately
nan_card_num_expiry_date_df = card_processed_df[card_processed_df['card_number expiry_date'].isna()]
nan_card_num_expiry_date_df['card_number expiry_date'] = nan_card_num_expiry_date_df['card_number'].astype(str) + ' ' + nan_card_num_expiry_date_df['expiry_date'].astype(str)
#   -6) those rows that DONT have NaN in card_number expiry_date column, strip those isolated and replace their equivalent NaN values in the card_number and the expiry_date columns appropriately
not_nan_card_num_expiry_date_df = card_processed_df[~card_processed_df['card_number expiry_date'].isna()]
splitted_cardnumexpdate_df = not_nan_card_num_expiry_date_df['card_number expiry_date'].str.split(n=1, expand=True)
not_nan_card_num_expiry_date_df['card_number'], not_nan_card_num_expiry_date_df['expiry_date'] = splitted_cardnumexpdate_df[0], splitted_cardnumexpdate_df[1]
#   -7) combine the two to store the seperate data with no NaNs . . . (hopefully :P)
card_processed_df = pd.concat([nan_card_num_expiry_date_df, not_nan_card_num_expiry_date_df], ignore_index=True)
del nan_card_num_expiry_date_df, not_nan_card_num_expiry_date_df
#   -8) change all objects to string
card_processed_df = card_processed_df.astype('string')
#   -9) finally, change all date columns to datetimeformat
card_processed_df['date_payment_confirmed'] = convert_date_to_yyyy_mm_dd(card_processed_df['date_payment_confirmed'])
card_processed_df['expiry_date'] = pd.to_datetime(card_processed_df['expiry_date'], format='%m/%y') + pd.offsets.MonthEnd(0)

In [None]:
yaml_file_path = '../local_db_creds.yaml'

# Read the YAML file and store its contents in a Python data structure (dictionary)
with open(yaml_file_path, 'r') as file:
    yaml_data = yaml.safe_load(file)

# credentials
DATABASE_TYPE = 'postgresql+psycopg2'
ENDPOINT = yaml_data['LOCAL_HOST']
USER = yaml_data['LOCAL_USER']
PASSWORD = yaml_data['LOCAL_PASSWORD']
PORT = yaml_data['LOCAL_PORT']
DATABASE = yaml_data['LOCAL_DATABASE']

engine = create_engine(f"{DATABASE_TYPE}://{USER}:{PASSWORD}@{ENDPOINT}:{PORT}/{DATABASE}")
connection = engine.connect()
card_processed_df.to_sql("dim_card_details", engine, if_exists='replace', index=False)

# Task 5 -- Store Related Data

In [2]:
def is_alphanumeric(in_str):
    """
    @desc: function to check if the column has alphanumeric entries
    """
    return bool(re.fullmatch(r'^[a-zA-Z0-9_]*$', in_str))

def has_yyyy_mm_dd_format(in_str):
    """
    @desc: function to decide if the a column of a data has date format yyyy-mm-dd
    """
    return bool(re.fullmatch(r'\d{4}-\d{2}-\d{2}', in_str))

def convert_date_to_yyyy_mm_dd(in_column : pd.core.series.Series):
    """
    @desc: function to set the date column with date format yyyy-mm-dd
    """
    in_column = in_column.apply(parse)
    in_column = pd.to_datetime(in_column, infer_datetime_format=True, errors='coerce')
    
    return in_column

In [3]:
yaml_file_path = '../api_key.yaml'

# Read the YAML file and store its contents in a Python data structure (dictionary)
with open(yaml_file_path, 'r') as file:
    yaml_data = yaml.safe_load(file)


# Retrieve a store: https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/{store_number}
# Return the number of stores: https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores

- Common methods include `GET` (retrieve resource), `POST` (create resource), `PUT` (update resource), and `DELETE` (remove resource). We will discuss in detail about them later

- **Status Codes**: HTTP status codes indicate the result of the request. These codes range from informational (`1xx`) to success (`2xx`), redirection (`3xx`), client errors (`4xx`), and server errors (`5xx`).   

- **Port**: HTTPS typically uses port 443 for communication, while HTTP uses port 80. The use of a different port helps differentiate between secure and non-secure connections.

- The request contains information such as the `HTTP method`, `the endpoint URL`, optional `headers`, and, in some cases, a `payload` or `body`.

- **HTTP Method**: The HTTP method, also known as the HTTP verb, specifies the action to be performed on the resource. Common HTTP methods include `GET` (retrieve data), `POST` (create data), `PUT` (update data), and `DELETE` (remove data).

- **Endpoint URL**: The **endpoint URL** represents the specific resource or functionality on the server that the client wants to interact with. It typically follows a specific URL pattern defined by the API.

- **Headers**: Headers provide additional information about the request, such as content type, authorization tokens, or caching directives. We will learn more about headers  in a later lesson.

- **Payload or Body**: In certain cases, requests may include a payload or body `containing data to be sent to the server`. This is common for methods like `POST` or `PUT`, where the payload contains the data to create or update a resource.

* Endpoint URLs have the usual format of  . . .
```
<ROOT_URL>/<Path>?<Query Parameters>
```

In this structure:
- `<ROOT_URL>` represents the base URL of the API
- `<Path>` refers to the specific path or endpoint within the API that offers a specific service
- `<Query Parameters>` are optional parameters passed in the URL query string, allowing for additional customization or filtering of the request

After the path section of the endpoint URL, one or more *parameters* can be specified

* The `?` symbol denotes the separation between the endpoint path and the start of the parameters
* while multiple parameters are typically separated using the `&` symbol.


In [10]:
response = requests.get('https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores')

In [21]:
yaml_data["API_KEY"]

'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'

In [96]:
store_number = 450
store_detail = []

get_all_stores_url = "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores"
headers = {
    "X-API-KEY": yaml_data["API_KEY"]
}

for i in range(store_number):
    get_store_number_url = f"https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/{i}"
    response = requests.get(get_store_number_url, headers=headers)
    if response.status_code == 200:
        # Access the response data as JSON
        store_detail.append(response.json())

    else:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response Text: {response.text}")
assert len(store_detail) == 450

In [94]:
len(store_detail)

In [7]:


# Send a GET request to the Pokémon API to retrieve information about Pikachu
response = requests.get('https://pokeapi.co/api/v2/pokemon/pikachu')

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Access the response data as JSON
    data = response.json()

    # Extract and print the name of the Pokémon
    name = data['name']
    print(f"Name: {name}")

    # Extract and print the Pokémon's abilities
    abilities = [ability['ability']['name'] for ability in data['abilities']]
    print("Abilities:", ", ".join(abilities))

    # Extract and print the Pokémon's base experience
    base_experience = data['base_experience']
    print(f"Base Experience: {base_experience}")

# If the request was not successful, print the status code and response text
else:
    print(f"Request failed with status code: {response.status_code}")
    print(f"Response Text: {response.text}")


Name: pikachu
Abilities: static, lightning-rod
Base Experience: 112
