# Workshop: `teradataml`, NPath & String Manipulation for Feature Extraction

In [None]:
!pip install teradataml --upgrade

In [None]:
!pip install teradataml --upgrade
try:
    import tdprepview
except ImportError:
    !pip install tdprepview
    import tdprepview

# Database connection and synthetic data upload

## Database Connection

This code cell imports the teradataml package, prints its version, and creates a context for connecting to a Teradata database.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
import teradataml as tdml
print(tdml.__version__)
tdml.create_context(host="___", username="demo_user", password="___")

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
To create a context, you need to provide the host (server URL), the username, and the password. These are required to establish a connection to the Teradata environment.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
import teradataml as tdml
print(tdml.__version__)
tdml.create_context("see at the clearscape dashboard",
                    "demo_user", "only you know :)")
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## Data Upload

This code cell runs a notebook to generate data tables, uploads the tables to the Teradata database, and checks the data types of the tables.

In [None]:
%run tables-ddls-banking.ipynb #  generates the data

my_tables_dict = {
        "Customer_Dim": customer_dim,
        "Account_Dim": account_dim,
        "Account_Customer_Map": account_customer_map,
        "Transaction_Fact": transaction_fact,
        "Balance_Fact": balance_fact,
        "Interaction_Fact": interaction_fact,
        "Master_Table": master_table,
        "Customer_Details":customer_details
    }

#upload
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
for tablename, table in my_tables_dict.items():
    tdml.copy_to_sql(table, tablename,if_exists="replace",primary_index= table.columns[0])
    print(tablename, " uploaded")

# check tdtypes
types_dict = {}
for tablename in my_tables_dict.keys():
    types_dict[tablename] = tdml.DataFrame(tablename).tdtypes._column_names_and_types

## Function for pretty printing SQLs

This code cell defines a function `prettyprint_sql` that formats SQL queries for better readability.

In [None]:
import sqlparse
def prettyprint_sql(query):
    """Print `query` re-indented by sqlparse with SQL keywords in upper case.

    Args:
        query: SQL text to format.

    Returns:
        None. The formatted SQL is written to standard output.
    """
    formatted = sqlparse.format(query, reindent=True, keyword_case='upper')
    print(formatted)

## Inspect Data

In [None]:
import ipywidgets as widgets
from IPython.display import display
import pandas as pd

def display_dataframes_in_tabs(table_names):
    """Show each named database table in its own tab of an ipywidgets Tab.

    Args:
        table_names: Iterable of table names. Each name is passed to
            `tdml.DataFrame` and is also used as the title of its tab. Any
            iterable works (list, `dict.keys()`, generator) because it is
            materialised once up front.

    Returns:
        None. The tab widget is rendered via `display`.
    """
    # Materialise once: the names are needed twice (tab contents and tab
    # titles), and a one-shot iterable would be exhausted after the first pass.
    names = list(table_names)

    # Temporarily lift the pandas column limit so that all columns are visible.
    with pd.option_context('display.max_columns', None):
        tab_contents = []
        for name in names:
            table_output = widgets.Output()
            with table_output:
                # Displayed inside the Output widget so the rendering lands in the tab.
                # NOTE(review): no explicit row limit is applied here; how many rows
                # are shown is left to the DataFrame's own notebook display.
                display(tdml.DataFrame(name))
            tab_contents.append(table_output)

        # One tab per table, titled with the table name.
        tabs = widgets.Tab(children=tab_contents)
        for position, name in enumerate(names):
            tabs.set_title(position, name)

        display(tabs)

In [None]:
display_dataframes_in_tabs(my_tables_dict.keys())

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
display_dataframes_in_tabs(___.___)  # Call the function with the keys of my_tables_dict

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: Use the dictionary containing your table names.
2. Hint: Use the dictionary's `keys()` method to get all table names.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
display_dataframes_in_tabs(my_tables_dict.keys())
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

# Introduction to Python Client teradataml & tdml DataFrames

## teradataml DataFrame Creation + Properties


This code cell imports the teradataml package.

In [None]:
import teradataml as tdml

This code cell creates DataFrame objects from tables in the Teradata database, including creating from table name in the default database, from a query, and from a table in a non-default database. It also inspects the created DataFrame objects.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
#create from tablename in current default database
DF_Customer_Dim = tdml.DataFrame("___")  # Specify the table name

#create from query
DF_Account_Dim = tdml.DataFrame.from_query("SELECT * FROM ___")  # Complete the query with the table name

#create from tablename in non-default database, this is preferred
DF_Account_Customer_Map = tdml.DataFrame(tdml.in_schema("___", "___"))  # Specify the schema and table name

# inspect
DF_Customer_Dim

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
For each DataFrame creation, ensure you specify the correct table name or schema. The first DataFrame is created directly from a table name, the second from a SQL query, and the third from a table in a specified schema.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
#create from tablename in current default database
DF_Customer_Dim = tdml.DataFrame("Customer_Dim")

#create from query
DF_Account_Dim= tdml.DataFrame.from_query("SELECT * FROM Account_Dim")

#create from tablename in non-default database, this is preferred
DF_Account_Customer_Map = tdml.DataFrame(tdml.in_schema("demo_user", "Account_Customer_Map" ))

# inspect
DF_Customer_Dim
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

***DataFrame Properties***

This code cell retrieves the shape of the DataFrame DF_Customer_Dim.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_Customer_Dim.___  # Access the shape attribute of the DataFrame

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
To find the dimensions of a DataFrame, use the `.shape` attribute. 
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_Customer_Dim.shape
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

This code cell retrieves the column names of the DataFrame DF_Customer_Dim.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_Customer_Dim.___  # Access the columns attribute of the DataFrame

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
To view the column names of a DataFrame, use the `columns` attribute. This attribute does not require parentheses as it is not a method.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_Customer_Dim.columns
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

This code cell retrieves the data types of each column in the DataFrame DF_Customer_Dim.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_Customer_Dim.___  # Check the data types of the DataFrame columns

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
Use the `dtypes` attribute to view the data types of each column in a DataFrame.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_Customer_Dim.dtypes
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

This code cell attempts to retrieve the Teradata data types of the columns in the DataFrame DF_Customer_Dim.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_Customer_Dim.___  # Use the appropriate method to check the Teradata data types of the DataFrame

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
To check the Teradata data types of each column in a DataFrame, use the property (tdtypes) that provides this information. 
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_Customer_Dim.tdtypes
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## SQL within Python (1): dynamic query building using format strings & `tdml.DataFrame.from_query()` 

This code cell constructs a SQL query using format strings and a list of conditions. The final query is printed and executed using the teradataml package.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# use format strings to construct queries
query_f = "SELECT Customer_Id FROM Customer_Dim WHERE {conditions} "
conditions_list = ["Preferred_Language_Code = 'EN' ",
                          "Region IS IN ('BE', 'ZH') "]

query_final = query_f.format(conditions= " AND ".join(___))  # Join conditions with 'AND'

prettyprint_sql(___)  # Pretty print the final query

tdml.DataFrame.from_query(___)  # Create DataFrame from the query

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
To construct the final query, you need to join the conditions in `conditions_list` with 'AND'. Then, use the `prettyprint_sql` function to display the query and `tdml.DataFrame.from_query` to execute it.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
# use format strings to construct queries
query_f = "SELECT Customer_Id FROM Customer_Dim WHERE {conditions} "
conditions_list = ["Preferred_Language_Code = 'EN' ",
                          "Region IS IN ('BE', 'ZH') "]

query_final = query_f.format(conditions= " AND ".join(conditions_list))

prettyprint_sql(query_final)

tdml.DataFrame.from_query(query_final)
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## Filtering DataFrames

This code filters the `DF_Customer_Dim` DataFrame based on the conditions that the Preferred Language Code is "EN" and the Region is either "BE" or "ZH". It then selects only the Customer_Id column and stores the result in `DF_myfilter`.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_myfilter = DF_Customer_Dim[
    (DF_Customer_Dim["Preferred_Language_Code"] == ___) &  # Filter for English language, EN
    (DF_Customer_Dim["Region"].isin([___, ___]))  # Filter for specific regions, BE, ZH
][[___]]  # Select the Customer_Id column

DF_myfilter

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
You need to filter the DataFrame based on the preferred language being "EN" and the region being either 'BE' or 'ZH'. After filtering, select only the "Customer_Id" column.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_myfilter = DF_Customer_Dim[
    (DF_Customer_Dim["Preferred_Language_Code"] == "EN") &
    (DF_Customer_Dim["Region"].isin(['BE', 'ZH']))
][["Customer_Id"]]

DF_myfilter
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## SQL within Python (2):  `show_query`, format strings,  `tdml.execute_sql()` to crystallise logic for later use

This code cell is used to inspect the SQL query behind the `DF_Account_Dim` DataFrame using the `prettyprint_sql` function.

In [None]:
# inspect the query behind a DF, not interesting
prettyprint_sql(DF_Account_Dim.show_query())

This code cell is used to inspect the SQL query behind the processed `DF_myfilter` DataFrame using the `prettyprint_sql` function.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# inspect the query behind a processed DF 
prettyprint_sql(DF_myfilter.____)  # Use the method to show the query of the DataFrame

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
You need to call the `show_query()` method on the DataFrame object to retrieve the SQL query. Make sure to pass the correct DataFrame variable to the `prettyprint_sql` function.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
# inspect the query behind a processed DF 
prettyprint_sql(DF_myfilter.show_query())
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

This code cell replaces a view named `customer_BEZH_EN` with the query of the `DF_myfilter` DataFrame. It then executes the SQL query to create the view and retrieves the DataFrame associated with the view.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
replace_view_f = """REPLACE VIEW {view_name} AS
{query}"""
view_name = "customer_BEZH_EN"  # Define the name of the view

replace_view_ddl = replace_view_f.format(view_name = view_name, 
                                         query = ___.___)  # Use the method to get the query
prettyprint_sql(replace_view_ddl)  # Pretty print the SQL statement

tdml.execute_sql(replace_view_ddl)  # Execute the SQL statement

tdml.DataFrame(___)  # Create a DataFrame from the view name

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
The `view_name` variable is already defined. Call the `show_query()` method on `DF_myfilter` to get the SQL query that becomes the body of the view. The formatted DDL statement is passed to `prettyprint_sql` and `execute_sql`; afterwards, create a DataFrame from `view_name`.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
replace_view_f ="""REPLACE VIEW {view_name} AS
{query}"""
view_name = "customer_BEZH_EN"


replace_view_ddl = replace_view_f.format(view_name = view_name, 
                                         query = DF_myfilter.show_query())
prettyprint_sql(replace_view_ddl)

tdml.execute_sql(replace_view_ddl)

tdml.DataFrame(view_name)
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## Simple Joins

This code retrieves the DataFrames `DF_Transaction_Fact` and `DF_Account_Customer_Map` using the `tdml.DataFrame` function to join transactions with customer_dim via the account_customer map.

In [None]:
# join transactions with customer_dim via account_customer map
DF_Transaction_Fact = tdml.DataFrame("Transaction_Fact")

DF_Account_Customer_Map = tdml.DataFrame("Account_Customer_Map")

This code cell references the DataFrame DF_Customer_Dim.

In [None]:
DF_Customer_Dim

This code cell references the DataFrame DF_Account_Customer_Map.

In [None]:
DF_Account_Customer_Map

This code cell joins the DF_Transaction_Fact with DF_Account_Customer_Map and DF_Customer_Dim. It drops unnecessary columns and displays the SQL query used for the join operation.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_transaction_joined = DF_Transaction_Fact.join(
    DF_Account_Customer_Map, 
    on = ___,  # Specify the column to join on
    how = ___,  # Specify the type of join
    rprefix = "mymap"  # Specify the prefix for the right DataFrame
).join(
    DF_Customer_Dim,
    on = ___,  # Specify the column to join on
    how = ___,  # Specify the type of join
    lprefix = "mymap"  # Specify the prefix for the left DataFrame
).drop(columns=["mymap_Account_Id","mymap_Customer_Id"])  # Drop unnecessary columns

prettyprint_sql(DF_transaction_joined.show_query())

DF_transaction_joined

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
When joining DataFrames, ensure you specify the correct column names for the `on` parameter and choose the appropriate join type (`left`, `right`, `inner`, etc.). Use `rprefix` and `lprefix` to avoid column name conflicts. Finally, drop any columns that are not needed in the final DataFrame.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_transaction_joined = DF_Transaction_Fact.join(
    DF_Account_Customer_Map, 
    on = "Account_Id", 
    how="left", rprefix ="mymap"
).join(
    DF_Customer_Dim,
    on = "Customer_Id",
    how = "left", lprefix ="mymap"
).drop(columns=["mymap_Account_Id","mymap_Customer_Id"])

prettyprint_sql(DF_transaction_joined.show_query())

DF_transaction_joined
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

This code cell demonstrates an alternative approach to joining multiple tables using SQL query directly. It shows that proficient SQL users may find it faster to write the join operation in SQL rather than chaining operations in Python.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
tdml.DataFrame.____("""
SELECT 
    mytransaction.*,
    cust_dim.*
FROM 
    Transaction_Fact AS mytransaction
LEFT JOIN
    Account_Customer_Map AS mymap
ON
    (mytransaction.___ = mymap.___) 
LEFT JOIN
    Customer_Dim AS cust_dim
ON 
    (mymap.___ = cust_dim.___)  

"""   
)

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
When performing a SQL JOIN, ensure that the fields you are joining on are correctly matched. Look for the common fields between the tables involved in the JOIN operations. Use `from_query` to get a DataFrame based on a query.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
tdml.DataFrame.from_query("""
SELECT 
    mytransaction.*,
    cust_dim.*
FROM 
    Transaction_Fact AS mytransaction
LEFT JOIN
    Account_Customer_Map AS mymap
ON
    (mytransaction.Account_id = mymap.Account_id) 
LEFT JOIN
    Customer_Dim AS cust_dim
ON 
    (mymap.Customer_Id = cust_dim.Customer_Id) 

"""   
)
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## DataFrame manipulation: simple aggregates

This code cell calculates aggregate values based on the joined DataFrame DF_transaction_joined. It groups by "Customer_Id" and calculates the sum and mean of "Transaction_Amount". The SQL query used for this aggregation is displayed.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_tj = DF_transaction_joined
DF_tr_agg = DF_tj.groupby("____").agg(
    [DF_tj['Transaction_Amount'].____().alias("Total_Transaction_Amount"),  # Alias for total transaction amount
     DF_tj['Transaction_Amount'].____().alias("Average_Transaction_Amount"),  # Alias for average transaction amount
    ])

prettyprint_sql(DF_tr_agg.____())  # Show the SQL query for the aggregation

DF_tr_agg

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
Group by the customer identifier column (`Customer_Id`). The aliases already tell you which aggregations are needed: use `sum()` for the total and `mean()` for the average transaction amount. Finally, call `show_query()` to display the SQL query behind the DataFrame.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_tj = DF_transaction_joined
DF_tr_agg = DF_tj.groupby("Customer_Id").agg(
    [DF_tj['Transaction_Amount'].sum().alias("Total_Transaction_Amount"),
     DF_tj['Transaction_Amount'].mean().alias("Average_Transaction_Amount"),
    ])

prettyprint_sql(DF_tr_agg.show_query())

DF_tr_agg
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## Select columns based on datatypes

This code cell retrieves the data types of columns in the DF_master DataFrame and categorizes them into integer, float, and string features. It then selects columns with float data types from the DF_master DataFrame.

In [None]:
# No earlier cell in this notebook defines DF_master, so create it here from the
# "Master_Table" table uploaded in the Data Upload section. Without this, the
# cell fails with a NameError on a fresh kernel.
DF_master = tdml.DataFrame("Master_Table")

# Column names together with their Python type names.
# NOTE(review): `_column_names_and_types` is a private attribute and may change
# between teradataml releases - confirm on upgrade.
master_types = DF_master.dtypes._column_names_and_types
print(master_types)

# Bucket the column names by type name (entry[0] = column name, entry[1] = type name).
int_feats = [entry[0] for entry in master_types if entry[1] == "int"]
float_feats = [entry[0] for entry in master_types if entry[1] == "float"]
str_feats = [entry[0] for entry in master_types if entry[1] == "str"]

# Keep only the float columns.
DF_master.select(float_feats)

## Create columns, constant values and case whens
https://docs.teradata.com/r/Enterprise/Teradata-Package-for-Python-Function-Reference-20.00/teradataml-Data-Preparation-Functions/Date-and-Time-Functions/current_date

This code cell adds constants (campaign_id, campaign_author, campaign_planning_date) to the DF_master DataFrame for a campaign. The SQL query for displaying the DataFrame is printed, and the modified DataFrame DF_campaign is displayed.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# add constants to DF for a campaign
from sqlalchemy import func as f

thiscampaign_id = "f3354_002"
thiscampaign_author = "martin"

DF_campaign = DF_master.assign(
    campaign_id = ___,
    campaign_author = ___,
    campaign_planning_date = f.___()  # Use an SQL function to get the current date
)

prettyprint_sql(DF_campaign.___())  # Show the SQL query

DF_campaign

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
Pass the variables `thiscampaign_id` and `thiscampaign_author` defined above as the constant column values. Use `f.current_date()` to get today's date. To display the SQL query, use the `show_query()` method on `DF_campaign`.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
# add constants to DF for a campaign
from sqlalchemy import func as f

thiscampaign_id = "f3354_002"
thiscampaign_author = "martin"

DF_campaign = DF_master.assign(
    campaign_id = thiscampaign_id,
    campaign_author = thiscampaign_author,
    campaign_planning_date = f.current_date() # today
)

prettyprint_sql(DF_campaign.show_query())

DF_campaign
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

This code cell demonstrates how to manually set new columns in a DataFrame using different options such as full SQL, SQL literal_column, and DataFrame Column Expression. It assigns new columns based on certain conditions like Customer_Segment and Region. The resulting DataFrame is displayed and returned.

<br><br><div style=" border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
from sqlalchemy.sql import literal_column as col
# works for both picking out a column and injecting in some SQL

# manual settings of new columns
DF_c2 = DF_campaign.___(
    # Option 1: FULL SQL
      flag_PB =  
          col("CASE WHEN Customer_Segment = 'Private Banking' THEN 1 ELSE 0 END"),
        
        # Option 2: pick out column with SQL literal_column
     flag_ZH =
        tdml.case([(___("Region") == ___ , 1)], else_=0),      
    
    # Option 3: pick out column with DataFrame Column Expression
    flag_BE =
        tdml.case([(DF_campaign.___ == 'BE' , 1)], else_=0),
)

prettyprint_sql(DF_c2.show_query())

DF_c2

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">
For the `flag_ZH` and `flag_BE` columns, you need to specify the region code that should be checked against the "Region" column. The first one should check for "ZH" and the second one for "BE".
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
from sqlalchemy.sql import literal_column as col
# works for both picking out a column and injecting in some SQL

# manual settings of new columns
DF_c2 = DF_campaign.assign(
    # Option 1: FULL SQL
      flag_PB =  
          col("CASE WHEN Customer_Segment = 'Private Banking' THEN 1 ELSE 0 END"),
        
        # Option 2: pick out column with SQL literal_column
     flag_ZH =
        tdml.case([(col("Region") == "ZH" , 1)], else_=0),      
    
    # Option 3: pick out column with DataFrame Column Expression
    flag_BE =
        tdml.case([(DF_campaign.Region == "BE" , 1)], else_=0),
)

prettyprint_sql(DF_c2.show_query())

DF_c2
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

# NPath for Sequence Analysis

https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Path-and-Pattern-Analysis-Functions/nPath

In [None]:
DF_interaction = tdml.DataFrame("Interaction_Fact")
DF_interaction

## Simple NPath

https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Analyzing-Your-Data/Analytics-Database-Analytic-Functions/Path-and-Pattern-Analysis-Functions/nPath

Count Tuples of Interaction Type Events

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
npath_obj = tdml.NPath(
    data1= ___,  # DataFrame to be used for NPath analysis
    data1_partition_column = "___",  # Column to partition the data
    data1_order_column = "___",  # Column to order the data
    
    mode = "OVERLAPPING",
    symbols = ["___ IS NOT NULL AS E" ],  # Symbols for pattern matching
    pattern = "___" ,  # Pattern to be matched    
    result = ["NTH(___, 1 OF E) AS n1",  # Result columns for the analysis
              "NTH(___, 2 OF E) AS n2", ],

    volatile = True  
)

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: Use the DataFrame you created for the Interaction_Fact table.
2. Hint: This is the column used to partition the data, typically a unique identifier like "Customer_Id".
3. Hint: This is the column used to order the data, typically a date column like "Interaction_Date".
4. Hint: This is the column used for pattern matching, typically "Interaction_Type".
5. Hint: This is the pattern you want to match, often a sequence like "E.E".
6. Hint: This is the column used in the first NTH function, typically "Interaction_Type".
7. Hint: This is the column used in the second NTH function, typically "Interaction_Type".
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
npath_obj = tdml.NPath(
    data1= DF_interaction,
    data1_partition_column = "Customer_Id",
    data1_order_column = "Interaction_Date",
    
    mode = "OVERLAPPING",
    symbols = ["Interaction_Type IS NOT NULL AS E" ],
    pattern = "E.E" ,    
    result = ["NTH(Interaction_Type, 1 OF E) AS n1", 
              "NTH(Interaction_Type, 2 OF E) AS n2", ],

    volatile = True    
)
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

In [None]:
print(npath_obj.show_query())

In [None]:
DF_npath_result = npath_obj.result
DF_npath_result

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
DF_npath_counts = (DF_npath_result
                   .___(mycount = 1)  # Assign a new column 'mycount' with value 1
                   .___(["n1", "n2"]).count())  # Group by 'n1' and 'n2'
DF_npath_counts

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: You are creating a new column to count occurrences; the function name is assign
2. Hint: You need to group by the first two columns from the result. 
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_npath_counts = (DF_npath_result
                   .assign(mycount = 1)
                   .groupby(["n1","n2"]).count())
DF_npath_counts
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## Use NPath to analyse events changing before and after an event

Let's assume a policy change took effect on 1 June 2021, and we want to investigate how the interaction patterns differ before and after that change.

In [None]:
# Upload a table indicating that change: the query yields a single row with the
# change timestamp (2021-06-01) as `change_date` and the label 'change_rule' as
# `change_info`.
# The result is written to "change_date_table"; if_exists="replace" overwrites
# an existing table, so this cell can be re-run.
tdml.DataFrame.from_query("""
    SELECT 
    CAST(TO_TIMESTAMP('2021/06/01', 'YYYY/MM/DD') AS TIMESTAMP(6)) AS change_date,
    'change_rule' AS change_info
    """).to_sql("change_date_table", if_exists="replace")
# Display the freshly written table as a check.
tdml.DataFrame("change_date_table")

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
n_path_query = """

SELECT * FROM nPath(
	ON "Interaction_Fact"  PARTITION BY "Customer_Id" ORDER BY "Interaction_Date"
    ON "change_date_table"  DIMENSION ORDER BY ___  -- Order by the change date
	USING
	Mode(NONOVERLAPPING)
	Pattern('E.C.E')
	Symbols(
        Interaction_Fact.Interaction_Type IS NOT NULL AS ___,  -- Symbol for interaction type
        change_date_table.change_info IS NOT NULL AS ___  -- Symbol for change info
    
    )
	Result(
        FIRST(Customer_Id OF E) AS Customer_Id,
        FIRST(Interaction_Type OF E) AS n1,
        FIRST(Interaction_Date OF E) AS ts1,
        LAST(Interaction_Type  OF E) AS n2,
        LAST(Interaction_Date  OF E) AS ts2
        
    )
) as sqlmr

"""

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: You need to order by the column that represents the change date.
2. Hint: This symbol represents events in the interaction fact. See in the Pattern Parameter
3. Hint: This symbol represents the change event. See in the Pattern Parameter
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
n_path_query = """

SELECT * FROM nPath(
	ON "Interaction_Fact"  PARTITION BY "Customer_Id" ORDER BY "Interaction_Date"
    ON "change_date_table"  DIMENSION ORDER BY change_date
	USING
	Mode(NONOVERLAPPING)
	Pattern('E.C.E')
	Symbols(
        Interaction_Fact.Interaction_Type IS NOT NULL AS E,
        change_date_table.change_info IS NOT NULL AS C
    
    )
	Result(
        FIRST(Customer_Id OF E) AS Customer_Id,
        FIRST(Interaction_Type OF E) AS n1,
        FIRST(Interaction_Date OF E) AS ts1,
        LAST(Interaction_Type  OF E) AS n2,
        LAST(Interaction_Date  OF E) AS ts2
        
    )
) as sqlmr

"""
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

In [None]:
# check plausibility of npath results
tdml.DataFrame.from_query(n_path_query)

In [None]:
# count changes
tdml.DataFrame.from_query(f"""
SELECT n1, n2, COUNT(*) as count_rows
FROM
({n_path_query}) t
GROUP BY n1, n2
""")

# String based Feature Engineering

Derive important information from text-based fields.

https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/SQL-Functions-Expressions-and-Predicates/String-Operators-and-Functions

https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/SQL-Functions-Expressions-and-Predicates/Logical-Predicates/LIKE/NOT-LIKE

In [None]:
# data generation
import pandas as pd

# Semicolon-separated synthetic company names; split below into one name per row.
company_names = "Global Tech GmbH;Future Solutions Inc.;Green Plumbing Ltd.;Quick Consulting AG;Bright Logistics NV;Royal Media BV;Blue Bank SA;Spring Water Group;Urban Energy Partners;National Health GmbH;Schmidt Tech GmbH;Müller Consulting Ltd.;Johnson Media AG;Taylor Energy SA;Dupont Logistics NV;Peeters Water BV;Van den Berg Partners AG;Kowalski Tech GmbH;Berlin Logistics Group;London Media Ltd.;Brussels Health SA;Hamburg Water GmbH;Munich Energy NV;Antwerp Consulting BV;Leuven Logistics AG;Stuttgart Bank GmbH;Dortmund Tech SA;Amsterdam Solutions NV;Global Energy AG;Future Water BV;Green Media SA;Quick Tech NV;Bright Consulting Ltd.;Royal Bank GmbH;Blue Logistics AG;Spring Energy SA;Urban Solutions BV;National Tech GmbH;Schmidt Logistics AG;Müller Bank NV;Johnson Water SA;Taylor Consulting BV;Dupont Tech GmbH;Peeters Media AG;Van den Berg Energy SA;Kowalski Consulting NV;Berlin Bank GmbH;London Tech Ltd.;Brussels Solutions SA;Hamburg Logistics NV;Munich Media GmbH;Antwerp Energy AG;Leuven Tech SA;Stuttgart Consulting BV;Dortmund Bank NV;Amsterdam Logistics AG;Global Water GmbH;Future Tech Ltd.;Green Energy SA;Quick Bank NV;Bright Media AG;Royal Solutions GmbH;Blue Tech BV;Spring Consulting AG;Urban Logistics SA;National Water GmbH;Schmidt Media NV;Müller Energy AG;Johnson Solutions SA;Taylor Logistics BV;Dupont Water GmbH;Peeters Consulting AG;Van den Berg Bank NV;Kowalski Energy BV;Berlin Tech GmbH;London Logistics AG;Brussels Media SA;Hamburg Bank NV;Munich Solutions BV;Antwerp Tech AG;Leuven Energy SA;Stuttgart Water GmbH;Dortmund Media AG;Amsterdam Consulting BV"
company_names = company_names.split(";")
df_c = pd.DataFrame(data={"company_name":company_names})

# copy_to_sql appends to an existing table by default, so re-running this cell
# would duplicate every company. Replacing the table keeps the cell idempotent.
tdml.copy_to_sql(df_c, "company_names", if_exists="replace")

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# Exercise: replace ___ with the name (as a string) of the table written by
# copy_to_sql in the data generation cell.
DF_companies = tdml.DataFrame(___)  # Load the DataFrame from the "company_names" table
DF_companies

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: You need to specify the name of the table to load into the DataFrame.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
DF_companies = tdml.DataFrame("company_names")
DF_companies
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## step 1: manual queries

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# Exercise: fill each ___ with the indicator value for the THEN / ELSE branch.
# The regex is matched case-insensitively (match_arg 'i'): the company names
# contain "Bank" with a capital B, which the lowercase pattern would otherwise
# never match.
tdml.DataFrame.from_query("""
SELECT
    company_name,
    -- length
    CHARACTER_LENGTH(company_name) AS name_length,

    -- like
    CASE WHEN company_name LIKE '%GmbH%' THEN ___ ELSE ___ END AS is_gmbh,  -- blanks: value if GmbH is present, value if it is absent

    -- regex, match_arg i = case-insensitive
    CASE WHEN REGEXP_SUBSTR(company_name, '[A-Za-z]{3,} bank', 1, 1, 'i') IS NOT NULL THEN ___ ELSE ___ END AS has_bank_keyword  -- blanks: value if the bank keyword is present, value if it is absent

FROM
    company_names
""")

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: This value indicates the presence of 'GmbH'.
2. Hint: This value indicates the absence of 'GmbH'.
3. Hint: This value indicates the presence of a bank keyword.
4. Hint: This value indicates the absence of a bank keyword.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
# The regex is matched case-insensitively (match_arg 'i'): the company names
# contain "Bank" with a capital B, which the lowercase pattern would otherwise
# never match.
tdml.DataFrame.from_query("""
SELECT
    company_name,
    -- length
    CHARACTER_LENGTH(company_name) AS name_length,

    -- like
    CASE WHEN company_name LIKE '%GmbH%' THEN 1 ELSE 0 END AS is_gmbh,

    -- regex, match_arg i = case-insensitive
    CASE WHEN REGEXP_SUBSTR(company_name, '[A-Za-z]{3,} bank', 1, 1, 'i') IS NOT NULL THEN 1 ELSE 0 END AS has_bank_keyword

FROM
    company_names
""")
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## step 2: automated queries based on a list of keywords

In [None]:
# Legal-form keywords to look for in the company names; each one becomes an
# is_<suffix> indicator column in the generated query.
suffixes = [
    "LLC",
    "Inc",
    "Ltd",
    "GmbH",
    "AG",
    "NV",
    "BV",
    "SA",
    "Group",
    "Partners",
]

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# Exercise: build one SQL CASE expression per suffix. Each expression yields 1
# when the company name contains the suffix and 0 otherwise, aliased is_<suffix>.
indicator_columns = [
    f"CASE WHEN company_name LIKE '%{___}%' THEN 1 ELSE 0 END AS is_{___}"  # both blanks: the loop variable holding the current suffix
    for s in ___  # the list of suffixes to iterate over
]

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: This is the loop variable that holds the current suffix to look for in the company names.
2. Hint: This is the same variable as the first blank, used to create the alias for the column.
3. Hint: This is the list of suffixes you are iterating over.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
indicator_columns = [
    f"CASE WHEN company_name LIKE '%{s}%' THEN 1 ELSE 0 END AS is_{s}"
    for s in suffixes
] 
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

In [None]:
# Join the per-suffix CASE expressions before building the query: a backslash
# inside an f-string replacement field is a SyntaxError before Python 3.12.
indicator_select_list = ",\n    ".join(indicator_columns)

# Final SELECT: the company name plus one indicator column per suffix.
my_query = f"""
SELECT
    company_name,
    {indicator_select_list}
FROM
    company_names
"""
print(my_query)

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# Exercise: replace ___ with the DataFrame constructor method that takes a SQL
# query string, so that my_query is executed and shown as the cell output.
tdml.DataFrame.___(my_query)  # method to execute the query

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: This is the method used to create a DataFrame **from** a SQL **query**.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
tdml.DataFrame.from_query(my_query)
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

## step 3: "persist" the logic as a view

<br><br><div style="border-radius: 15px 15px 0 0; background: #a8d5e2; padding: 10px;"></div>

In [None]:
# Exercise: replace ___ with the SQL keyword for the kind of database object to
# (re)create; my_query is embedded as its definition.
tdml.execute_sql(f"REPLACE ___ company_features AS ({my_query})")  # database object to contain logic, not data

<details>
  <summary style="font-weight:bold; color:#9b870c;">Hint</summary>
  <div style="background-color:#fff9db; padding:10px; border-radius:5px; margin-top:5px;">

1. Hint: This db object does not contain data, it is just *view*ing the data in a certain way.
  </div>
</details>

<details>
  <summary style="font-weight:bold; color:#0c9b3b;">Solution</summary>
  <div style="background-color:#dbffdb; padding:10px; border-radius:5px; margin-top:5px;">

```python
tdml.execute_sql(f"REPLACE VIEW company_features AS ({my_query})")
```
  </div>
</details>

<div style=" border-radius: 0 0 15px 15px ; background: #a8d5e2; padding: 10px;"></div><br><br>

In [None]:
# Read company_features back as a DataFrame to check that the object created
# in the previous step can be queried.
tdml.DataFrame("company_features")

__outlook:__ Use a feature store for better feature management: available in latest teradataml version https://pypi.org/project/teradataml/ 

# clean up db & remove context

In [None]:
# Names of the database objects to remove. Each name is tried both as a table
# and as a view, since the list mixes both kinds.
db_obj_list = [
    'Account_Customer_Map',
    'Account_Dim',
    'Balance_Fact',
    'Customer_Details',
    'Customer_Dim',
    'Interaction_Fact',
    'Master_Table',
    'Transaction_Fact',
    'cluster_input',
    'customer_BEZH_EN',
    'master_filter',
    'quality_customerdetails',
    'waterfall_base',
    "change_date_table",
    "company_names",
    "company_features",
]

for torv in db_obj_list:
    for kind, drop_object in (("Table", tdml.db_drop_table), ("View", tdml.db_drop_view)):
        try:
            drop_object(torv)
        except Exception:
            # Best-effort cleanup: a failed drop (presumably because the name
            # is not an object of this kind, or does not exist) is skipped.
            # Catching Exception instead of using a bare except keeps
            # KeyboardInterrupt / SystemExit working.
            continue
        print(f"{kind} {torv} deleted.")

In [None]:
# Final step of the notebook: remove the teradataml context that was created
# with tdml.create_context at the top.
tdml.remove_context()