### Exercise 03: Import required datasets, csv-files, into SAP HANA Cloud.

#### Let's establish the connection to SAP HANA Cloud.

In [None]:
# Execute the setup notebook inside this kernel. The cells below use `conn`,
# `pd` and `create_dataframe_from_pandas` without defining them in this
# notebook, so they are presumably provided by the setup notebook — TODO confirm.
%run ./02-setup.ipynb

#### First dataset represents daily cashflows. Each row contains a specific date, date-related information, and the cashflow for that date. We want to use the dataset for forecasting future daily cashflows.

#### Let's start by reading the tab-delimited text file into a pandas dataframe.

In [None]:
# Load the cashflow data: fields are separated by tabs, the first line holds
# the column names, and '.' is treated as the thousands separator in numbers.
# NOTE(review): '.' is also pandas' default decimal mark — confirm that the
# numeric values in the file use '.' only for digit grouping.
df_cash = pd.read_csv(
    './data/CashFlows.txt',
    sep='\t',
    thousands='.',
    header=0,
)

# Show which columns were read
print('Columns: ', df_cash.columns)

In [None]:
# Upload the cashflow dataframe to the HANA table "CASHFLOW".
# Column types: "Date" is a TIMESTAMP, "Cash" is a DOUBLE, and every
# calendar/indicator column in between is an INTEGER.
hdf_cash = create_dataframe_from_pandas(
    conn,
    df_cash,
    table_name="CASHFLOW",
    force=True,
    replace=True,
    drop_exist_tab=True,
    table_structure={
        "Date": "TIMESTAMP",
        **dict.fromkeys(
            (
                "WorkingDaysIndices", "ReverseWorkingDaysIndices",
                "MondayMonthInd", "TuesdayMonthInd", "WednesdayMonthInd",
                "ThursdayMonthInd", "FridayMonthInd",
                "BeforeLastMonday", "LastMonday",
                "BeforeLastTuesday", "LastTuesday",
                "BeforeLastWednesday", "LastWednesday",
                "BeforeLastThursday", "LastThursday",
                "BeforeLastFriday", "LastFriday",
                "Last5WDaysInd", "Last5WDays",
                "Last4WDaysInd", "Last4WDays",
                "LastWMonth", "BeforeLastWMonth",
            ),
            "INTEGER",
        ),
        "Cash": "DOUBLE",
    },
)

# Preview the first rows of the uploaded table
hdf_cash.head(5).collect()

In [None]:
# Build a HANA dataframe over a subset of the CASHFLOW columns.
# The query is issued through the connection itself (`conn.sql`), so this cell
# does not depend on a hana_ml module alias being present in the kernel.
sql_cmd = 'SELECT "Date", "Cash", "MondayMonthInd", "FridayMonthInd" FROM "CASHFLOW"'
series_in = conn.sql(sql_cmd)

# Preview the first rows of the selection
series_in.head(5).collect()

#### Second dataset represents a US census dataset. Each row contains socio-demographic information for a person. We want to use it for classification.

#### Let's start by reading the csv-file into a pandas dataframe.

In [None]:
# Load the comma-separated census file; the first line holds the column names
df_census = pd.read_csv('./data/Census.csv', sep=',', header=0)

# Expose the dataframe's row index as an explicit "id" column
df_census = df_census.assign(id=df_census.index)

# Show which columns are present now
print('Columns: ', df_census.columns)

In [None]:
# Upload the census dataframe to the HANA table "CENSUS" with explicit
# column types (one entry per column).
hdf_census = create_dataframe_from_pandas(
    conn,
    df_census,
    table_name="CENSUS",
    force=True,
    replace=True,
    drop_exist_tab=True,
    table_structure={
        "id": "INTEGER",
        "age": "INTEGER",
        "workclass": "NVARCHAR(64)",
        "fnlwgt": "INTEGER",
        "education": "NVARCHAR(64)",
        "education-num": "INTEGER",
        "marital-status": "NVARCHAR(64)",
        "occupation": "NVARCHAR(64)",
        "relationship": "NVARCHAR(64)",
        "race": "NVARCHAR(64)",
        "sex": "NVARCHAR(64)",
        "capital-gain": "INTEGER",
        "capital-loss": "INTEGER",
        "hours-per-week": "INTEGER",
        "native-country": "NVARCHAR(64)",
        "class": "INTEGER",
    },
)

# Preview the first rows of the uploaded table
hdf_census.head(5).collect()

#### Third dataset represents shampoo sales. Each row contains shampoo sales for a specific date. We want to use it for forecasting future sales.

#### Let's start by reading the csv-file into a pandas dataframe.

In [None]:
# Load the comma-separated shampoo sales file. The column labels are supplied
# explicitly through `names` rather than taken from the file.
column_names = ["ID", "SALES"]

df_shampoo = pd.read_csv(
    './data/Shampoo.csv',
    sep=',',
    decimal='.',
    names=column_names,
)

# Show which columns were created
print('Columns: ', df_shampoo.columns)

# Preview the first rows
df_shampoo.head(5)

In [None]:
# Upload the shampoo sales dataframe to the HANA table "SHAMPOO"
hdf_shampoo = create_dataframe_from_pandas(
    conn,
    df_shampoo,
    table_name="SHAMPOO",
    force=True,
    replace=True,
    drop_exist_tab=True,
    table_structure={
        "ID": "INTEGER",
        "SALES": "DOUBLE",
    },
)

# Preview the first rows of the uploaded table
hdf_shampoo.head(5).collect()

#### Fourth dataset represents results from a marketing campaign for a bank. Each row contains information for a bank-customer incl. if the person responded yes or no to the offer. We want to use it for classification.

#### Let's start by reading the csv-file into a pandas dataframe.

In [None]:
# Load the semicolon-separated bank marketing file; the first line holds the
# column names
df_bank = pd.read_csv('./data/Bank.csv', sep=';', header=0)

# Convert every column name to upper case
df_bank = df_bank.rename(columns=str.upper)

# Show the resulting column names
print('Columns: ', df_bank.columns)

In [None]:
# Upload the bank marketing dataframe to the HANA table "BANK".
#
# The column names of df_bank are upper-cased right after loading, so the keys
# of table_structure must be upper-case as well; with lower-case keys none of
# the declared types would match a dataframe column.
#
# CONS.PRICE.IDX, CONS.CONF.IDX, EURIBOR3M and NR.EMPLOYED are declared as
# DOUBLE rather than INTEGER.
# NOTE(review): these indicators look like fractional values (as in the public
# bank-marketing data set) — confirm against Bank.csv.
hdf_bank = create_dataframe_from_pandas(
    conn,
    df_bank,
    table_name="BANK",
    force=True,
    replace=True,
    drop_exist_tab=True,
    table_structure={
        "AGE": "INTEGER",
        "JOB": "NVARCHAR(64)",
        "MARITAL": "NVARCHAR(64)",
        "EDUCATION": "NVARCHAR(64)",
        "DEFAULT": "NVARCHAR(64)",
        "HOUSING": "NVARCHAR(64)",
        "LOAN": "NVARCHAR(64)",
        "CONTACT": "NVARCHAR(64)",
        "MONTH": "NVARCHAR(64)",
        "DAY_OF_WEEK": "NVARCHAR(64)",
        "DURATION": "INTEGER",
        "CAMPAIGN": "INTEGER",
        "PDAYS": "INTEGER",
        "PREVIOUS": "INTEGER",
        "POUTCOME": "NVARCHAR(64)",
        "CONS.PRICE.IDX": "DOUBLE",
        "CONS.CONF.IDX": "DOUBLE",
        "EURIBOR3M": "DOUBLE",
        "NR.EMPLOYED": "DOUBLE",
        "Y": "NVARCHAR(64)",
    },
)

# Preview the first rows of the uploaded table
hdf_bank.head(5).collect()

#### Fifth dataset represents daily gas-prices. Each row contains the daily gas-price. We want to use it for forecasting future gas prices.

#### Let's start by reading the csv-file into a pandas dataframe.

In [None]:
# Load the semicolon-separated gas price file; the first line holds the
# column names and '.' is the decimal mark
df_fuelprices = pd.read_csv(
    './data/GasPrices.csv',
    sep=';',
    decimal='.',
    header=0,
)

# Show which columns were read
print('Columns: ', df_fuelprices.columns)

In [None]:
# Upload the gas price dataframe to the HANA table "FUEL_PRICES" with explicit
# column types (one entry per column).
hdf_fuelprices = create_dataframe_from_pandas(
    conn,
    df_fuelprices,
    table_name="FUEL_PRICES",
    force=True,
    replace=True,
    drop_exist_tab=True,
    table_structure={
        "date": "TIMESTAMP",
        "station_uuid": "NVARCHAR(5000)",
        "diesel": "DOUBLE",
        "e5": "DOUBLE",
        "e10": "DOUBLE",
        "dieselchange": "INTEGER",
        "e5change": "INTEGER",
        "e10change": "INTEGER",
    },
)

# Preview the first rows of the uploaded table
hdf_fuelprices.head(5).collect()