### Notebook to Explore New Data Sources

Data sources were found at https://www.ine.pt/xportal/xmain?xpid=INE&xpgid=ine_princindic

In [1]:
import os
import pandas as pd
import xlrd 
import numpy as np

In [8]:
# Folder to explore, given relative to the working directory.
dirname = '../data/data_augmentation/'
# List the folder's contents to see which files are available.
os.listdir(dirname)

['08-01-19 01_52_32_theglobaleconomy.xls',
 '.DS_Store',
 'newhousing_construction_index_monthly_portugal.xlsx',
 '08-01-19 01_55_20_theglobaleconomy.txt',
 'cons_confidence_monthly_portugal.xls',
 'eco_activity_monthly_portugal.xlsx',
 'other_stats_monthly_portugal.xlsx',
 'turnover_index_industry_monthly_portugal.xlsx',
 'railway_passengers_monthly_portugal.xlsx',
 'harmonized_cons_prices_monthly_portugal.xlsx',
 'daily_euribor3m.csv',
 'hotel_guests_monthly_portugal.xlsx',
 '08-01-19 01_58_10_theglobaleconomy.csv',
 'unemployment_monthly_portugal.xlsx',
 '08-01-19 01_55_17_theglobaleconomy.csv',
 'industry_employment_index_monthly_portugal.xlsx']

In [9]:
# Read one of the two theglobaleconomy CSV exports into df1.
df1 = pd.read_csv(os.path.join(dirname, '08-01-19 01_58_10_theglobaleconomy.csv'))

In [10]:
# Read the other theglobaleconomy CSV export into df2.
df2 = pd.read_csv(os.path.join(dirname, '08-01-19 01_55_17_theglobaleconomy.csv'))

In [11]:
# Inspect df1's column labels (several carry leading/trailing spaces).
df1.columns

Index(['Country', 'Code', 'Year', 'Month', ' Consumer credit',
       ' Deposit interest rate', ' Mortgage credit interest rate',
       ' Business credit interest rate', ' Employment', ' Unemployment rate ',
       ' Current account balance', ' Exports', ' FDI',
       ' Foreign exchange reserves', ' Government expenditure',
       ' Budget balance', ' Labor cost', ' Exchange rate to USD',
       ' Minimum wage', ' Government debt '],
      dtype='object')

In [12]:
# Inspect df2's column labels (several carry leading/trailing spaces).
df2.columns

Index(['Country', 'Code', 'Year', 'Month', ' Private sector credit',
       ' Business credit', ' Household credit', ' Mortgage credit',
       ' Gross Domestic Product (GDP)', ' Consumer Price Index (CPI)',
       ' Money supply (broad money)', ' Household consumption', ' Investment',
       ' Economic growth ', ' Consumption growth',
       ' Investment as percent of GDP', ' Investment growth',
       ' Consumption as percent of GDP', ' Inflation monthly',
       ' Inflation annual', ' Retail sales', ' Business confidence survey',
       ' Consumer confidence survey', ' Economic growth Q-on-Q'],
      dtype='object')

In [13]:
# df1 already carries Year/Month, so remove them from df2 before the
# column-wise concat to avoid repeating those two columns.
date_cols = ['Year', 'Month']
df2.drop(columns=date_cols, inplace=True)

In [14]:
# df1 and df2 have identical timeframes, so they can be placed side by side
# directly (concat aligns the rows on the index).
df = pd.concat((df1, df2), axis='columns')

In [15]:
# Replace cells that contain exactly one space with NaN so they are treated as
# missing values (the following cell back-fills them).
df.replace(' ', np.nan, inplace=True)

In [16]:
# Back-fill: each missing value takes the next valid observation further down
# the same column. DataFrame.bfill is used instead of
# fillna(method='backfill') because the `method` argument of fillna is
# deprecated in recent pandas releases.
# NOTE(review): back-filling copies values from later rows into earlier ones;
# confirm that is acceptable if the rows are time-ordered.
df.bfill(inplace=True)

In [17]:
# Load the frame that was augmented in iteration 1 so the new indicators can be
# merged onto it.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df_w_time_stats = pd.read_pickle('../data/pickle_files/df_pickle_w_time_stats')

In [18]:
#checking data types prior to merging
key_series = (df['Year'], df['Month'], df_w_time_stats['year'], df_w_time_stats['month'])
for series in key_series:
    print(series.dtype)
#the data types match so we can merge directly

int64
int64
int64
int64


In [19]:
# Give the key columns the same names as in df_w_time_stats ('year', 'month')
# so the two frames can be merged on them.
df = df.rename(columns={'Year': 'year', 'Month': 'month'})

In [20]:
# Left merge: keep every row of df_w_time_stats and attach the new
# indicators that share its year and month.
df = pd.merge(df_w_time_stats, df, how='left', on=['year', 'month'])

In [21]:
#Checking that the merge went successfully: the merged frame should have the
#same number of rows as df_w_time_stats.
rows_match = len(df) == len(df_w_time_stats)
print("Row Match: ", rows_match)

Row Match:  True


In [22]:
# The merged frame's column labels carry leading and trailing spaces; strip them.
df.columns = df.columns.str.strip()

In [23]:
# Checking for duplicate columns: economic indicators had already been added
# to df_w_time_stats, so some of the newly merged columns may repeat them.
df.columns

Index(['cons_conf', 'ind_turnover', 'major_purch_expect',
       'major_purch_opportu', 'unempl_expect', 'inflation_delta_expect',
       'economy_expect', 'economy_past', 'financial_past',
       'savings_delta_expect', 'household_debt_ratio', 'savings_expect',
       'emplmnt_industry_index', 'railway_passengers', 'month', 'year', 'Date',
       'day', 'age', 'duration', 'campaign', 'pdays', 'previous',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month_last_contact', 'day_of_week_last_contact',
       'poutcome', 'y', 'Country', 'Code', 'Consumer credit',
       'Deposit interest rate', 'Mortgage credit interest rate',
       'Business credit interest rate', 'Employment', 'Unemployment rate',
       'Current account balance', 'Exports', 'FDI',
       'Foreign exchange reserves', 'Government expenditure', 'Budget balance',
       'Labor cost', 'Exchange rate

In [24]:
def duplicate_columns(frame):
    """Return the labels of columns whose values are repeated by a later column.

    Columns are only compared with other columns of the same dtype. A column
    is reported when at least one column to its right (within the same dtype
    group) holds identical values, so the right-most copy of a set of
    identical columns is never reported.

    Columns are compared with ``Series.equals`` so that missing values in the
    same positions count as equal. Comparing ``tolist()`` output instead would
    treat two identical float columns that contain NaN as different, because
    NaN != NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are checked.

    Returns
    -------
    list
        Column labels, one entry per reported column, ordered by dtype group
        and then by column position within the group.
    """
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for labels in groups.values():
        same_dtype = frame[labels]
        names = same_dtype.columns
        # Slice every column once up front instead of re-extracting it for
        # each pairwise comparison.
        columns = [same_dtype.iloc[:, pos] for pos in range(len(names))]

        for pos, candidate in enumerate(columns):
            # One match to the right is enough to report this column.
            if any(candidate.equals(other) for other in columns[pos + 1:]):
                dups.append(names[pos])

    return dups

In [25]:
# Collect the labels of columns in df whose values are repeated by another column.
dups = duplicate_columns(df)

In [26]:
#Verifying that these are actual duplicate columns: cons_conf should not be removed
#(it is listed only because another column repeats its values; presumably
#'Consumer confidence survey' - TODO confirm)
df[dups].head()

Unnamed: 0,cons_conf,Country,Country.1,Country.2,Country.3,Code,Code.1,Code.2,Code.3,Country.4,Country.5,Code.4,Code.5
0,-28.5,Portugal,Portugal,Portugal,Portugal,PRT,PRT,PRT,PRT,Portugal,Portugal,PRT,PRT
1,-28.5,Portugal,Portugal,Portugal,Portugal,PRT,PRT,PRT,PRT,Portugal,Portugal,PRT,PRT
2,-28.5,Portugal,Portugal,Portugal,Portugal,PRT,PRT,PRT,PRT,Portugal,Portugal,PRT,PRT
3,-28.5,Portugal,Portugal,Portugal,Portugal,PRT,PRT,PRT,PRT,Portugal,Portugal,PRT,PRT
4,-28.5,Portugal,Portugal,Portugal,Portugal,PRT,PRT,PRT,PRT,Portugal,Portugal,PRT,PRT


In [27]:
# Keep cons_conf: take it out of the removal list by name rather than by
# position, so the cell does not depend on the list order and can be re-run
# without accidentally removing a genuine duplicate from the list.
dups = [col for col in dups if col != 'cons_conf']

'cons_conf'

In [28]:
# Remove the columns confirmed as duplicates.
df = df.drop(columns=dups)

In [29]:
# Preview the first rows of the frame after the duplicate columns were dropped.
df.head()

Unnamed: 0,cons_conf,ind_turnover,major_purch_expect,major_purch_opportu,unempl_expect,inflation_delta_expect,economy_expect,economy_past,financial_past,savings_delta_expect,...,Consumption growth,Investment as percent of GDP,Investment growth,Consumption as percent of GDP,Inflation monthly,Inflation annual,Retail sales,Business confidence survey,Consumer confidence survey,Economic growth Q-on-Q
0,-28.5,113.72,-21.8,-65.1,48.7,52.2,-33.3,-57.1,-28.0,-37.8,...,5.12,23.32,9.55,64.6,0.44,2.78,116.9,0.11,-28.5,-0.51
1,-28.5,113.72,-21.8,-65.1,48.7,52.2,-33.3,-57.1,-28.0,-37.8,...,5.12,23.32,9.55,64.6,0.44,2.78,116.9,0.11,-28.5,-0.51
2,-28.5,113.72,-21.8,-65.1,48.7,52.2,-33.3,-57.1,-28.0,-37.8,...,5.12,23.32,9.55,64.6,0.44,2.78,116.9,0.11,-28.5,-0.51
3,-28.5,113.72,-21.8,-65.1,48.7,52.2,-33.3,-57.1,-28.0,-37.8,...,5.12,23.32,9.55,64.6,0.44,2.78,116.9,0.11,-28.5,-0.51
4,-28.5,113.72,-21.8,-65.1,48.7,52.2,-33.3,-57.1,-28.0,-37.8,...,5.12,23.32,9.55,64.6,0.44,2.78,116.9,0.11,-28.5,-0.51


In [30]:
# Count the missing values in each column.
df.isna().sum()

cons_conf                        0
ind_turnover                     0
major_purch_expect               0
major_purch_opportu              0
unempl_expect                    0
inflation_delta_expect           0
economy_expect                   0
economy_past                     0
financial_past                   0
savings_delta_expect             0
household_debt_ratio             0
savings_expect                   0
emplmnt_industry_index           0
railway_passengers               0
month                            0
year                             0
Date                             0
day                              0
age                              0
duration                         0
campaign                         0
pdays                            0
previous                         0
emp.var.rate                     0
cons.price.idx                   0
cons.conf.idx                    0
euribor3m                        0
nr.employed                      0
job                 

In [38]:
# List the columns that are still stored with object dtype.
df.select_dtypes(include = ['object']).columns

Index(['Government expenditure', 'Budget balance', 'Labor cost',
       'Government debt', 'Gross Domestic Product (GDP)',
       'Household consumption', 'Investment', 'Economic growth',
       'Consumption growth', 'Investment as percent of GDP',
       'Investment growth', 'Consumption as percent of GDP',
       'Economic growth Q-on-Q'],
      dtype='object')

In [42]:
#converting object columns into numeric - they were always supposed to be numeric
object_cols = df.select_dtypes(include=['object']).columns
df = df.astype({col: float for col in object_cols})

In [44]:
#final data quality check: inspect the dtype of every column after the conversion
df.dtypes

cons_conf                               float64
ind_turnover                            float64
major_purch_expect                      float64
major_purch_opportu                     float64
unempl_expect                           float64
inflation_delta_expect                  float64
economy_expect                          float64
economy_past                            float64
financial_past                          float64
savings_delta_expect                    float64
household_debt_ratio                    float64
savings_expect                          float64
emplmnt_industry_index                  float64
railway_passengers                      float64
month                                     int64
year                                      int64
Date                             datetime64[ns]
day                                       int64
age                                       int64
duration                                  int64
campaign                                

In [45]:
#Writing the latest augmented dataframe out to disk as a pickle file
df.to_pickle('../data/pickle_files/df_pickle_w_all_stats')