In [1]:
# Loading functions
%run ../modules/common
%run ../modules/cleaning_merging

In [2]:
# Imports (a 'dask' warning printed while this cell runs can be ignored)
from datetime import datetime
#from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab

In [3]:
# Reference date of the data set
date_dataset = datetime(2018, 1, 1)

# Columns belonging to the targets
# (presumably what a targets loader reads from the month files -- confirm)
columns_targets = [
    'date_month', 'id_company', 'id_branch',
    'is_sole_proprietor', 'has_relocated',
]

# Columns selected from the month files when loading features
columns_features = [
    'date_month', 'id_company', 'id_branch', 'is_discontinued',
    'financial_calamity_outcome', 'date_established',
    'qty_employees', 'year_qty_employees',
    'id_company_creditproxy', 'score_payment_assessment',
    'amt_revenue', 'year_revenue',
    'amt_consolidated_revenue', 'year_consolidated_revenue',
    'amt_consolidated_operating_result', 'year_consolidated_operating_result',
    'perc_credit_limit_adjustment', 'color_credit_status',
    'rat_pd', 'score_pd', 'has_increased_risk', 'is_sole_proprietor',
    'code_SBI_2', 'code_SBI_1',
    'qty_address_mutations_total', 'qty_address_mutations_month',
    'has_name_change', 'code_discontinuation', 'code_financial_calamity',
    'qty_issued_credit_reports',
    'Associate', 'Authorized official', 'Board member', 'Chairman',
    'Commissioner', 'Director', 'Liquidator', 'Major', 'Managing clerk',
    'Managing partner', 'Member of the partnership', 'Miscellaneous',
    'Owner', 'Secretary', 'Secretary/Treasurer', 'Treasurer', 'Unknown',
    'Vice President',
    'amt_operating_result', 'code_legal_form',
    'date_financial_calamity_started', 'date_financial_calamity_stopped',
    'date_start', 'from_date_start',
    'qty_stopped_names', 'qty_started_names', 'year_operating_result',
]

In [6]:
# Google Cloud Storage settings -- every one of them is required
project = 'graydon-moving-indicator'   # Cloud project name handed to GCS_Bucket
bucket_name = 'graydon-data'           # Name of the storage bucket
dir_data = '01_input'                  # Prefix inside the bucket under which blobs are listed
gc_bucket = GCS_Bucket(name_project=project, name_bucket=bucket_name)

In [7]:
# Retrieve the bucket object from the GCS_Bucket wrapper; it is used below to list blobs
bucket = gc_bucket.get_bucket()

In [8]:
# Presumably loads the feature columns for the period starting at `date_dataset`
# (get_features is defined outside this notebook -- confirm).
# NOTE(review): this call currently fails with
#   TypeError: unsupported operand type(s) for +: 'Bucket' and 'str'
# i.e. a Bucket object gets concatenated with a string somewhere inside the call;
# presumably get_features expects a bucket *name* rather than the Bucket object
# in the last position -- confirm against its definition before relying on df_features.
df_features = get_features(date_dataset, columns_features, dir_data, bucket)

TypeError: unsupported operand type(s) for +: 'Bucket' and 'str'

In [10]:
# The 12-month window starts at the data set's reference date
date_month = date_dataset

# Empty frame that the month files are combined into further down
df_months_combined = pd.DataFrame()

# Twelve consecutive month-end stamps, counted from the start date ...
df_date_months = pd.DataFrame({
    'date_month': pd.date_range(start=date_month, periods=12, freq="M").tolist(),
})
# ... truncated to month precision, which leaves the first day of each month
df_date_months['date_month'] = df_date_months['date_month'].values.astype('datetime64[M]')

# The file names of the required month files are looked up inline in the next
# cell instead of through get_month_filenames(df_date_months, bucket, dir_data).

In [11]:
month_files = []  # Blob names of the month files, in bucket listing order

# Tag every month with its calendar year: blobs are listed per '<dir_data>/<year>' prefix
df_date_months['year'] = df_date_months.date_month.dt.year

months_found = set()  # 'YYYY-mm-01' stamps for which at least one CSV blob was found
for year in df_date_months['year'].unique():
    # A month file is recognised by the first day of its month (YYYY-mm-01) in the blob name
    month_stamps = [
        month.strftime("%Y-%m-%d")
        for month in df_date_months.loc[df_date_months['year'] == year, 'date_month']
    ]
    dir_data_year = dir_data + '/' + str(year)
    for blob in bucket.list_blobs(prefix=dir_data_year):
        if 'CSV' not in blob.name:
            continue
        for stamp in month_stamps:
            if stamp in blob.name:
                month_files.append(blob.name)
                months_found.add(stamp)

# Months without a file used to be skipped silently; report them so that an
# incomplete period is visible before the files are combined.
months_missing = [
    stamp
    for stamp in df_date_months['date_month'].dt.strftime("%Y-%m-%d")
    if stamp not in months_found
]
if months_missing:
    print('No month file found for:', ', '.join(months_missing))

In [12]:
# Display the blob names collected in the previous cell
month_files

['01_input/2018/modelling_2018-01-01_2018-01-31.CSV',
 '01_input/2018/modelling_2018-02-01_2018-02-28.CSV',
 '01_input/2018/modelling_2018-03-01_2018-03-31.CSV',
 '01_input/2018/modelling_2018-04-01_2018-04-30.CSV',
 '01_input/2018/modelling_2018-05-01_2018-05-31.CSV',
 '01_input/2018/modelling_2018-06-01_2018-06-30.CSV',
 '01_input/2018/modelling_2018-07-01_2018-07-31.CSV',
 '01_input/2018/modelling_2018-08-01_2018-08-31.CSV',
 '01_input/2018/modelling_2018-09-01_2018-09-30.CSV',
 '01_input/2018/modelling_2018-10-01_2018-10-31.CSV']

In [None]:
# Cleaning, transforming and combining month files, then aggregating them to
# one row per (date_dataset, id_company, id_branch).
#
# NOTE(review): `fs` is not defined in any cell of this notebook; presumably it
# is a file-system object created by `%run ../modules/common` -- confirm.
#
# Only the columns in `columns_targets` are read: they are exactly the ones this
# cell filters on ('is_sole_proprietor'), groups by ('id_company', 'id_branch')
# and aggregates ('has_relocated', 'date_month'). Reading `columns_features`
# would never load 'has_relocated' and the aggregation below would fail.
if not month_files:
    raise ValueError('month_files is empty: there are no month files to combine')

monthly_frames = []  # One filtered frame per month file
qty_rows = 0         # Running row total, only used for the progress message
for month_file in month_files:
    with fs.open(bucket_name + '/' + month_file) as f:
        df_month = pd.read_csv(f, sep=';', usecols=columns_targets, index_col=False)
    # Keep only the rows that are not flagged as sole proprietor
    df_month = df_month[df_month['is_sole_proprietor'] == 0]
    # Normalise the headers: trimmed, lower case, '_' for spaces, no parentheses
    df_month.columns = (df_month.columns.str.strip()
                                        .str.lower()
                                        .str.replace(' ', '_', regex=False)
                                        .str.replace('(', '', regex=False)
                                        .str.replace(')', '', regex=False))
    monthly_frames.append(df_month)
    qty_rows += df_month.shape[0]
    print('The number of rows so far by adding ', month_file, ":", qty_rows)

# Concatenate once instead of appending inside the loop: this is linear rather
# than quadratic and does not pile up rows when the cell is run again.
df_months_combined = pd.concat(monthly_frames)
df_months_combined['date_dataset'] = date_month

# Aggregating data to year: the maximum of has_relocated and the last month seen
df_months_combined = (
    df_months_combined
    .groupby(['date_dataset', 'id_company', 'id_branch'])
    .agg({'has_relocated': 'max', 'date_month': 'max'})
    .rename(columns={"date_month": "date_month_last"})
    .reset_index()
)
df_months_combined['date_dataset'] = pd.to_datetime(df_months_combined['date_dataset'])
df_months_combined['id_company'] = df_months_combined['id_company'].astype(int)
df_months_combined['id_branch'] = df_months_combined['id_branch'].astype(int)

# A top-level `return` is a SyntaxError in a notebook cell; preview the result instead
df_months_combined.head()