# Load libraries

In [2]:
# Ignore 'dask' warning
import pandas as pd
import numpy as np
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab

## Setting up constants

All of the constants below are required.

In [12]:
# Google Cloud project id used for the gcsfs file system
project = 'graydon-moving-indicator'
# Name of the bucket that is opened with gcs.get_bucket()
bucket_name = 'graydon-data'
# Bucket prefix under which the merged yearly CSV files are listed
dir_year_files_from = 'including_scores/merged_per_year/merged_cleaned'
# Local directory prefix the enriched yearly files are written to
dir_year_files_to = 'files_to_bucket'
# Years handled by the processing loop, newest first
years = ['2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008']
# Reduced column selection; not referenced in the visible part of this notebook
selected_columns_small = ['date_month', 'id_company', 'id_branch', 'date_start', 'from_date_start']

## Initializing bucket

In [4]:
# File-system style access to the bucket; reuses the `project` constant
# instead of repeating the project id as a second string literal.
fs = gcsfs.GCSFileSystem(project=project)
# Storage client and bucket handle, used for listing blobs.
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

## Function definitions

In [5]:
def read_one_year_from_bucket_merged_csv(year, dir_prefix = '', selected_columns = ''):
    """ Reads a whole year of data from the already merged files.

    Lists the blobs of the module-level ``bucket`` under ``dir_prefix`` and
    reads every blob whose name contains ``year`` as a CSV through the
    module-level ``fs``. When several blob names match, only the frame of the
    last matching blob is returned.

    Parameters
    ----------
    year : str
        Substring a blob name must contain for the blob to be read.
    dir_prefix : str
        Prefix passed to ``bucket.list_blobs``.
    selected_columns : list of str, '' or None
        Columns to load (passed as ``usecols``). ``''``, ``None`` or an empty
        list loads every column.

    Returns
    -------
    pandas.DataFrame
        The data that was read, or an empty frame when no blob name
        contains ``year``.
    """
    full_year_df = pd.DataFrame()

    # Setting up dictionary of column types
    dtype = {
        'id_company': np.float64,
        'id_branch': np.int64,
        'is_discontinued': bool,
        'code_discontinuation': np.float64,
        'code_financial_calamity': object,
        'financial_calamity_outcome': np.float64,
        'code_legal_form': np.float64,
        'qty_employees': np.float64,
        'year_qty_employees': np.float64,
        'id_company_creditproxy': object,
        'score_payment_assessment': np.float64,
        'amt_revenue': np.float64,
        'year_revenue': np.float64,
        'amt_operating_result': np.float64,
        'year_operating_result': object,
        'amt_consolidated_revenue': np.float64,
        'year_consolidated_revenue': object,
        'amt_consolidated_operating_result': np.float64,
        'year_consolidated_operating_result': object,
        'qty_issued_credit_reports': np.float64,
        'perc_credit_limit_adjustment': object,
        'color_credit_status': object,
        'rat_pd': object,
        'score_pd': np.float64,
        'has_increased_risk': bool,
        'is_sole_proprietor': bool,
        'code_sbi_2': np.float64,
        'code_sbi_1': object,
        'qty_address_mutations_total': np.float64,
        'qty_address_mutations_month': np.float64,
        'has_relocated': bool,
        'qty_started_names': np.float64,
        'qty_stopped_names': np.float64,
        'has_name_change': bool,
        'total_changeof_board_members_': np.float64,
    }

    # Columns parsed as dates (each column listed once)
    parse_dates = ['date_established', 'date_financial_calamity_started',
                   'date_financial_calamity_stopped', 'date_start', 'from_date_start']

    if selected_columns:
        # read_csv raises when parse_dates names a column that is not loaded,
        # so both the type map and the date columns are narrowed to the selection.
        wanted = set(selected_columns)
        read_options = {
            'usecols': selected_columns,
            'dtype': {col: col_type for col, col_type in dtype.items() if col in wanted},
            'parse_dates': [col for col in parse_dates if col in wanted] or False,
        }
    else:
        # '' / None / empty selection: load every column
        read_options = {'dtype': dtype, 'parse_dates': parse_dates}

    for blob in bucket.list_blobs(prefix=dir_prefix):
        if year in blob.name:
            # Path is built from the bucket that was actually listed
            with fs.open(bucket.name + '/' + blob.name) as f:
                full_year_df = pd.read_csv(f, sep=',', index_col=0, **read_options)
            print('The number of rows read: ', full_year_df.shape[0])
    return full_year_df

In [6]:
def get_combined_years(year, dir_year_files, selected_columns = None):
    """ Reads several years of merged data and stacks them into one frame.

    Parameters
    ----------
    year : str or iterable of str
        A single year or a collection of years to read. Each entry is passed
        to ``read_one_year_from_bucket_merged_csv``.
    dir_year_files : str
        Bucket prefix under which the yearly files are located.
    selected_columns : list of str or None
        Optional column subset forwarded to the reader.

    Returns
    -------
    pandas.DataFrame
        The yearly frames concatenated in the order given; an empty frame
        when no year was supplied.
    """
    # The argument is honoured (previously a global list shadowed it);
    # a single year string is treated as a one-element collection.
    years_to_read = [year] if isinstance(year, str) else list(year)

    # Collect first and concatenate once instead of growing a frame per iteration
    yearly_frames = [
        read_one_year_from_bucket_merged_csv(year = one_year,
                                             dir_prefix = dir_year_files,
                                             selected_columns = selected_columns)
        for one_year in years_to_read
    ]

    if not yearly_frames:
        return pd.DataFrame()
    return pd.concat(yearly_frames)

In [7]:
def save_df_locally(df, dir_prefix, year, as_json= False):
    """ Writes df to '<dir_prefix>/<year>_merged.json' or '<dir_prefix>/<year>_merged.csv'.

    The JSON variant is chosen when as_json is truthy, the CSV variant otherwise.
    """
    suffix = '_merged.json' if as_json else '_merged.csv'
    target_path = dir_prefix + '/' + year + suffix
    # Pick the matching pandas writer and hand it the target path
    write = df.to_json if as_json else df.to_csv
    write(target_path)

## Execution

In [8]:
# Reading relocation dates
blob_list = list(bucket.list_blobs(prefix='location_start_date.CSV'))

# Fail here with a clear message instead of a NameError in a later cell
if not blob_list:
    raise FileNotFoundError("No blob with prefix 'location_start_date.CSV' found in bucket " + bucket_name)

for blob in blob_list:
    # Path is built from the configured bucket name rather than a repeated literal
    with fs.open(bucket_name + '/' + blob.name) as f:
        # '1198-06-12' is read as a missing value, like '' and 'NA'
        df_relocation_dates = pd.read_csv(f, sep=',',
                                          na_values=['', '1198-06-12', 'NA'])
    # Convert both relocation columns to datetimes
    for date_column in ['date_relocation_last', 'date_relocation_penultimate']:
        df_relocation_dates[date_column] = pd.to_datetime(df_relocation_dates[date_column])

In [None]:
# Show the first five rows of the relocation dates as an HTML table
preview_table_html = DataFrame(df_relocation_dates).head(5).to_html()
HTML(preview_table_html)

# Manually by year

In [None]:
#i_year = '2018'

Reading year

In [None]:
# Disabled manual step: the statements are wrapped in a string literal, so nothing here executes.
# The same statements run inside the `for i_year in years` loop in the "In loop form" section.
'''
df_year = read_one_year_from_bucket_merged_csv(year = i_year, dir_prefix = dir_year_files_from)
qty_rows_input = len(df_year) 
'''

Use only the relocations of i_year and the years before:

In [None]:
# Disabled manual step: the statements are wrapped in a string literal, so nothing here executes.
# The same statements run inside the `for i_year in years` loop in the "In loop form" section.
'''
is_same_or_before_year = df_relocation_dates['date_relocation_last'].dt.year <= int(i_year)
df_relocation_dates_year = df_relocation_dates[is_same_or_before_year]
'''

Adding, for each branch-month combination, the latest relocation dates (last and penultimate) among the relocations selected above, i.e. those up to and including i_year:

In [None]:
# Disabled manual step: the statements are wrapped in a string literal, so nothing here executes.
# The same statements run inside the `for i_year in years` loop in the "In loop form" section.
'''
df_branch_months = df_year[['id_company', 'id_branch', 'date_month']]
df_branch_months = df_branch_months.merge(df_relocation_dates_year, 
                                          on=['id_company', 'id_branch'], 
                                          how='left')
df_max_dates = df_branch_months.groupby(['id_company', 'id_branch', 'date_month'])['date_relocation_last', 'date_relocation_penultimate'].max()
'''

Adding the new data to the original year data

In [None]:
# Disabled manual step: the statements are wrapped in a string literal, so nothing here executes.
# The same statements run inside the `for i_year in years` loop in the "In loop form" section.
'''
df_year = df_year.merge(df_max_dates,
                        on=['id_company', 'id_branch', 'date_month'], 
                        how='left')
qty_rows_output = len(df_year)
'''

Check that the number of rows for the year has remained constant:

In [None]:
# Disabled manual step: the statements are wrapped in a string literal, so nothing here executes.
# The same check runs inside the `for i_year in years` loop in the "In loop form" section.
'''
if(qty_rows_input != qty_rows_output):
    print('Mismatch in rows for ', i_year)
    print("Going in : ", qty_rows_input)
    print("Going out: ", qty_rows_output)
    raise Exception('Mismatch in rows for: {}'.format(i_year))
'''

In [None]:
#save_df_locally(df= df_year, dir_prefix= dir_year_files_to, year= i_year)

# In loop form

In [13]:
# Enrich every yearly file with the relocation dates and store the result locally.
for i_year in years:

    # Reading year
    print('1. Reading monthly branch data of ', i_year)
    df_year = read_one_year_from_bucket_merged_csv(year = i_year, dir_prefix = dir_year_files_from)
    qty_rows_input = len(df_year)

    # Use only the relocations of i_year and the years before
    # NOTE(review): the filter works on the year only, so a relocation later in
    # i_year than a row's date_month is still attached to that row - confirm
    # this is intended.
    print('2. Selecting relocation data of ', i_year, " and before")
    is_same_or_before_year = df_relocation_dates['date_relocation_last'].dt.year <= int(i_year)
    df_relocation_dates_year = df_relocation_dates[is_same_or_before_year]

    # Getting the maximum relocation dates for each branch month combination
    print('3. Getting relocation data for each month ', i_year, " per branch")
    df_branch_months = df_year[['id_company', 'id_branch', 'date_month']]
    df_branch_months = df_branch_months.merge(df_relocation_dates_year,
                                              on=['id_company', 'id_branch'],
                                              how='left')
    # Columns are selected with a list: selecting GroupBy columns with a bare
    # tuple of names is deprecated and no longer works in pandas 2.x.
    df_max_dates = (df_branch_months
                    .groupby(['id_company', 'id_branch', 'date_month'])
                    [['date_relocation_last', 'date_relocation_penultimate']]
                    .max())

    # Adding the new data to the original year data
    print('4. Adding relocation dates to company data of ', i_year)
    df_year = df_year.merge(df_max_dates,
                            on=['id_company', 'id_branch', 'date_month'],
                            how='left')
    qty_rows_output = len(df_year)

    # The left merge must not add or drop rows; stop when the counts differ
    if qty_rows_input != qty_rows_output:
        print('Mismatch in rows for ', i_year)
        print("Going in : ", qty_rows_input)
        print("Going out: ", qty_rows_output)
        raise Exception('Mismatch in rows for: {}'.format(i_year))

    print('5. Saving enriched yearly data of ', i_year, ' locally' )
    save_df_locally(df= df_year, dir_prefix= dir_year_files_to, year= i_year)

    print('6. Done processing and saving', i_year)

Processing file:  including_scores/merged_per_year/merged_cleaned/2011_merged.csv
The number of rows read:  19853262
2. Selecting relocation data of  2011  and before
3. Getting relocation data for each month  2011  per branch
4. Adding relocation dates to company data of  2011
5. Saving enriched yearly data of  2011  locally
6. Done processing and saving 2011
Processing file:  including_scores/merged_per_year/merged_cleaned/2010_merged.csv
The number of rows read:  19121202
2. Selecting relocation data of  2010  and before
3. Getting relocation data for each month  2010  per branch
4. Adding relocation dates to company data of  2010
5. Saving enriched yearly data of  2010  locally
6. Done processing and saving 2010
Processing file:  including_scores/merged_per_year/merged_cleaned/2009_merged.csv
The number of rows read:  17898540
2. Selecting relocation data of  2009  and before
3. Getting relocation data for each month  2009  per branch
4. Adding relocation dates to company data of  