# Load libraries

In [None]:
# Ignore 'dask' warning
import pandas as pd
import numpy as np
from datetime import datetime
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab

## Data-set selection

In [None]:
# Reference date of the data-set (midnight, 1 January 2018).
date_dataset = datetime(2018, 1, 1)

# Columns read for the relocation indicator.
columns_dependent_vars = [
    'date_month', 'id_company', 'id_branch',
    'is_sole_proprietor',
    'has_relocated',
]

# Columns read for the company attributes.
columns_independent_vars = [
    # Record keys
    'date_month', 'id_company', 'id_branch',
    # Status and size
    'is_discontinued', 'financial_calamity_outcome', 'date_established',
    'qty_employees', 'year_qty_employees',
    # Credit information
    'id_company_creditproxy', 'score_payment_assessment',
    # Revenue and results
    'amt_revenue', 'year_revenue',
    'amt_consolidated_revenue', 'year_consolidated_revenue',
    'amt_consolidated_operating_result', 'year_consolidated_operating_result',
    # Risk
    'perc_credit_limit_adjustment', 'color_credit_status', 'rat_pd', 'score_pd',
    'has_increased_risk', 'is_sole_proprietor',
    # Sector codes and mutations
    'code_SBI_2', 'code_SBI_1',
    'qty_address_mutations_total', 'qty_address_mutations_month', 'has_name_change',
    'code_discontinuation', 'code_financial_calamity', 'qty_issued_credit_reports',
    # Per-role columns (summed later by aggregate_board_members)
    'Associate', 'Authorized official', 'Board member', 'Chairman', 'Commissioner',
    'Director', 'Liquidator', 'Major', 'Managing clerk', 'Managing partner',
    'Member of the partnership', 'Miscellaneous', 'Owner', 'Secretary',
    'Secretary/Treasurer', 'Treasurer', 'Unknown', 'Vice President',
    # Remaining attributes
    'amt_operating_result', 'code_legal_form',
    'date_financial_calamity_started', 'date_financial_calamity_stopped',
    'date_start', 'from_date_start',
    'qty_stopped_names', 'qty_started_names',
    'year_operating_result',
]

## Google Cloud setup

### Setting up constants

In [None]:
# Google Cloud constants; every one of them is required by the cells below.
bucket_name = 'graydon-data'             # bucket that holds the source files
dir_data = 'including_scores/unzipped'   # blob prefix of the monthly files
project = 'graydon-moving-indicator'     # Google Cloud project id

### Initializing bucket

In [None]:
# File-system style access to GCS; uses the `project` constant defined above
# instead of repeating the project id as a literal.
fs = gcsfs.GCSFileSystem(project=project)
# Client and bucket handle used for listing blobs.
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

## Function definitions

In [None]:
def month_delta(date, delta):
    """Return `date` shifted by `delta` calendar months.

    Args:
        date: A `datetime.date` or `datetime.datetime`.
        delta: Number of months to add (negative to subtract).

    Returns:
        An object of the same type as `date`. When the original day does not
        exist in the resulting month (e.g. 31 January + 1 month), the day is
        clamped to the last day of that month.
    """
    import calendar

    # Work on a zero-based running month count so that divmod yields the
    # year and month directly, also for negative deltas.
    year, month_index = divmod(date.year * 12 + (date.month - 1) + delta, 12)
    month = month_index + 1
    # monthrange applies the full Gregorian leap-year rule (century years
    # are leap years only when divisible by 400).
    last_day = calendar.monthrange(year, month)[1]
    return date.replace(year=year, month=month, day=min(date.day, last_day))

In [None]:
def aggregate_board_members(df):
    """Aggregate the per-role columns into one feature.

    Args:
        df: Data frame containing all role columns listed below (lower-cased,
            with underscores instead of spaces).

    Returns:
        A new data frame without the role columns and with their row-wise sum
        appended as `total_changeof_board_members_`. The input frame is left
        untouched.
    """
    col_list_to_sum = ['associate', 'authorized_official', 'board_member', 'chairman', 'commissioner',
                       'director', 'liquidator', 'major', 'managing_clerk', 'managing_partner',
                       'member_of_the_partnership', 'miscellaneous', 'owner', 'secretary',
                       'secretary/treasurer', 'treasurer', 'unknown', 'vice_president']
    role_total = df[col_list_to_sum].sum(axis=1)
    # Drop first, then assign: the total ends up as the last column and the
    # caller's frame (often a filtered slice) is never written to.
    return df.drop(columns=col_list_to_sum).assign(total_changeof_board_members_=role_total)

In [None]:
def _strip_na(series):
    """Strip surrounding whitespace and turn literal 'NA' values into missing values."""
    stripped = series.str.strip()
    return stripped.where(stripped != 'NA')


def _parse_decimal(series):
    """Parse decimal-comma text ('1,5') into floats; 'NA' and missing values become NaN."""
    return _strip_na(series).str.replace(',', '.').astype(float)


def clean_month_data(df):
    """Clean one month of data and return the formatted data frame.

    The frame is modified in place and also returned. The text columns handled
    via `.str` must hold strings (object dtype), as in the original code.

    Args:
        df: Month data frame with lower-cased column names.

    Returns:
        The same data frame with:
        * `date_month` and `date_established` parsed to datetimes
          (`date_established` values before 1700-12-31 are set to missing);
        * `financial_calamity_outcome` missing values replaced by -1;
        * `qty_employees` as integers ('NA'/missing -> 0);
        * `year_*` columns stripped, with 'NA' replaced by missing values;
        * `amt_*` columns and `score_pd` parsed from decimal-comma text to float;
        * `has_increased_risk` as booleans, missing values mapped to False.
    """
    df['date_month'] = pd.to_datetime(df['date_month'])
    df['financial_calamity_outcome'] = df['financial_calamity_outcome'].fillna(-1)

    # Missing head counts are counted as zero employees.
    df['qty_employees'] = _strip_na(df['qty_employees']).fillna('0').astype(int)

    # Year columns stay textual; only whitespace and 'NA' markers are removed.
    # `year_operating_result` is cleaned from its own column: the previous code
    # overwrote it with `year_consolidated_operating_result`.
    for column in ('year_qty_employees', 'year_revenue', 'year_consolidated_revenue',
                   'year_consolidated_operating_result', 'year_operating_result'):
        df[column] = _strip_na(df[column])

    for column in ('amt_revenue', 'amt_consolidated_revenue',
                   'amt_consolidated_operating_result', 'score_pd',
                   'amt_operating_result'):
        df[column] = _parse_decimal(df[column])

    # A plain astype(bool) maps NaN to True; missing flags must become False.
    # NOTE(review): non-empty strings (even '0') are truthy - confirm the
    # column is numeric/boolean in the source files.
    risk = df['has_increased_risk']
    df['has_increased_risk'] = risk.notnull() & risk.astype(bool)

    # Normalise every kind of missing marker (e.g. None) to NaN.
    df.loc[df['code_sbi_2'].isnull(), 'code_sbi_2'] = np.nan

    # ISO date strings compare correctly as text; dates before the cut-off are
    # blanked before parsing.
    established = df['date_established']
    df['date_established'] = pd.to_datetime(established.mask(established < '1700-12-31'))

    return df

In [None]:
def get_relocation_dates(date_dataset):
    """Read the relocation dates file from the bucket.

    Args:
        date_dataset: Not used by this function; kept so existing calls keep
            working.

    Returns:
        Data frame read from the blob whose name starts with
        'location_start_date.CSV', with `date_relocation_last` and
        `date_relocation_penultimate` parsed to datetimes.

    Raises:
        FileNotFoundError: If no blob with that prefix exists in the bucket.
    """
    blob_list = list(bucket.list_blobs(prefix='location_start_date.CSV'))
    if not blob_list:
        raise FileNotFoundError(
            "No blob with prefix 'location_start_date.CSV' found in bucket " + bucket_name)

    # When several blobs match, the last one wins (as before); earlier ones
    # are no longer downloaded just to be discarded.
    with fs.open(bucket_name + '/' + blob_list[-1].name) as f:
        df_relocation_dates = pd.read_csv(f, sep=',',
                                          na_values=['', '1198-06-12', 'NA'])

    for column in ('date_relocation_last', 'date_relocation_penultimate'):
        df_relocation_dates[column] = pd.to_datetime(df_relocation_dates[column])

    return df_relocation_dates

In [None]:
def get_month_filenames(df_date_months, dir_data):
    """Get the blob names of the month files for the months in the data frame.

    Args:
        df_date_months: Data frame with a datetime column `date_month`.
        dir_data: Blob prefix under which one sub-directory per year is
            expected (`<dir_data>/<year>`).

    Returns:
        List of blob names that contain 'CSV' and the `YYYY-mm-dd` stamp of
        one of the requested months, ordered by year and then by blob listing.
        The input data frame is not modified.
    """
    month_files = []

    months = df_date_months['date_month']
    years = months.dt.year

    # One listing call per distinct year.
    for year in years.unique():
        blobs = bucket.list_blobs(prefix=dir_data + '/' + str(year))
        # Format the stamps once per year instead of once per blob.
        month_stamps = [month.strftime("%Y-%m-%d") for month in months[years == year]]

        for blob in blobs:
            if 'CSV' not in blob.name:
                continue
            for stamp in month_stamps:
                if stamp in blob.name:
                    month_files.append(blob.name)

    return month_files

In [None]:
def get_features(date_month, columns_features, dir_data):
    """Get the dependent variable set (relocation indicator per branch).

    Reads the month files for the 12 months starting at `date_month`, keeps
    the rows with `is_sole_proprietor == 0` and aggregates them per branch.

    Args:
        date_month: First month of the 12-month window (datetime).
        columns_features: Columns to read from the month files; must include
            `is_sole_proprietor`, `has_relocated`, `date_month`, `id_company`
            and `id_branch`.
        dir_data: Blob prefix of the month files.

    Returns:
        Data frame with one row per (`date_dataset`, `id_company`,
        `id_branch`) holding the maximum of `has_relocated` and the last
        `date_month` seen (`date_month_last`).
    """
    # Month-end stamps for 12 months, truncated to the first day of each month.
    df_date_months = pd.DataFrame(pd.date_range(date_month, periods=12, freq="M").tolist(),
                                  columns=['date_month'])
    df_date_months['date_month'] = df_date_months['date_month'].values.astype('datetime64[M]')

    month_files = get_month_filenames(df_date_months, dir_data)

    # Collect the monthly frames and concatenate once: DataFrame.append copies
    # the accumulated data on every call and no longer exists in pandas 2.
    monthly_frames = []
    rows_so_far = 0
    for month_file in month_files:
        with fs.open(bucket_name + '/' + month_file) as f:
            df_month = pd.read_csv(f, sep=';', usecols=columns_features, index_col=False)
        df_month = df_month[df_month['is_sole_proprietor'] == 0]
        df_month.columns = (df_month.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', ''))
        monthly_frames.append(df_month)
        rows_so_far += df_month.shape[0]
        print('The number of rows so far by adding ', month_file, ":", rows_so_far)

    df_months_combined = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
    df_months_combined['date_dataset'] = date_month

    # Aggregating data to year
    df_months_combined = df_months_combined.groupby(['date_dataset',
                                                      'id_company',
                                                      'id_branch']).agg({'has_relocated': 'max',
                                                                         'date_month': 'max'})
    df_months_combined = df_months_combined.rename(columns={"date_month": "date_month_last"})
    df_months_combined = df_months_combined.reset_index()

    # Pin the key dtypes explicitly so the later merge on these keys lines up.
    df_months_combined['date_dataset'] = pd.to_datetime(df_months_combined['date_dataset'])
    df_months_combined['id_company'] = df_months_combined['id_company'].astype(int)
    df_months_combined['id_branch'] = df_months_combined['id_branch'].astype(int)

    return df_months_combined

In [None]:
def get_targets(date_month, columns_targets, dir_data):
    """Get the independent variable set (cleaned monthly company attributes).

    Reads the month files for the 12 months preceding `date_month`, keeps the
    rows with `is_sole_proprietor == 0`, aggregates the per-role columns and
    cleans every month before combining them.

    Args:
        date_month: Data-set date; the window starts 12 months before it.
        columns_targets: Columns to read from the month files; must include
            everything `aggregate_board_members` and `clean_month_data` use.
        dir_data: Blob prefix of the month files.

    Returns:
        Data frame with all cleaned monthly rows and a `date_dataset` column
        set to `date_month`. Contains only that column when no month file was
        found.
    """
    # Month-end stamps for the 12 months before date_month, truncated to the
    # first day of each month.
    date_start = month_delta(date_month, -12)
    df_date_months = pd.DataFrame(pd.date_range(date_start, periods=12, freq="M").tolist(),
                                  columns=['date_month'])
    df_date_months['date_month'] = df_date_months['date_month'].values.astype('datetime64[M]')

    month_files = get_month_filenames(df_date_months, dir_data)

    # Collect the monthly frames and concatenate once: DataFrame.append copies
    # the accumulated data on every call and no longer exists in pandas 2.
    monthly_frames = []
    rows_so_far = 0
    for month_file in month_files:
        with fs.open(bucket_name + '/' + month_file) as f:
            df_month = pd.read_csv(f, sep=';', usecols=columns_targets, index_col=False)
        df_month = df_month[df_month['is_sole_proprietor'] == 0]
        df_month.columns = (df_month.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', ''))
        df_month = aggregate_board_members(df_month)
        df_month = clean_month_data(df_month)
        monthly_frames.append(df_month)
        rows_so_far += df_month.shape[0]
        print('The number of rows so far by adding', month_file, ":", rows_so_far)

    df_months_combined = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
    df_months_combined['date_dataset'] = date_month

    return df_months_combined

In [None]:
def upload_df_to_gc_bucket(df, dir_prefix, year, as_json=False):
    """Upload a pandas data frame to the GC bucket as JSON or CSV.

    Args:
        df: Data frame to upload.
        dir_prefix: Target directory (blob prefix) inside the bucket.
        year: Year used in the file name `<year>_merged.<ext>`; may be a
            string or a number.
        as_json: Write JSON when True, CSV otherwise.
    """
    if as_json:
        payload, extension, content_type = df.to_json(), 'json', 'text/json'
    else:
        payload, extension, content_type = df.to_csv(), 'csv', 'text/csv'

    # str() so that an integer year no longer raises a TypeError on concatenation.
    new_file_path = dir_prefix + '/' + str(year) + '_merged.' + extension
    gcs_datalab.Bucket(bucket_name).item(new_file_path).write_to(payload, content_type)

In [None]:
def save_df_locally(df, dir_prefix, dataset_name, as_json= False):
    """Write the data frame to `<dir_prefix>/dataset_<dataset_name>` on the server.

    The file gets the extension `.json` when `as_json` is true and `.csv`
    otherwise.
    """
    path_without_extension = dir_prefix + '/dataset_' + dataset_name
    if not as_json:
        df.to_csv(path_without_extension + '.csv')
        return
    df.to_json(path_without_extension + '.json')

# Data collection

## Dependent variable: relocation indicator (`df_features`)

In [None]:
# `columns_features` is not defined anywhere in this notebook; the column list
# that contains `has_relocated` is `columns_dependent_vars`.
df_features = get_features(date_dataset, columns_dependent_vars, dir_data)

## Independent variables: company attributes (`df_targets`)

In [None]:
# `columns_targets` is not defined anywhere in this notebook; the column list
# with the company attributes is `columns_independent_vars`.
df_targets = get_targets(date_dataset, columns_independent_vars, dir_data)

## Combine dependent and independent variables

In [None]:
# Left join: keep every row of df_features and attach the matching df_targets rows.
df_total = pd.merge(df_features, df_targets,
                    how='left',
                    on=['id_company', 'id_branch', 'date_dataset'])

## Saving the complete data-set

In [None]:
# Written as a CSV file under files_to_bucket/, named after the data-set date.
save_df_locally(df_total, 'files_to_bucket', str(date_dataset))

# Checks

## Dependent variable : relocation indicator

Example data:

In [None]:
df_features.head(n=5)

In [None]:
# Number of rows flagged as relocated against the total number of rows.
print("Relocated:", sum(df_features['has_relocated']), "of", len(df_features.index), "branches")

For the companies that 'died' within the dependent window the date_month_last value is before _yyyy-12-01_:

In [None]:
# The column name must be a string; the bare name raised a NameError.
df_features.groupby(['date_month_last']).count()

## Independent variables

Example data:

In [None]:
df_targets.head(n=5)

Number of records per month

In [None]:
df_targets.groupby(by=['date_month']).count()