In [1]:
# Ignore 'dask' warning
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab
import numpy as np

In [2]:
# Required configuration: target GCS bucket and the GCP project it is accessed under
bucket_name = 'graydon-data'
project = 'graydon-moving-indicator'

In [3]:
# Initializing bucket access.
# `fs` is the gcsfs handle used to open blobs as file objects; `bucket` is the
# google-cloud-storage handle used to list blobs.
# Reuse the `project` constant instead of repeating the project id literal, so
# the two can never drift apart.
fs = gcsfs.GCSFileSystem(project=project)
# NOTE(review): the storage client still resolves its project from the
# environment, as before — confirm whether it should also receive `project`.
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [4]:
# Columns requested from each monthly CSV (passed to pandas as `usecols`).
# Names are spelled exactly as in the raw file header; they are normalised
# (lower-cased, spaces -> underscores) only after loading.
selected_columns = [
    'date_month',
    'id_company',
    'id_branch',
    'is_discontinued',
    'financial_calamity_outcome',
    'date_established',
    'qty_employees',
    'year_qty_employees',
    'id_company_creditproxy',
    'score_payment_assessment',
    'amt_revenue',
    'year_revenue',
    'amt_consolidated_revenue',
    'year_consolidated_revenue',
    'amt_consolidated_operating_result',
    'year_consolidated_operating_result',
    'perc_credit_limit_adjustment',
    'color_credit_status',
    'rat_pd',
    'score_pd',
    'has_increased_risk',
    'is_sole_proprietor',
    'code_SBI_2',
    'code_SBI_1',
    'qty_address_mutations_total',
    'qty_address_mutations_month',
    'has_relocated',
    'has_name_change',
    'code_discontinuation',
    'code_financial_calamity',
    'qty_issued_credit_reports',
    # Capitalised role columns: after normalisation these are the columns
    # that aggregate_board_members() sums into a single total.
    'Associate',
    'Authorized official',
    'Board member',
    'Chairman',
    'Commissioner',
    'Director',
    'Liquidator',
    'Major',
    'Managing clerk',
    'Managing partner',
    'Member of the partnership',
    'Miscellaneous',
    'Owner',
    'Secretary',
    'Secretary/Treasurer',
    'Treasurer',
    'Unknown',
    'Vice President',
    'amt_operating_result',
    'code_legal_form',
    'date_financial_calamity_started',
    'date_financial_calamity_stopped',
    'date_start',
    'from_date_start',
    'qty_stopped_names',
    'qty_started_names',
    'year_operating_result',
]

In [5]:
def aggregate_board_members(df):
    """Aggregate the per-role board-member columns into one feature.

    The 18 role columns listed below are summed row-wise into a new column
    ``total_changeof_board_members_`` and then removed from the result.

    The input frame is left untouched: the total is computed first and
    attached to the frame returned by ``drop``. The previous version wrote the
    new column into the caller's frame before dropping, which mutated the
    input and raised SettingWithCopyWarning when called on a filtered slice.

    Args:
        df: DataFrame with normalised (lower-case, underscore) column names
            that contains every role column in ``col_list_to_sum``.

    Returns:
        A new DataFrame without the role columns and with
        ``total_changeof_board_members_`` as the last column.

    Raises:
        KeyError: if any of the role columns is missing from ``df``.
    """
    col_list_to_sum = ['associate', 'authorized_official', 'board_member', 'chairman', 'commissioner',
       'director', 'liquidator', 'major', 'managing_clerk', 'managing_partner',
       'member_of_the_partnership', 'miscellaneous', 'owner', 'secretary',
       'secretary/treasurer', 'treasurer', 'unknown', 'vice_president']
    role_total = df[col_list_to_sum].sum(axis=1)
    return df.drop(columns=col_list_to_sum).assign(total_changeof_board_members_=role_total)

In [6]:
def read_one_month_csv_from_bucket(year, month, last_day_of_month, dir_prefix = '', selected_columns= ''):
    """Read the CSV of a single month from the bucket into a DataFrame.

    Blobs are listed under ``<dir_prefix>/<year>``; a blob is selected when its
    name contains ``<month>-<last_day_of_month>``. If several blobs match, the
    last one listed wins (unchanged from the original behaviour).

    Args:
        year: Year as a string; appended to ``dir_prefix`` to form the listing prefix.
        month: Month as a string, in the form used inside the blob names.
        last_day_of_month: Last day of that month as a string, in the form
            used inside the blob names.
        dir_prefix: Folder prefix inside the bucket.
        selected_columns: Column names to load. ``''`` (the default) or
            ``None`` loads every column.

    Returns:
        The month's DataFrame with normalised column names (stripped,
        lower-cased, spaces replaced by underscores, parentheses removed), or
        an empty DataFrame when no blob matches.
    """
    year_prefix = dir_prefix + '/' + year
    print(year_prefix)

    # The original test `selected_columns == '' or None` never handled None
    # explicitly; treat both '' and None as "no column filter".
    no_column_filter = selected_columns is None or (
        isinstance(selected_columns, str) and selected_columns == '')
    usecols = None if no_column_filter else selected_columns

    file_marker = month + '-' + last_day_of_month
    one_month_df = None
    for blob in bucket.list_blobs(prefix=year_prefix):
        if file_marker in blob.name:
            print('Processing file: ', blob.name)
            # Open the blob from the bucket it was listed in rather than from
            # a hard-coded bucket name.
            with fs.open(bucket.name + '/' + blob.name) as f:
                one_month_df = pd.read_csv(f, sep=';', usecols=usecols)

    if one_month_df is None:
        # Nothing matched: return an empty frame instead of failing on the
        # `.str` accessor of an empty column index.
        return pd.DataFrame()

    one_month_df.columns = (one_month_df.columns.str.strip().str.lower()
                            .str.replace(' ', '_', regex=False)
                            .str.replace('(', '', regex=False)
                            .str.replace(')', '', regex=False))
    return one_month_df

In [7]:
#%%time
def read_all_csv_months_yearly_from_bucket_merged(years_to_read_in_list, dir_prefix = '', selected_columns = ''):
    """Read every monthly CSV of the given years and return them as one DataFrame.

    For each year, all blobs under ``<dir_prefix>/<year>`` whose name contains
    ``'CSV'`` are loaded, restricted to rows with ``is_sole_proprietor == 0``,
    passed through ``aggregate_board_members`` and ``clean_data_per_year``,
    and finally concatenated (the per-file index is kept, as before).

    Fixes compared with the previous version:
      * the listing prefix is rebuilt from the original ``dir_prefix`` for each
        year; previously ``dir_prefix`` was overwritten inside the loop, so a
        second year was searched under ``<prefix>/<year1>/<year2>``;
      * monthly frames are collected in a list and concatenated once instead
        of calling ``DataFrame.append`` per file (quadratic copying, and
        removed in pandas 2.0);
      * ``selected_columns=''`` (the default) or ``None`` now loads all
        columns instead of being passed to ``usecols`` as an invalid value;
      * the filtered rows are copied before being modified, which avoids
        SettingWithCopyWarning.

    Args:
        years_to_read_in_list: Iterable of year strings, e.g. ``['2018']``.
        dir_prefix: Folder prefix inside the bucket.
        selected_columns: Column names to load; ``''`` or ``None`` loads all.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no CSV blob
        was found.
    """
    no_column_filter = selected_columns is None or (
        isinstance(selected_columns, str) and selected_columns == '')
    usecols = None if no_column_filter else selected_columns

    monthly_frames = []
    rows_so_far = 0
    for year in years_to_read_in_list:
        print('Starting with year: ', year)
        # Keep `dir_prefix` untouched so every year starts from the same base.
        year_prefix = dir_prefix + '/' + year
        for blob in bucket.list_blobs(prefix=year_prefix):
            if 'CSV' in blob.name:
                print('Processing file: ', blob.name)
                # Open the blob from the bucket it was listed in rather than
                # from a hard-coded bucket name.
                with fs.open(bucket.name + '/' + blob.name) as f:
                    one_month_df = pd.read_csv(f, sep=';', usecols=usecols)
                one_month_df.columns = (one_month_df.columns.str.strip().str.lower()
                                        .str.replace(' ', '_', regex=False)
                                        .str.replace('(', '', regex=False)
                                        .str.replace(')', '', regex=False))
                # Work on an explicit copy of the filtered rows; the steps
                # below add and overwrite columns.
                one_month_df = one_month_df[one_month_df['is_sole_proprietor'] == 0].copy()
                one_month_df = aggregate_board_members(one_month_df)
                one_month_df = clean_data_per_year(one_month_df)
                monthly_frames.append(one_month_df)
                rows_so_far += one_month_df.shape[0]
            print('The number of rows so far is: ', rows_so_far)

    if not monthly_frames:
        return pd.DataFrame()
    return pd.concat(monthly_frames)

In [8]:
def _strip_text_and_replace_na(column, na_value=np.nan):
    """Trim whitespace in a text column and replace the literal 'NA' marker.

    Numeric columns are returned unchanged (they cannot contain 'NA'). Values
    that are already missing stay missing; every other value is converted to
    its string form and stripped, so non-string values in a mixed-type column
    are kept instead of being blanked out by the ``.str`` accessor.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    # Object dtype so that a non-string replacement (e.g. 0) can be stored.
    values = column.astype(object)
    stripped = values.where(values.isna(), values.astype(str).str.strip())
    return stripped.mask(stripped == 'NA', na_value)


def _to_decimal_number(column):
    """Convert a decimal-comma text column (e.g. '1234,5') to floats.

    'NA' and anything else that cannot be parsed as a number become NaN.
    """
    cleaned = _strip_text_and_replace_na(column)
    if pd.api.types.is_numeric_dtype(cleaned):
        return cleaned
    return pd.to_numeric(cleaned.str.replace(',', '.', regex=False), errors='coerce')


def clean_data_per_year(df):
    """Clean one loaded frame and return a formatted copy.

    Steps:
      * ``date_month`` is parsed to datetime;
      * missing ``financial_calamity_outcome`` values become -1;
      * text columns are stripped and the literal ``'NA'`` becomes NaN
        (``year_revenue`` uses 0 instead, as in the original code);
      * amount columns and ``score_pd`` use a decimal comma in the raw data
        and are converted to real floats;
      * ``has_increased_risk`` is cast to bool;
      * ``date_established`` values before 1700-12-31 are discarded and the
        rest parsed to datetime.

    Fixes compared with the previous version:
      * ``year_operating_result`` was overwritten with the values of
        ``year_consolidated_operating_result``; it is now cleaned from its
        own column;
      * ``astype(str)`` turned missing amounts into the string ``'nan'`` and
        left the columns as text; they are now numeric with NaN for missing;
      * the input frame is no longer modified in place.

    Args:
        df: DataFrame with normalised column names containing all columns
            touched below.

    Returns:
        A cleaned copy of ``df``.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    df = df.copy()

    df['date_month'] = pd.to_datetime(df['date_month'])
    df['financial_calamity_outcome'] = df['financial_calamity_outcome'].fillna(-1)

    text_columns = ('qty_employees', 'year_qty_employees',
                    'year_consolidated_revenue',
                    'year_consolidated_operating_result',
                    'year_operating_result')
    for name in text_columns:
        df[name] = _strip_text_and_replace_na(df[name])

    # NOTE(review): unlike the other year columns, 'NA' in year_revenue is
    # mapped to 0 rather than NaN. Kept as in the original — confirm intent.
    df['year_revenue'] = _strip_text_and_replace_na(df['year_revenue'], na_value=0)

    decimal_columns = ('amt_revenue', 'amt_consolidated_revenue',
                       'amt_consolidated_operating_result',
                       'amt_operating_result', 'score_pd')
    for name in decimal_columns:
        df[name] = _to_decimal_number(df[name])

    # NOTE(review): astype(bool) maps missing values (NaN) to True — confirm
    # that this column never contains missing values.
    df['has_increased_risk'] = df['has_increased_risk'].astype(bool)

    # The cut-off is applied as a string comparison before parsing, so very
    # old dates are blanked out before pd.to_datetime sees them. This relies
    # on the raw values being ISO formatted (YYYY-MM-DD).
    before_cutoff = df['date_established'] < '1700-12-31'
    df['date_established'] = pd.to_datetime(df['date_established'].mask(before_cutoff))

    return df

In [9]:
def save_df_locally(df, dir_prefix, year, as_json= False):
    """Save ``df`` on the local filesystem as JSON or CSV.

    The file is written to ``<dir_prefix>/<year>_merged.json`` when
    ``as_json`` is true, otherwise to ``<dir_prefix>/<year>_merged=.csv``.
    The target directory is created when it does not exist yet; previously
    the write failed in that case.

    Args:
        df: DataFrame to persist.
        dir_prefix: Directory to write into.
        year: Year as a string; used as the file name prefix.
        as_json: Write JSON instead of CSV.

    Returns:
        None.
    """
    import os  # local import: keeps this cell independent of the import cell

    if as_json:
        file_path = dir_prefix + '/' + year + '_merged.json'
    else:
        # NOTE(review): the '=' in the file name looks like a typo; it is kept
        # because readers of this file may depend on the exact name.
        file_path =  dir_prefix + '/' + year + '_merged=.csv'

    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if as_json:
        df.to_json(file_path)
    else:
        # NOTE(review): the index is written as the first, unnamed CSV column;
        # confirm whether consumers need it before passing index=False.
        df.to_csv(file_path)

### Reading one year of data

In [10]:
%%time
# Load and merge every monthly CSV found under 01_input/2018
df_one_year = read_all_csv_months_yearly_from_bucket_merged(
    years_to_read_in_list=['2018'],
    dir_prefix='01_input',
    selected_columns=selected_columns,
)

Starting with year:  2018
Processing file:  01_input/2018/modelling_2018-01-01_2018-01-31.CSV


  call = lambda f, *a, **k: f(*a, **k)


The number of rows so far is:  1907886
Processing file:  01_input/2018/modelling_2018-02-01_2018-02-28.CSV
The number of rows so far is:  3821899
Processing file:  01_input/2018/modelling_2018-03-01_2018-03-31.CSV
The number of rows so far is:  5740157
Processing file:  01_input/2018/modelling_2018-04-01_2018-04-30.CSV
The number of rows so far is:  7664280
Processing file:  01_input/2018/modelling_2018-05-01_2018-05-31.CSV


  call = lambda f, *a, **k: f(*a, **k)


The number of rows so far is:  9592892
Processing file:  01_input/2018/modelling_2018-06-01_2018-06-30.CSV
The number of rows so far is:  11525086
Processing file:  01_input/2018/modelling_2018-07-01_2018-07-31.CSV
The number of rows so far is:  13464338
Processing file:  01_input/2018/modelling_2018-08-01_2018-08-31.CSV
The number of rows so far is:  15408766
Processing file:  01_input/2018/modelling_2018-09-01_2018-09-30.CSV
The number of rows so far is:  17357026
Processing file:  01_input/2018/modelling_2018-10-01_2018-10-31.CSV


  call = lambda f, *a, **k: f(*a, **k)


The number of rows so far is:  19311866
Processing file:  01_input/2018/modelling_2018-11-01_2018-11-30.CSV


  call = lambda f, *a, **k: f(*a, **k)


The number of rows so far is:  21268067
Processing file:  01_input/2018/modelling_2018-12-01_2018-12-31.CSV
The number of rows so far is:  23224251
CPU times: user 15min 24s, sys: 3min 26s, total: 18min 50s
Wall time: 54min 3s


In [11]:
# Render the first 20 rows of the merged frame as an HTML table
HTML(df_one_year.head(20).to_html())

Unnamed: 0,date_month,id_company,id_branch,date_established,is_discontinued,code_discontinuation,code_financial_calamity,date_financial_calamity_started,date_financial_calamity_stopped,financial_calamity_outcome,code_legal_form,qty_employees,year_qty_employees,id_company_creditproxy,score_payment_assessment,amt_revenue,year_revenue,amt_operating_result,year_operating_result,amt_consolidated_revenue,year_consolidated_revenue,amt_consolidated_operating_result,year_consolidated_operating_result,qty_issued_credit_reports,perc_credit_limit_adjustment,color_credit_status,rat_pd,score_pd,has_increased_risk,is_sole_proprietor,code_sbi_2,code_sbi_1,qty_address_mutations_total,qty_address_mutations_month,date_start,from_date_start,has_relocated,qty_started_names,qty_stopped_names,has_name_change,total_changeof_board_members_
0,2018-01-01,3,10079408,1921-03-17,False,,,,,-1.0,5.0,1.0,2018.0,3,20.0,3.55253e-316,2009,9.219265e-318,2015.0,4.954293e-316,2015.0,2.34681e-318,2015.0,3.0,35,G,B,-4.95,False,False,64.0,,0,0,,,False,0,0,False,0
1,2018-01-01,5,10079416,1740-01-01,False,,,,,-1.0,5.0,9.0,2018.0,1064993,20.0,,0,,,,,,,0.0,-5,G,A,-4.883,False,False,46.0,,0,0,,,False,0,0,False,0
2,2018-01-01,6,10079424,1874-11-20,False,,F,2013-01-29,,-1.0,5.0,25.0,2014.0,6,24.0,4.446591e-317,2011,,,,,,,0.0,-100,R,D,,True,False,41.0,,0,0,,,False,0,0,False,0
3,2018-01-01,9,10079432,1897-05-01,False,,,,,-1.0,5.0,45.0,2018.0,9,20.0,,0,,,,,,,2.0,25,G,CCC,-4.99,False,False,47.0,,0,0,,,False,0,0,False,0
4,2018-01-01,12,35,1924-08-01,False,,,,,-1.0,5.0,5.0,2018.0,12,28.0,,0,,,,,,,0.0,30,G,BBB,-5.071,False,False,64.0,,0,0,,,False,0,0,False,0
5,2018-01-01,14,43,1821-05-01,False,,,,,-1.0,5.0,6.0,2018.0,14,20.0,,0,,,,,,,0.0,50,G,BBB,-5.007,False,False,64.0,,0,0,,,False,0,0,False,0
6,2018-01-01,17,51,NaT,False,,,,,-1.0,5.0,1.0,2018.0,17,20.0,,0,,,,,,,0.0,20,G,A,-5.01,False,False,64.0,,0,0,,,False,0,0,False,0
7,2018-01-01,23,94,1915-11-15,True,7.0,,,,-1.0,5.0,,,23,29.0,,0,,,,,,,0.0,0,R,D,,True,False,46.0,,0,0,,,False,0,0,False,0
8,2018-01-01,25,10079467,1970-12-07,True,7.0,,,,-1.0,5.0,,,25,29.0,,0,,,,,,,0.0,5,R,D,,True,False,61.0,,0,0,,,False,0,0,False,0
9,2018-01-01,47,140,1862-01-01,False,,,,,-1.0,5.0,1.0,2007.0,47,20.0,,0,,,,,,,1.0,45,G,CCC,-5.834,False,False,70.0,,0,0,,,False,0,0,False,0


In [12]:
# Size of the merged frame as a (number of rows, number of columns) tuple
df_one_year.shape

(23224251, 41)

In [13]:
# Saving df locally
save_df_locally(df= df_one_year, dir_prefix= 'files_to_bucket', year= '2018')