In [1]:
# Ignore 'dask' warning
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab

In [2]:
# Setting up constants. All required
# Google Cloud project id (the same value is passed to gcsfs.GCSFileSystem in the next cell).
project = 'graydon-moving-indicator'
# Name of the Cloud Storage bucket that the next cell opens with `get_bucket`.
bucket_name = 'graydon-data'

In [3]:
# Initializing bucket
# `fs` provides file-like access to objects (used with pd.read_csv below);
# `bucket` is used to list the blobs under a prefix.
# Use the `project` constant instead of repeating the literal so the two cannot drift apart.
fs = gcsfs.GCSFileSystem(project=project)
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [None]:
#%%time
def read_all_csv_months_yearly_from_bucket_merged(years_to_read_in_list, dir_prefix = '', selected_columns = ''):
    """Read the monthly CSV files of the given years and merge them into one DataFrame.

    For every year, the blobs of the module-level ``bucket`` whose name starts with
    ``<dir_prefix>/<year>`` and contains ``'CSV'`` are read through the module-level
    ``fs`` as semicolon-separated files. Column names are normalised (stripped,
    lower-cased, spaces -> underscores, parentheses removed) and only rows with
    ``is_sole_proprietor == 0`` are kept.

    Args:
        years_to_read_in_list: iterable of years (e.g. ``['2016', '2017']``).
        dir_prefix: blob-name prefix under which the per-year folders live.
        selected_columns: columns passed to ``pd.read_csv(usecols=...)``; the default
            empty string (or ``None``) means "read all columns".

    Returns:
        One DataFrame with the rows of all processed files, original row indices
        kept; an empty DataFrame when no file matched.
    """
    # An empty string is not a valid `usecols` value, so map "no selection" to None.
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    monthly_frames = []
    rows_so_far = 0

    for year in years_to_read_in_list:
        print('Starting with year: ', year)
        # Build the prefix per year; reassigning `dir_prefix` here would make the
        # prefix grow with every iteration ('p/2016', then 'p/2016/2017', ...).
        year_prefix = '{}/{}'.format(dir_prefix, year)
        for blob in bucket.list_blobs(prefix=year_prefix):
            if 'CSV' in blob.name:
                print('Processing file: ', blob.name)
                # Take the bucket name from the blob itself rather than a hard-coded literal.
                with fs.open(blob.bucket.name + '/' + blob.name) as f:
                    one_month_df = pd.read_csv(f, sep=';', usecols=usecols)
                # Normalise the header first so the filter below can rely on the
                # normalised column name.
                one_month_df.columns = (
                    one_month_df.columns.str.strip().str.lower()
                    .str.replace(' ', '_', regex=False)
                    .str.replace('(', '', regex=False)
                    .str.replace(')', '', regex=False)
                )
                # NOTE: an additional filter on is_discontinued == 0 was commented out
                # in the original code and is still not applied.
                one_month_df = one_month_df[one_month_df['is_sole_proprietor'] == 0]
                monthly_frames.append(one_month_df)
                rows_so_far += one_month_df.shape[0]
            print('The number of rows so far is: ', rows_so_far)

    # Concatenate once at the end: appending inside the loop copies all previous
    # rows on every file, and DataFrame.append no longer exists in pandas >= 2.0.
    if not monthly_frames:
        return pd.DataFrame()
    return pd.concat(monthly_frames)

In [4]:
def read_one_year_from_bucket_merged_csv(year, dir_prefix = ''):
    """Load the pre-merged CSV of a single year from the bucket.

    Every blob under ``dir_prefix`` is listed; a blob whose name contains ``year``
    is read as a comma-separated file with its first column as index. If several
    blobs match, the frame read last is the one returned. An empty DataFrame is
    returned when nothing matches.
    """
    year_frame = pd.DataFrame()
    print('Starting with year: ', year)
    print(dir_prefix)
    candidates = list(bucket.list_blobs(prefix=dir_prefix))
    for candidate in candidates:
        blob_path = candidate.name
        print("blob", blob_path)
        if year in blob_path:
            print('Processing file: ', blob_path)
            with fs.open('graydon-data/' + blob_path) as handle:
                year_frame = pd.read_csv(handle, sep=',', index_col=0)
        # Reported after every listed blob, matching or not.
        print('The number of rows so far is: ', year_frame.shape[0])
    return year_frame

In [None]:
# Columns holding decimal-comma amounts/scores that are converted to float.
_DECIMAL_COLUMNS = (
    'amt_revenue',
    'amt_consolidated_revenue',
    'amt_consolidated_operating_result',
    'score_pd',
)

# Year columns in which the text 'NA' is replaced by 0 (values otherwise left as read).
_YEAR_COLUMNS_NA_TO_ZERO = (
    'year_revenue',
    'year_consolidated_revenue',
    'year_consolidated_operating_result',
)


def _strip_text(series):
    """Strip surrounding whitespace from string elements; leave all other elements as they are."""
    # `Series.str.strip()` returns NaN for non-string elements of a mixed column and
    # raises on a purely numeric column, so strip element-wise and only real strings.
    return series.map(lambda value: value.strip() if isinstance(value, str) else value)


def _replace_na_marker(series, replacement):
    """Return the stripped series with the literal text 'NA' replaced by `replacement`."""
    cleaned = _strip_text(series)
    is_marker = cleaned == 'NA'
    if is_marker.any():
        # Object dtype so that a non-string replacement (0 / None) can be stored.
        cleaned = cleaned.astype(object)
        cleaned[is_marker] = replacement
    return cleaned


def _to_decimal(series):
    """Convert decimal-comma text ('1,5') to float; 'NA' becomes 0.0, missing values stay NaN."""
    cleaned = _replace_na_marker(series, 0)
    cleaned = cleaned.map(lambda value: value.replace(',', '.') if isinstance(value, str) else value)
    return pd.to_numeric(cleaned).astype(float)


def clean_data_per_year(df):
    """Normalise the dtypes and missing-value markers of one year of company data.

    The frame is modified in place and also returned. The function can be applied
    again to an already cleaned frame without raising.

    Steps:
      * ``date_month`` -> datetime.
      * ``financial_calamity_outcome``: missing -> -1.
      * ``qty_employees``: 'NA' / missing -> 0, then integer. Parsed with
        ``pd.to_numeric``, so a fractional value is truncated by the integer cast.
      * ``year_qty_employees``: stripped, 'NA' -> None.
      * amount / score columns (``_DECIMAL_COLUMNS``): decimal comma -> float, 'NA' -> 0.0.
      * other year columns (``_YEAR_COLUMNS_NA_TO_ZERO``): stripped, 'NA' -> 0.
      * ``has_increased_risk`` -> bool, missing -> False.
      * ``code_sbi_2``: missing -> 0.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        The same DataFrame object, cleaned.
    """
    df['date_month'] = pd.to_datetime(df['date_month'])
    df['financial_calamity_outcome'] = df['financial_calamity_outcome'].fillna(-1)

    employees = pd.to_numeric(_replace_na_marker(df['qty_employees'], 0))
    df['qty_employees'] = employees.fillna(0).astype(int)
    df['year_qty_employees'] = _replace_na_marker(df['year_qty_employees'], None)

    for column in _DECIMAL_COLUMNS:
        df[column] = _to_decimal(df[column])
    for column in _YEAR_COLUMNS_NA_TO_ZERO:
        df[column] = _replace_na_marker(df[column], 0)

    # `astype(bool)` maps NaN to True and a later `== None` comparison never matches,
    # so mask out the missing entries explicitly to make them False.
    risk = df['has_increased_risk']
    df['has_increased_risk'] = risk.notna() & risk.astype(bool)

    df['code_sbi_2'] = df['code_sbi_2'].fillna(0)
    return df
        

In [5]:
# Load the merged 2018 file found under 'including_scores/merged_per_year' into df_2018.
df_2018 = read_one_year_from_bucket_merged_csv('2018', dir_prefix= 'including_scores/merged_per_year')

Starting with year:  2018
including_scores/merged_per_year
blob including_scores/merged_per_year/
The number of rows so far is:  0
blob including_scores/merged_per_year/2016_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/2017_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/2018_merged.csv
Processing file:  including_scores/merged_per_year/2018_merged.csv


  if (yield from self.run_code(code, result)):
  mask |= (ar1 == a)


The number of rows so far is:  19311866


In [6]:
# List the column names of the loaded 2018 frame.
df_2018.columns

Index(['date_month', 'id_company', 'id_branch', 'is_discontinued',
       'financial_calamity_outcome', 'qty_employees', 'year_qty_employees',
       'id_company_creditproxy', 'score_payment_assessment', 'amt_revenue',
       'year_revenue', 'amt_consolidated_revenue', 'year_consolidated_revenue',
       'amt_consolidated_operating_result',
       'year_consolidated_operating_result', 'perc_credit_limit_adjustment',
       'color_credit_status', 'rat_pd', 'score_pd', 'has_increased_risk',
       'is_sole_proprietor', 'code_sbi_2', 'code_sbi_1',
       'qty_address_mutations_total', 'qty_address_mutations_month',
       'has_relocated', 'has_name_change', 'vice_president'],
      dtype='object')

In [68]:
# Clean the 2018 frame (clean_data_per_year modifies df_2018 in place) and render
# its first 80 rows as an HTML table.
HTML(clean_data_per_year(df_2018).head(80).to_html())

Unnamed: 0,date_month,id_company,id_branch,is_discontinued,financial_calamity_outcome,qty_employees,year_qty_employees,id_company_creditproxy,score_payment_assessment,amt_revenue,year_revenue,amt_consolidated_revenue,year_consolidated_revenue,amt_consolidated_operating_result,year_consolidated_operating_result,perc_credit_limit_adjustment,color_credit_status,rat_pd,score_pd,has_increased_risk,is_sole_proprietor,code_sbi_2,code_sbi_1,qty_address_mutations_total,qty_address_mutations_month,has_relocated,has_name_change,vice_president
0,2018-01-01,3,10079408,False,-1.0,1,2018.0,3,20.0,3.55253e-316,2009,4.954293e-316,2015,2.34681e-318,2015.0,35,G,B,-4.95,False,False,64.0,,0,0,False,False,0
1,2018-01-01,5,10079416,False,-1.0,9,2018.0,1064993,20.0,0.0,0,,0,0.0,0.0,-5,G,A,-4.883,False,False,46.0,,0,0,False,False,0
2,2018-01-01,6,10079424,False,-1.0,25,2014.0,6,24.0,4.446591e-317,2011,,0,0.0,0.0,-100,R,D,0.0,True,False,41.0,,0,0,False,False,0
3,2018-01-01,9,10079432,False,-1.0,45,2018.0,9,20.0,0.0,0,,0,0.0,0.0,25,G,CCC,-4.99,False,False,47.0,,0,0,False,False,0
4,2018-01-01,12,35,False,-1.0,5,2018.0,12,28.0,0.0,0,,0,0.0,0.0,30,G,BBB,-5.071,False,False,64.0,,0,0,False,False,0
5,2018-01-01,14,43,False,-1.0,6,2018.0,14,20.0,0.0,0,,0,0.0,0.0,50,G,BBB,-5.007,False,False,64.0,,0,0,False,False,0
6,2018-01-01,17,51,False,-1.0,1,2018.0,17,20.0,0.0,0,,0,0.0,0.0,20,G,A,-5.01,False,False,64.0,,0,0,False,False,0
7,2018-01-01,23,94,True,-1.0,0,,23,29.0,0.0,0,,0,0.0,0.0,0,R,D,0.0,True,False,46.0,,0,0,False,False,0
8,2018-01-01,25,10079467,True,-1.0,0,,25,29.0,0.0,0,,0,0.0,0.0,5,R,D,0.0,True,False,61.0,,0,0,False,False,0
9,2018-01-01,47,140,False,-1.0,1,2007.0,47,20.0,0.0,0,,0,0.0,0.0,45,G,CCC,-5.834,False,False,70.0,,0,0,False,False,0


In [60]:
# Column dtypes of df_2018.
# NOTE(review): the saved output lists qty_employees / amt_revenue as numeric, so it appears
# to have been captured after clean_data_per_year ran — confirm with Restart & Run All.
df_2018.dtypes

date_month                            datetime64[ns]
id_company                                     int64
id_branch                                      int64
is_discontinued                                 bool
financial_calamity_outcome                   float64
qty_employees                                  int64
year_qty_employees                            object
id_company_creditproxy                        object
score_payment_assessment                     float64
amt_revenue                                  float64
year_revenue                                  object
amt_consolidated_revenue                     float64
year_consolidated_revenue                     object
amt_consolidated_operating_result            float64
year_consolidated_operating_result           float64
perc_credit_limit_adjustment                   int64
color_credit_status                           object
rat_pd                                        object
score_pd                                     f

In [11]:
# Summary (index, column dtypes) of df_2018.
# NOTE(review): the saved output still lists qty_employees / amt_revenue as object, so it
# appears to predate the cleaning step (execution count 11 vs. 68) — confirm on a fresh run.
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19311866 entries, 0 to 3205901
Data columns (total 28 columns):
date_month                            datetime64[ns]
id_company                            int64
id_branch                             int64
is_discontinued                       bool
financial_calamity_outcome            float64
qty_employees                         object
year_qty_employees                    object
id_company_creditproxy                object
score_payment_assessment              float64
amt_revenue                           object
year_revenue                          object
amt_consolidated_revenue              object
year_consolidated_revenue             object
amt_consolidated_operating_result     object
year_consolidated_operating_result    object
perc_credit_limit_adjustment          int64
color_credit_status                   object
rat_pd                                object
score_pd                              object
has_increased_risk        