In [9]:
# Imports. NOTE: a 'dask' warning may be printed while importing; it can be ignored.
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab
from datetime import date
import numpy as np

In [10]:
# Configuration constants -- both are required by the cells below.
# `project` is the Google Cloud project id; `bucket_name` is the storage
# bucket that is opened in the bucket-initialisation cell.
bucket_name = "graydon-data"
project = "graydon-moving-indicator"

In [11]:
# Initializing bucket
# Two handles onto the same storage are created here:
#   * `fs`     - gcsfs file-system object, used to open blobs as file objects
#   * `bucket` - google-cloud-storage bucket, used to list blobs
# The project id comes from the `project` constant rather than a repeated
# literal, so it only has to be changed in one place.
fs = gcsfs.GCSFileSystem(project=project)
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [12]:
# Column name -> dtype mapping handed to `pd.read_csv(dtype=...)`.
#
# Keys must equal the CSV header names exactly. Four keys previously carried a
# trailing space ('has_relocated_next_year ', 'code_legal_form_has_changed ',
# 'is_discontinued_any ', 'has_financial_calamity '), so they did not equal the
# corresponding column names; the duplicated 'delta_score_payment_assessment'
# entry has been removed as well.
#
# NOTE(review): pandas cannot read a column containing missing values as plain
# `bool`; if any of the boolean columns below can be empty, switch that entry
# to 'boolean' (nullable) or `object` -- TODO confirm against the data.
# NOTE(review): `id_company` is read as float64, presumably because it can be
# missing -- verify.
dtype = {
    # identifiers and classification codes
    'id_company': np.float64,
    'id_branch': np.int64,
    'code_sbi_2': np.float64,
    'code_sbi_1': object,
    # relocation flags / counters
    'has_relocated': bool,
    'has_relocated_next_year': bool,
    'qty_address_mutations_total': np.float64,
    #'has_name_change'  :bool,
    'ratio_operating_result_consolidated_operating_result': np.float64,
    'ratio_revenue_consolidated_revenue': np.float64,
    # flag counts
    'qty_green_flags': np.float64,
    'qty_orange_flags': np.float64,
    'qty_red_flags': np.float64,
    # one column per rating class
    'A': np.float64,
    'AA': np.float64,
    'AAA': np.float64,
    'B': np.float64,
    'BB': np.float64,
    'BBB': np.float64,
    'C': np.float64,
    'CC': np.float64,
    'CCC': np.float64,
    'D': np.float64,
    'NR': np.float64,
    # ages / recency
    'company_age': np.float64,
    'years_since_last_amt_consolidated_operating_result': np.float64,
    'years_since_last_amt_consolidated_revenue': np.float64,
    'years_since_last_amt_operating_result': np.float64,
    'years_since_last_qty_employees': np.float64,
    'years_since_last_amt_revenue': np.float64,
    # deltas
    'delta_qty_employees': np.float64,
    'delta_qty_issued_credit_reports': np.float64,
    'delta_score_payment_assessment': np.float64,
    # boolean status columns
    'code_legal_form_has_changed': bool,
    'is_discontinued_any': bool,
    'has_financial_calamity': bool,
    # means
    'mean_amt_consolidated_operating_result': np.float64,
    'mean_amt_consolidated_revenue': np.float64,
    'mean_amt_operating_result': np.float64,
    'mean_amt_revenue': np.float64,
    'mean_qty_employees': np.float64,
    'mean_qty_issued_credit_reports': np.float64,
    'mean_score_payment_assessment': np.float64,
    'mean_score_pd': np.float64,
    # per-year counters
    'qty_address_mutations_year': np.float64,
    'qty_started_names_year': np.float64,
    'qty_stopped_names_year': np.float64,
    'qty_board_changes_year': np.float64,
    # variances
    'variance_qty_issued_credit_reports': np.float64,
    'variance_score_payment_assessment': np.float64,
    'variance_score_pd': np.float64,
}

In [13]:
# Columns that `pd.read_csv` should parse as dates (passed as `parse_dates=`).
parse_dates = [
    'date_month',
    'date_relocation_last',
    'date_relocation_penultimate',
]

In [14]:
def read_one_year_from_bucket_merged_csv(year, dir_prefix = ''):
    """Read all merged CSV files of one year from the bucket into one DataFrame.

    Lists the blobs of the module-level ``bucket`` below ``dir_prefix`` and
    reads every blob whose name contains ``year`` through the module-level
    ``fs``, using the module-level ``dtype`` and ``parse_dates`` settings.
    When several files match, their rows are stacked in listing order
    (previously each file overwrote the one before, so only the last survived).

    Parameters
    ----------
    year : str or int
        Year to select; converted to ``str`` and matched as a substring of the
        blob name.
    dir_prefix : str, optional
        Blob-name prefix used to restrict the listing.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files, or an empty DataFrame if none matched.
    """
    # Accept both 2017 and '2017'; `int in str` would raise a TypeError.
    year = str(year)
    print('Starting with year: ', year)
    print(dir_prefix)

    frames = []
    rows_so_far = 0
    for blob in bucket.list_blobs(prefix=dir_prefix):
        print("blob", blob.name)
        # Names ending in '/' are folder placeholders, not CSV files.
        if year in blob.name and not blob.name.endswith('/'):
            print('Processing file: ', blob.name)
            # Build the path from the bucket object instead of a hard-coded name.
            with fs.open(bucket.name + '/' + blob.name) as f:
                frame = pd.read_csv(f, sep=',', index_col=0, dtype=dtype, parse_dates=parse_dates)
            frames.append(frame)
            rows_so_far += frame.shape[0]
        print('The number of rows so far is: ', rows_so_far)

    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        # Single file: return it as-is and avoid copying a large frame.
        return frames[0]
    return pd.concat(frames)

In [15]:
# Load the merged, cleaned 2017 file(s) from the aggregated directory.
one_year_df = read_one_year_from_bucket_merged_csv(
    year='2017',
    dir_prefix='including_scores/merged_per_year/aggregated',
)

Starting with year:  2017
including_scores/merged_per_year/aggregated
blob including_scores/merged_per_year/aggregated/
The number of rows so far is:  0
blob including_scores/merged_per_year/aggregated/2017_merged_cleaned.csv
Processing file:  including_scores/merged_per_year/aggregated/2017_merged_cleaned.csv


  mask |= (ar1 == a)


The number of rows so far is:  1965800


In [17]:
# Show the first 100 rows as an HTML table built with `to_html()`.
HTML(
    DataFrame(one_year_df)
    .head(100)
    .to_html()
)

Unnamed: 0,id_branch,date_month,id_company,code_sbi_2,code_sbi_1,qty_address_mutations_total,has_relocated,has_name_change,date_relocation_last,date_relocation_penultimate,ratio_operating_result_consolidated_operating_result,ratio_revenue_consolidated_revenue,qty_green_flags,qty_orange_flags,qty_red_flags,A,AA,AAA,B,BB,BBB,C,CC,CCC,D,NR,company_age,years_since_last_amt_consolidated_operating_result,years_since_last_amt_consolidated_revenue,years_since_last_amt_operating_result,years_since_last_qty_employees,years_since_last_amt_revenue,delta_qty_employees,delta_qty_issued_credit_reports,delta_score_payment_assessment,code_legal_form_has_changed,is_discontinued_any,has_financial_calamity,mean_amt_consolidated_operating_result,mean_amt_consolidated_revenue,mean_amt_operating_result,mean_amt_revenue,mean_qty_employees,mean_qty_issued_credit_reports,mean_score_payment_assessment,mean_score_pd,qty_address_mutations_year,qty_started_names_year,qty_stopped_names_year,qty_board_changes_year,variance_qty_issued_credit_reports,variance_score_payment_assessment,variance_score_pd,has_relocated_next_year
0,0,2017-01-01,902120824.0,0.0,,0.0,False,False,NaT,NaT,,,2.0,0.0,4906.0,119.0,0.0,0.0,12.0,0.0,73.0,0.0,0.0,0.0,0.0,4704.0,38.0,,,,4.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,0.000407,0.000815,0.01141,-0.002178,0.0,0.0,0.0,0.0,0.000814,56.0,-10.691,False
1,35,2017-01-01,12.0,64.0,,0.0,False,False,2017-08-03,NaT,,,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,94.0,,,,1.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,5.0,0.25,28.0,-5.078833,1.0,0.0,0.0,0.0,0.204545,336.0,-60.946,False
2,43,2017-01-01,14.0,64.0,,0.0,False,False,NaT,NaT,,,12.0,0.0,0.0,5.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,197.0,,,,1.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,6.0,0.166667,20.0,-4.996583,0.0,0.0,0.0,0.0,0.151515,240.0,-59.959,False
3,51,2017-01-01,17.0,64.0,,0.0,False,False,2002-02-07,NaT,,,12.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,1.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,1.0,0.0,20.0,-5.0145,0.0,0.0,0.0,0.0,0.0,240.0,-60.174,False
4,94,2017-01-01,23.0,46.0,,0.0,False,False,NaT,NaT,,,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,103.0,,,,,,0.0,0.0,0.0,False,True,False,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,348.0,0.0,False
5,140,2017-01-01,47.0,70.0,,0.0,False,False,NaT,NaT,,,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,156.0,,,,11.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,1.0,0.0,20.0,-5.121,0.0,0.0,0.0,0.0,0.0,240.0,-61.452,False
6,175,2017-01-01,53.0,47.0,,0.0,False,False,NaT,NaT,,,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,,,,,11.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,3.0,0.0,20.0,-5.522,0.0,0.0,0.0,0.0,0.0,240.0,-66.264,False
7,183,2017-01-01,54.0,64.0,,0.0,False,False,NaT,NaT,,,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,39.0,,,,1.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,1.0,0.0,20.0,-4.997417,0.0,0.0,0.0,0.0,0.0,240.0,-59.969,False
8,248,2017-01-01,63.0,68.0,,0.0,False,False,NaT,NaT,,,12.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,1.0,0.0,0.0,99.0,,,,1.0,,0.0,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,4.0,0.0,20.0,-5.045,0.0,0.0,0.0,2.0,0.0,240.0,-60.54,False
9,272,2017-01-01,68.0,13.0,,0.0,False,False,2006-01-05,NaT,inf,,0.0,3.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,57.0,,,,1.0,,0.0,-1.0,0.0,False,False,False,0.0,0.0,1.60715e-319,0.0,1.0,0.25,21.0,-4.72675,0.0,0.0,0.0,0.0,0.204545,252.0,-56.721,False
