# Setting up

## Modules

In [1]:
# Add the ../modules directory to the import search path so that the
# project-local module imported in the next cell can be found.
import sys
sys.path.append("../modules")

In [2]:
# NOTE(review): the wildcard import hides where names come from. Later cells use
# Cleaner_Merger, pd, columns_targets, columns_features and fs without defining
# them in this notebook — presumably they are supplied here; TODO confirm and
# replace with explicit imports.
from cleaning_merging import *

In [3]:
from datetime import datetime
from IPython.display import HTML

## Data gathering variables

In [4]:
# Reference date of the dataset; passed to get_features() and get_targets() below.
date_dataset = datetime.strptime("2018-01-01", '%Y-%m-%d')

### For a loop

In [5]:
# Reference dates to iterate over: 1 January of each year from 2018 back to 2013.
dates = [
    "2018-01-01",
    "2017-01-01",
    "2016-01-01",
    "2015-01-01",
    "2014-01-01",
    "2013-01-01",
]
# The same dates parsed into datetime objects, in the same order.
list_date_dataset = [datetime.strptime(date_str, '%Y-%m-%d') for date_str in dates]

## Project constants

In [6]:
# Settings handed to Cleaner_Merger when it is instantiated below.
project = 'graydon-moving-indicator'  # project identifier (presumably the cloud project — TODO confirm)
bucket_name = 'graydon-data'  # name of the storage bucket
dir_input_data = '01_input'  # input directory; the recorded log shows files read from paths starting with it
dir_output_data = '02_cleaned'  # presumably where cleaned output is written — TODO confirm

# Gathering data

Instantiate the object that cleans and merges the data:

In [7]:
# Build the Cleaner_Merger that the cells below use to fetch features and targets.
clean_merge = Cleaner_Merger(project, bucket_name, dir_input_data, dir_output_data)

## Features

In [8]:
# Fetch the feature data for the reference date.
# NOTE(review): the saved output of this cell ends in
# "TypeError: unorderable types: int() > str()", so df_features was not created
# in that run; the cells below depend on this being fixed first.
df_features = clean_merge.get_features(date_dataset)

Read 01_input/2017/modelling_2017-01-01_2017-01-31.CSV with 5000 rows and 57 columns
After removing sole proprietors there are 4411 rows are left
The number of rows so far by adding 01_input/2017/modelling_2017-01-01_2017-01-31.CSV : 4411
Read 01_input/2017/modelling_2017-02-01_2017-02-28.CSV with 5000 rows and 57 columns
After removing sole proprietors there are 4412 rows are left
The number of rows so far by adding 01_input/2017/modelling_2017-02-01_2017-02-28.CSV : 8823
Read 01_input/2017/modelling_2017-03-01_2017-03-31.CSV with 5000 rows and 57 columns
After removing sole proprietors there are 4414 rows are left
The number of rows so far by adding 01_input/2017/modelling_2017-03-01_2017-03-31.CSV : 13237
Read 01_input/2017/modelling_2017-04-01_2017-04-30.CSV with 5000 rows and 57 columns
After removing sole proprietors there are 4414 rows are left
The number of rows so far by adding 01_input/2017/modelling_2017-04-01_2017-04-30.CSV : 17651
Read 01_input/2017/modelling_2017-05-01_20

TypeError: unorderable types: int() > str()

Example of the data:

In [None]:
# Show the first rows of the feature data.
df_features.head()

Number of company, branch, month combinations

In [None]:
# shape[0] is the number of rows in the feature data.
df_features.shape[0]

## Target

In [None]:
# Fetch the target data for the reference date.
# NOTE(review): columns_targets is not defined in any visible cell of this
# notebook — presumably supplied by the wildcard import from cleaning_merging;
# TODO confirm.
df_target = clean_merge.get_targets(date_dataset, columns_targets)

Example of the data:

In [None]:
# Show the first rows of the target data.
df_target.head()

The number of companies with target:

In [None]:
# shape[0] is the number of rows in the target data.
df_target.shape[0]

## Combining features and target

In [None]:
# Attach the target columns to the feature rows by company and branch id.
# The left join keeps every feature row, including those without a matching target.
df_monthly = df_features.merge(
    right=df_target,
    how='left',
    on=['id_company', 'id_branch']
)

Example of the data:

In [None]:
# Show the first rows of the merged features-and-target data.
df_monthly.head()

Number of monthly company branch data records:

In [None]:
# shape[0] is the number of rows in the merged data.
df_monthly.shape[0]

In [None]:
# Display the has_relocated column of the merged frame
# (presumably the target brought in from df_target — TODO confirm).
df_monthly['has_relocated']

## Scratch: step-by-step combining and aggregating of the month files

In [None]:
# Start of the 12-month window: the reference date of the dataset.
date_month = date_dataset

# Empty frame which will contain all independent variables.
df_months_combined = pd.DataFrame()

# Twelve consecutive month-end stamps starting at date_month ...
df_date_months = pd.DataFrame({
    'date_month': pd.date_range(date_month, periods=12, freq="M")
})
# ... each truncated to the first day of its month.
df_date_months['date_month'] = df_date_months['date_month'].values.astype('datetime64[M]')

# The file names of the required month files are collected in the next cell
# (earlier helper call: get_month_filenames(df_date_months, bucket, dir_data)).

In [None]:
# Collect the blob names of the month files that belong to the requested months.
# NOTE(review): bucket and dir_data are not defined in any visible cell of this
# notebook — TODO confirm where they come from.
month_files = []

df_date_months['year'] = df_date_months.date_month.dt.year
list_years = df_date_months['year'].unique()
# The months can span several calendar years; handle one year directory at a time.
for year in list_years:
    # Month dates (YYYY-mm-01) that fall in this year
    is_this_year = df_date_months['year'] == year
    df_year_months = df_date_months.loc[is_this_year, 'date_month']
    # All blobs stored under the year's directory
    dir_data_year = dir_data + '/' + str(year)
    list_blob = list(bucket.list_blobs(prefix=dir_data_year))
    for blob in list_blob:
        # Only blobs with 'CSV' in their name are candidates
        if 'CSV' not in blob.name:
            continue
        # A blob is selected once for every requested month date found in its name
        for month in df_year_months:
            if month.strftime("%Y-%m-%d") in blob.name:
                month_files.append(blob.name)

In [None]:
# Show the collected month file names.
month_files

In [None]:
# Clean, transform and combine the month files into one frame.
# The per-month frames are collected in a list and concatenated once: this keeps
# the cell re-runnable (nothing accumulates across runs), avoids quadratic
# copying, and does not rely on DataFrame.append, which was removed in pandas 2.0.
list_df_months = []
n_rows_combined = 0
for month_file in month_files:
    # The path is '<bucket name>/<blob name>'
    with fs.open(bucket_name + '/' + month_file) as f:
        df_month = pd.read_csv(f, sep=';', usecols=columns_features, index_col=False)
    # Drop sole proprietors (is_discontinued is not filtered on here)
    df_month = df_month[df_month['is_sole_proprietor'] == 0]
    # Normalise column names: trimmed, lower case, underscores, no parentheses
    df_month.columns = (df_month.columns.str.strip().str.lower()
                        .str.replace(' ', '_').str.replace('(', '').str.replace(')', ''))
    list_df_months.append(df_month)
    n_rows_combined += df_month.shape[0]
    print('The number of rows so far by adding ', month_file, ":", n_rows_combined)

# Fail early with a clear message instead of an obscure KeyError in the groupby below
if not list_df_months:
    raise ValueError("month_files is empty: there are no month files to combine")

df_months_combined = pd.concat(list_df_months)
df_months_combined['date_dataset'] = date_month

# Aggregate the monthly rows to one row per dataset date, company and branch:
# has_relocated -> maximum over the months, date_month -> latest month present.
df_months_combined = (
    df_months_combined
    .groupby(['date_dataset', 'id_company', 'id_branch'])
    .agg({'has_relocated': 'max', 'date_month': 'max'})
    # index=str turns the group-key index labels into strings; the key columns
    # are converted back to their proper types after reset_index()
    .rename(index=str, columns={"date_month": "date_month_last"})
    .reset_index()
)
df_months_combined['date_dataset'] = pd.to_datetime(df_months_combined['date_dataset'])
df_months_combined['id_company'] = df_months_combined['id_company'].astype(int)
df_months_combined['id_branch'] = df_months_combined['id_branch'].astype(int)

# A `return` statement is a SyntaxError outside a function; end the cell with an
# expression instead so the notebook displays a preview of the result.
df_months_combined.head()