In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [102]:
from google_trans_new import google_translator   

## Reading in the data

In [71]:
# Load the status/forms table; later cells read its id, status, remark,
# default_date and company-form columns.
status = pd.read_csv("../data/financials/status_forms_test1000.csv")

In [72]:
# Load the financial statements table (one row per company and year, judging by
# the preview shown further down).
financials = pd.read_csv("../data/financials/fin_forms_test1000.csv")

In [73]:
# Placeholder columns that add_company_status fills in per company.
# Column order is kept exactly as before because later cells select columns
# by position.
financials['default'] = 0          # 1 marks the statement closest before default
financials['date_diff'] = np.nan   # years between that statement and the default

# Text-valued placeholders (status info first, then the company-form info).
# They start as object dtype so that strings can be written into them later
# without relying on an implicit float -> object upcast.
# NOTE(review): registered_date is assumed to be stored as text - confirm.
for col in ['default_date', 'status', 'actual_status', 'remark',
            'company_form', 'county', 'industry', 'operation', 'registered_date']:
    financials[col] = pd.Series(np.nan, index=financials.index, dtype=object)

## Removing the ids from status that do not appear in the financials dataset and vice versa

In [74]:
# Keep only companies present in both tables: first trim status to the ids
# found in financials, then trim financials to the ids left in status.
status = status.loc[status['id'].isin(financials['id'])]
financials = financials.loc[financials['id'].isin(status['id'])]

## Identifying companies that are inactive (`inaktivt`) but not bankrupt

In [75]:
# Row labels of status entries that are 'inaktivt' and carry no remark.
# Both conditions are combined into a single mask so it lines up with `status`
# row for row; filtering twice in a chain forces pandas to reindex the second mask.
indexnames = status[(status['status'] == 'inaktivt') & status['remark'].isna()].index

In [76]:
# Drop the companies that are inactive but not bankrupt from the status table,
# then keep only the financial rows whose company still has a status entry.
status = status.drop(index=indexnames)
financials = financials.loc[financials['id'].isin(status['id'])]

## Merging status and financial data

In [77]:
# Preview the first rows before the status and form columns are filled in.
financials.head()

Unnamed: 0,company,id,year,date,net_sales,other_sales,op_profit_ebit,prof_after_net_fin_items,results,sub_unpaid_cap,...,date_diff,default_date,status,actual_status,remark,company_form,county,industry,operation,registered_date
0,Lily Properties AB,5590655865,2019,2019-12,0,-,-44,-44,-44,0,...,,,,,,,,,,
1,Lily Properties AB,5590655865,2018,2018-12,0,-,-72,-72,-72,0,...,,,,,,,,,,
2,Lily Properties AB,5590655865,2017,2017-12,28,-,-279,-279,-279,0,...,,,,,,,,,,
3,Lily Properties AB,5590655865,2016,2016-12,0,-,-226,749,749,0,...,,,,,,,,,,
4,Fogdetorps El Aktiebolag,5564709904,2019,2019-12,0,-,-62,7938,7938,0,...,,,,,,,,,,


In [78]:
def add_company_status(company_id, financials_df=None, status_df=None):
    """Return one company's financial rows with status and form info filled in.

    For a company whose status table entry contains 'inaktivt', the statement
    dated closest to, and not after, the default date is flagged with
    ``default = 1`` and receives the default details. Every row receives the
    company's form attributes.

    Parameters
    ----------
    company_id
        Value of the ``id`` column identifying the company.
    financials_df : pandas.DataFrame, optional
        Financial statements. Defaults to the notebook-level ``financials``.
        Must contain ``id``, ``date`` ('%Y-%m') and the placeholder columns
        ``default``, ``date_diff``, ``default_date``, ``status``,
        ``actual_status`` and ``remark``.
    status_df : pandas.DataFrame, optional
        Status/forms table. Defaults to the notebook-level ``status``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the company's rows; the input frame is not
        modified. If the company has no status entry the rows are returned
        unchanged. If every statement is dated after the default date, no row
        is flagged.
    """
    fin = financials if financials_df is None else financials_df
    stat = status if status_df is None else status_df

    # Explicit copy: the writes below must land in this frame, not in a
    # temporary produced by chained indexing.
    company = fin[fin['id'] == company_id].copy()
    company_status = stat[stat['id'] == company_id]

    if company_status.empty:
        return company

    # add the default info
    if company_status['status'].eq('inaktivt').any():
        # The default date does not change per statement, so parse it once.
        default_date = datetime.strptime(company_status['default_date'].iloc[0], '%Y-%m-%d')

        closest_row = None
        closest_days = None
        for row, statement_date in enumerate(company['date']):
            days = (default_date - datetime.strptime(statement_date, '%Y-%m')).days
            if days < 0:
                # financials reported after the bankruptcy (should not happen)
                continue
            if closest_days is None or days < closest_days:
                closest_row, closest_days = row, days

        # Only flag a row when a statement on or before the default date exists;
        # otherwise a placeholder distance would be recorded as a real value.
        if closest_row is not None:
            text_values = {
                'default_date': company_status['default_date'].iloc[0],
                'status': company_status['status'].iloc[0],
                'actual_status': company_status['actual_status'].iloc[0],
                'remark': company_status['remark'].iloc[0],
            }
            company.iloc[closest_row, company.columns.get_loc('default')] = 1
            company.iloc[closest_row, company.columns.get_loc('date_diff')] = round(closest_days / 365, 3)
            for col, value in text_values.items():
                # The placeholders may be float NaN columns; make them object
                # so a string can be stored without an implicit upcast.
                company[col] = company[col].astype(object)
                company.iloc[closest_row, company.columns.get_loc(col)] = value

    # add the forms info (taken from the company's first status row and
    # broadcast to every statement)
    for col in ('company_form', 'county', 'industry', 'operation', 'registered_date'):
        company[col] = company_status[col].iloc[0]

    return company

In [79]:
# Fill in the status and form columns one company at a time, writing each
# company's enriched rows back over its original rows.
company_ids = financials['id'].unique()
for company_id in company_ids:
    is_company = financials['id'] == company_id
    financials[is_company] = add_company_status(company_id)

## Replacing the dashes with NaNs

In [80]:
# A lone dash marks a missing value in the raw data. Replace on the whole frame
# and re-bind the result: calling replace(..., inplace=True) on a column
# selection modifies a temporary under pandas Copy-on-Write and leaves the
# frame untouched.
financials = financials.replace("-", np.nan)

In [81]:
# Row and column count after the cleaning steps so far.
financials.shape

(977, 48)

## Remove the percentage signs

In [82]:
# Strip a trailing '%' from the columns at positions 30..36.
# rstrip removes only percent signs, so a value that has no '%' keeps its last
# digit instead of being truncated.
# NOTE(review): assumes positions 30..36 are exactly the percentage columns - confirm.
for col in financials.columns[30:37]:
    financials[col] = financials[col].str.rstrip('%')

## Converting the object datatypes to floats

In [89]:
# Cast the columns at positions 4..36 from text to 64-bit floats, one at a time.
float_columns = list(financials.columns[4:37])
for name in float_columns:
    financials[name] = financials[name].astype('float64')

## Dividing percentage columns by 100

In [91]:
# Turn the percentage columns (positions 30..36) into fractions.
pct_slice = slice(30, 37)
financials.iloc[:, pct_slice] = financials.iloc[:, pct_slice].div(100)

## Adding active status so it matches the 0s and 1s

In [96]:
# Rows that were not flagged as a default have no status yet; label them 'aktivt'.
# Assign the result back: an in-place replace on a column selection modifies a
# temporary under pandas Copy-on-Write and leaves the frame untouched.
financials['status'] = financials['status'].fillna('aktivt')

## Translating Swedish to English

In [107]:
def translate_columns(columns, frame=None, translator=None):
    """Translate the Swedish text in the given columns to English, in place.

    Each distinct text value is translated exactly once and the replacement is
    applied only to ``columns``; other columns are left untouched even if they
    contain the same text.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns whose values should be translated.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``financials``.
    translator : optional
        Object exposing ``translate(text, lang_tgt=...)``. Defaults to a new
        ``google_translator()``.

    Returns
    -------
    None
        The frame is modified in place; each translation is printed.
    """
    frame = financials if frame is None else frame
    if translator is None:
        translator = google_translator()
    columns = list(columns)

    # get all the distinct words that need to be translated (first-seen order);
    # non-text entries such as NaN are skipped
    swedish_words = []
    seen = set()
    for col in columns:
        for word in frame[col].unique():
            if not isinstance(word, str) or word in seen:
                continue
            seen.add(word)
            swedish_words.append(word)

    # look up the English counterpart of every word, one translator call each
    translations = {}
    for word in swedish_words:
        if word == 'aktivt':
            # google did not translate this right: aktivt ----> actively
            print(word + " ------> " + 'active')
            translations[word] = 'active'
        else:
            english = translator.translate(word, lang_tgt='en')
            print(word + " ------> " + english)
            translations[word] = english.strip()

    # apply all replacements in one pass per requested column
    if translations:
        for col in columns:
            frame[col] = frame[col].replace(translations)

In [108]:
# Columns holding Swedish text that should be translated to English.
text_columns = [
    'status',
    'actual_status',
    'remark',
    'company_form',
    'county',
    'industry',
    'operation',
]
translate_columns(text_columns)

aktivt ------> actively 
inaktivt ------> inactive 
inaktivt ------> inactive 
Registrerad ------> Registered 
Konkurs avslutad ------> Bankruptcy completed 
Konkurs inledd ------> Bankruptcy initiated 
Aktiebolag ------> Limited company 
Stockholms län ------> Stockholm County 
Uppsala län ------> Uppsala county 
Kronobergs län ------> Kronoberg County 


KeyboardInterrupt: 