In [40]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

pd.set_option("display.max_rows", 50000)

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
merged_data = pd.read_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Fund_Settlement_Security_Stocks_Ratings.csv", parse_dates=["data_date"])

  interactivity=interactivity, compiler=compiler, result=result)


In [43]:
merged_data.shape

(17123, 1782)

In [44]:
merged_data.rename(
    columns={"iqv-s&p_domestic_long_term_issuer_credit_rating": "s&p_domestic_long_term_issuer_credit_rating", 
             "iqv-s&p_domestic_short_term_issuer_credit_rating": "s&p_domestic_short_term_issuer_credit_rating"}, inplace=True)

In [45]:
merged_data.columns

Index(['global_company_key', 'data_date', 'data_year_-_fiscal',
       'industry_format', 'level_of_consolidation_-_company_annual_descriptor',
       'population_source', 'data_format', 'ticker_symbol', 'company_name',
       'adoption_of_accounting_changes',
       ...
       'dividends_per_share_-_ex_date_-_monthly',
       'dividends_per_share_-_pay_date_-_monthly',
       'common_stock_float_shares_-_canada', 'price_-_close_-_daily',
       'price_-_high_-_daily', 'price_-_low_-_daily', 'price_-_open_-_daily',
       'price_status_code_-_daily',
       's&p_domestic_long_term_issuer_credit_rating',
       's&p_domestic_short_term_issuer_credit_rating'],
      dtype='object', length=1782)

In [46]:
merged_data["year"] = merged_data["data_date"].apply(lambda x: x.strftime("%Y"))

In [47]:
constant_columns = sorted(['price_-_close_-_daily', 'price_-_high_-_daily', 'price_-_low_-_daily', 'price_-_open_-_daily', 
                    'price_status_code_-_daily', "dividend_rate_-_monthly", 'dividends_per_share_-_ex_date_-_monthly', 
                    'dividends_per_share_-_pay_date_-_monthly', 'common_stock_float_shares_-_canada', 'iso_currency_code_-_monthly', 
                    "settlementamount", "sued", "global_company_key", "ticker_symbol", "s&p_domestic_short_term_issuer_credit_rating", 
                    "s&p_domestic_long_term_issuer_credit_rating", "year"])

In [48]:
data_ = merged_data[constant_columns]
data_.drop_duplicates(subset=["global_company_key"], keep="first", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [49]:
missing_columns = merged_data[sorted(list(set(merged_data.columns) - set(sorted(constant_columns))))].isnull().sum().reset_index().rename(columns={0: "total_missing", "index": "md_columns"})
missing_columns["percentage_missing"] = missing_columns["total_missing"].apply(lambda x: (x/merged_data.shape[0]) * 100)
missing_columns = missing_columns[missing_columns["percentage_missing"] > 50]

In [50]:
numerical_variables = list()
categorical_variables = list()

for column in merged_data[missing_columns[missing_columns["percentage_missing"] > 50]["md_columns"].unique()]:
  if merged_data[column].dtype in ["float64", "int64"]:
    numerical_variables.append(column)
    merged_data[column] = merged_data[column].apply(lambda x: round(x, 2))

  if merged_data[column].dtype == "object":
    categorical_variables.append(column)

In [51]:
merged_data.shape

(17123, 1783)

In [52]:
merged_data = merged_data.groupby(["global_company_key", "year"])[numerical_variables].agg("sum").reset_index()

In [53]:
merged_data.shape

(8724, 1490)

In [54]:
merged_data = merged_data.groupby(["global_company_key"])[numerical_variables].agg("mean").reset_index()

In [55]:
merged_data = pd.merge(merged_data, data_, on="global_company_key")
merged_data.drop(columns="year", inplace=True)

In [56]:
merged_data.shape

(2462, 1504)

In [57]:
merged_data.to_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Final_DataSet.csv", index=False)

In [58]:
merged_data["sued"].value_counts()

no     2037
yes     425
Name: sued, dtype: int64

In [59]:
merged_data.dropna(subset=["s&p_domestic_long_term_issuer_credit_rating", "s&p_domestic_short_term_issuer_credit_rating"], inplace=True)

In [60]:
categorical_variables = list()
categorical_variables_ = list()

for column in merged_data.columns:
  if merged_data[column].dtype == "object":
    categorical_variables.append(column)
    merged_data[column] = merged_data[column].astype("category")
    merged_data[column+"_"] = merged_data[column].cat.codes
    categorical_variables_.append(column+"_")

  elif merged_data[column].dtype in ["float64", "int64"]:
    merged_data[column] = merged_data[column].apply(lambda x: round(x, 5))


In [61]:
merged_data[categorical_variables].head()

Unnamed: 0,iso_currency_code_-_monthly,sued,ticker_symbol
0,USD,no,AAN
1,USD,no,ABT
2,USD,yes,AET
3,USD,no,ATRI
4,USD,yes,ACV


In [62]:
for variable in categorical_variables:
  if variable != "sued":
    stat, p, dof, expected = chi2_contingency(pd.crosstab(merged_data[variable], merged_data["sued"]))
    
    if p <= 0.05:
      print(column)
      print("Reject null hypothesis")

    else:
      print("Fail to reject null hypothesis")

Fail to reject null hypothesis
Fail to reject null hypothesis


In [63]:
from sklearn.tree import DecisionTreeRegressor

X = merged_data[[column for column in merged_data.columns if column not in ["sued", "iso_currency_code_-_monthly", "ticker_symbol", "sued_", "settlementamount"]]]
y = merged_data["sued_"]

In [64]:
tree = DecisionTreeRegressor().fit(X, y)

In [68]:
indexed_column = list()

for i in range(0, len(list(tree.feature_importances_))):
  if list(tree.feature_importances_)[i] in sorted(list(tree.feature_importances_), reverse=True):
    if i != 0:
      indexed_column.append(i)

In [69]:
sorted(list(tree.feature_importances_), reverse=True)[1000]

0.0

In [70]:
for i in range(0, len(merged_data.columns)):
  if i in indexed_column:
    print(merged_data.columns[i])

acceptances_outstanding
acceptances_outstanding_data_code
accounting_changes_-_cumulative_effect
accounting_changes_-_cumulative_effect_data_code
accounting_changes_-_cumulative_effect_footnote
accounts_payable/creditors_-_brokers,_dealers,_and_clearing_organizations
accounts_payable/creditors_-_customer
accounts_payable/creditors_-_other_-_fs
accounts_payable/creditors_-_other_-_fs_data_code
accounts_payable_-_trade_data_code
accounts_payable_-_utility
accounts_payable_-_utility_data_code
accounts_payable_-_utility_footnote
accounts_payable_and_accrued_liabilities_-_increase/(decrease)
accounts_payable_and_accrued_liabilities_-_increase/(decrease)_data_code
accounts_receivable/debtors_-_brokers,_dealers,_and_clearing_organizations
accounts_receivable/debtors_-_customer
accounts_receivable/debtors_-_customer_data_code
accounts_receivable/debtors_-_customer_footnote
accounts_receivable/debtors_-_total
accounts_receivable/debtors_-_total_data_code
accounts_receivable/debtors_-_total_foot

In [71]:
merged_data.corr()

Unnamed: 0,global_company_key,acceptances_outstanding,acceptances_outstanding_data_code,accounting_changes_-_cumulative_effect,accounting_changes_-_cumulative_effect_data_code,accounting_changes_-_cumulative_effect_footnote,"accounts_payable/creditors_-_brokers,_dealers,_and_clearing_organizations",accounts_payable/creditors_-_customer,accounts_payable/creditors_-_other_-_fs,accounts_payable/creditors_-_other_-_fs_data_code,accounts_payable_-_trade_data_code,accounts_payable_-_utility,accounts_payable_-_utility_data_code,accounts_payable_-_utility_footnote,accounts_payable_and_accrued_liabilities_-_increase/(decrease),accounts_payable_and_accrued_liabilities_-_increase/(decrease)_data_code,"accounts_receivable/debtors_-_brokers,_dealers,_and_clearing_organizations",accounts_receivable/debtors_-_customer,accounts_receivable/debtors_-_customer_data_code,accounts_receivable/debtors_-_customer_footnote,accounts_receivable/debtors_-_total,accounts_receivable/debtors_-_total_data_code,accounts_receivable/debtors_-_total_footnote,accounts_receivable_-_decrease_(increase),accounts_receivable_-_decrease_(increase)_data_code,accounts_receivable_-_trade_-_utility,accounts_receivable_-_trade_-_utility_data_code,accrued_expenses,accrued_expenses_and_deferred_income,accrued_expenses_and_deferred_income_data_code,accrued_expenses_data_code,accrued_unbilled_revenues_(balance_sheet),accrued_unbilled_revenues_(balance_sheet)_data_code,accum_other_comp_inc_-_derivatives_unrealized_gain/loss,accum_other_comp_inc_-_derivatives_unrealized_gain/loss_data_code,accum_other_comp_inc_-_min_pension_liab_adj,accum_other_comp_inc_-_min_pension_liab_adj_data_code,accum_other_comp_inc_-_min_pension_liab_adj_footnote,accum_other_comp_inc_-_other_adjustments,accum_other_comp_inc_-_unreal_g/l_ret_int_in_sec_assets,...,unrecog._tax_benefits_-_end_of_year,unrecog._tax_benefits_-_end_of_year.1,us_canadian_translation_rate,uses_of_funds_-_other,uses_of_funds_-_other_data_code,uses_of_funds_-_total,utility_-_liberalized_depreciation_code,value_added_taxes,volatility_-_assumption_(%),working_capital_(source)_-_decrease_(increase)_(cash_flow),working_capital_(source)_-_decrease_(increase)_(cash_flow)_data_code,working_capital_(use)_-_increase_(decrease)_(cash_flow),working_capital_(use)_-_increase_(decrease)_(cash_flow)_data_code,working_capital_(use)_-_increase_(decrease)_(cash_flow)_footnote,working_capital_change_-_other_-_increase/(decrease),working_capital_change_-_other_-_increase/(decrease)_data_code,working_capital_change_-_total,writedowns_after-tax,writedowns_after-tax_data_code,writedowns_basic_eps_effect,writedowns_basic_eps_effect_data_code,writedowns_diluted_eps_effect,writedowns_diluted_eps_effect_data_code,writedowns_pretax,writedowns_pretax_data_code,common_stock_float_shares_-_canada,dividend_rate_-_monthly,dividends_per_share_-_ex_date_-_monthly,dividends_per_share_-_pay_date_-_monthly,price_-_close_-_daily,price_-_high_-_daily,price_-_low_-_daily,price_-_open_-_daily,price_status_code_-_daily,s&p_domestic_long_term_issuer_credit_rating,s&p_domestic_short_term_issuer_credit_rating,settlementamount,iso_currency_code_-_monthly_,sued_,ticker_symbol_
global_company_key,1.0,,,0.026998,,,,,,,,,,,-0.081996,0.004261,,,,,,,,0.03998477,0.01259,,,-0.12069,,,0.021888,,,0.039215,-0.039892,0.100442,-0.009581,,0.008141,,...,-0.119581,0.025164,-0.008827,,,,,,0.08269,,,,,,,,,0.079546,-0.015749,0.03216,-0.022599,0.031425,-0.004882,0.076892,-0.015749,-0.041194,-0.185738,-0.177727,-0.177395,-0.116622,-0.116492,-0.115696,-0.107678,-0.03548,-0.161971,-0.128979,-0.145149,-0.006038,-0.018548,-0.023433
acceptances_outstanding,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
acceptances_outstanding_data_code,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
accounting_changes_-_cumulative_effect,0.026998,,,1.0,,,,,,,,,,,-0.00286,0.032373,,,,,,,,0.002316348,-0.006986,,,-0.004227,,,-0.01027,,,0.001206,-0.003234,0.002703,-0.002926,,0.000297,,...,-0.003227,-0.004513,-0.003064,,,,,,-0.022001,,,,,,,,,0.002931,-0.001494,0.003714,-0.000678,0.003671,-0.000866,0.002884,-0.001494,-0.001104,-0.00078,-0.003362,0.003432,-0.00594,-0.005831,-0.006007,-0.00606,-0.003272,-0.010412,-0.003392,,0.002858,-0.010507,-0.030088
accounting_changes_-_cumulative_effect_data_code,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
accounting_changes_-_cumulative_effect_footnote,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"accounts_payable/creditors_-_brokers,_dealers,_and_clearing_organizations",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
accounts_payable/creditors_-_customer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
accounts_payable/creditors_-_other_-_fs,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
accounts_payable/creditors_-_other_-_fs_data_code,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
merged_data.isnull().sum()

global_company_key                                                                        0
acceptances_outstanding                                                                   0
acceptances_outstanding_data_code                                                         0
accounting_changes_-_cumulative_effect                                                    0
accounting_changes_-_cumulative_effect_data_code                                          0
accounting_changes_-_cumulative_effect_footnote                                           0
accounts_payable/creditors_-_brokers,_dealers,_and_clearing_organizations                 0
accounts_payable/creditors_-_customer                                                     0
accounts_payable/creditors_-_other_-_fs                                                   0
accounts_payable/creditors_-_other_-_fs_data_code                                         0
accounts_payable_-_trade_data_code                                              