In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

pd.set_option("display.max_rows", 50000)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
merged_data = pd.read_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Fund_Settlement_Security_Stocks_Ratings.csv", parse_dates=["data_date"])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
merged_data.shape

(17123, 1782)

In [5]:
merged_data.rename(
    columns={"iqv-s&p_domestic_long_term_issuer_credit_rating": "s&p_domestic_long_term_issuer_credit_rating", 
             "iqv-s&p_domestic_short_term_issuer_credit_rating": "s&p_domestic_short_term_issuer_credit_rating"}, inplace=True)

In [6]:
merged_data.columns

Index(['global_company_key', 'data_date', 'data_year_-_fiscal',
       'industry_format', 'level_of_consolidation_-_company_annual_descriptor',
       'population_source', 'data_format', 'ticker_symbol', 'company_name',
       'adoption_of_accounting_changes',
       ...
       'dividends_per_share_-_ex_date_-_monthly',
       'dividends_per_share_-_pay_date_-_monthly',
       'common_stock_float_shares_-_canada', 'price_-_close_-_daily',
       'price_-_high_-_daily', 'price_-_low_-_daily', 'price_-_open_-_daily',
       'price_status_code_-_daily',
       's&p_domestic_long_term_issuer_credit_rating',
       's&p_domestic_short_term_issuer_credit_rating'],
      dtype='object', length=1782)

In [7]:
merged_data["year"] = merged_data["data_date"].apply(lambda x: x.strftime("%Y"))

In [8]:
constant_columns = sorted(['price_-_close_-_daily', 'price_-_high_-_daily', 'price_-_low_-_daily', 'price_-_open_-_daily', 
                    'price_status_code_-_daily', "dividend_rate_-_monthly", 'dividends_per_share_-_ex_date_-_monthly', 
                    'dividends_per_share_-_pay_date_-_monthly', 'common_stock_float_shares_-_canada', 'iso_currency_code_-_monthly', 
                    "settlementamount", "sued", "global_company_key", "ticker_symbol", "s&p_domestic_short_term_issuer_credit_rating", 
                    "s&p_domestic_long_term_issuer_credit_rating", "year"])

In [9]:
data_ = merged_data[constant_columns]
data_.drop_duplicates(subset=["global_company_key"], keep="first", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
missing_columns = merged_data[sorted(list(set(merged_data.columns) - set(sorted(constant_columns))))].isnull().sum().reset_index().rename(columns={0: "total_missing", "index": "md_columns"})
missing_columns["percentage_missing"] = missing_columns["total_missing"].apply(lambda x: (x/merged_data.shape[0]) * 100)
missing_columns = missing_columns[missing_columns["percentage_missing"] > 50]

In [11]:
numerical_variables = list()
categorical_variables = list()

for column in merged_data[missing_columns[missing_columns["percentage_missing"] > 50]["md_columns"].unique()]:
  if merged_data[column].dtype in ["float64", "int64"]:
    numerical_variables.append(column)
    merged_data[column] = merged_data[column].apply(lambda x: round(x, 2))

  if merged_data[column].dtype == "object":
    categorical_variables.append(column)

In [12]:
merged_data.shape

(17123, 1783)

In [13]:
merged_data = merged_data.groupby(["global_company_key", "year"])[numerical_variables].agg("sum").reset_index()

In [14]:
merged_data.shape

(8724, 1490)

In [15]:
merged_data = merged_data.groupby(["global_company_key"])[numerical_variables].agg("mean").reset_index()

In [16]:
merged_data = pd.merge(merged_data, data_, on="global_company_key")
merged_data.drop(columns="year", inplace=True)

In [17]:
merged_data.shape

(2462, 1504)

In [18]:
merged_data.to_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Final_DataSet.csv", index=False)

In [19]:
merged_data["sued"].value_counts()

no     2037
yes     425
Name: sued, dtype: int64

In [20]:
merged_data.dropna(subset=["s&p_domestic_long_term_issuer_credit_rating", "s&p_domestic_short_term_issuer_credit_rating"], inplace=True)

In [21]:
categorical_variables = list()
categorical_variables_ = list()

for column in merged_data.columns:
  if merged_data[column].dtype == "object":
    categorical_variables.append(column)
    merged_data[column] = merged_data[column].astype("category")
    merged_data[column+"_"] = merged_data[column].cat.codes
    categorical_variables_.append(column+"_")

  elif merged_data[column].dtype in ["float64", "int64"]:
    merged_data[column] = merged_data[column].apply(lambda x: round(x, 5))


In [22]:
merged_data[categorical_variables].head()

Unnamed: 0,iso_currency_code_-_monthly,sued,ticker_symbol
0,USD,no,AAN
1,USD,no,ABT
2,USD,yes,AET
3,USD,no,ATRI
4,USD,yes,ACV


In [23]:
for variable in categorical_variables:
  if variable != "sued":
    stat, p, dof, expected = chi2_contingency(pd.crosstab(merged_data[variable], merged_data["sued"]))
    
    if p <= 0.05:
      print(column)
      print("Reject null hypothesis")

    else:
      print("Fail to reject null hypothesis")

Fail to reject null hypothesis
Fail to reject null hypothesis


In [28]:
from sklearn.tree import DecisionTreeRegressor

X = merged_data[[column for column in merged_data.columns if column not in ["sued", "iso_currency_code_-_monthly", "ticker_symbol", "sued_", "settlementamount"]]]
y = merged_data["sued_"]

In [30]:
tree = DecisionTreeRegressor().fit(X, y)
list(tree.feature_importances_)

[0.0015734099742738426,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.010789123224960126,
 0.0044954570693538365,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.018365207595890543,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0025174559588381482,
 0.0,
 0.0131803343229555,
 0.0,
 0.0,
 0.0,
 0.00491910603520283,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0062545489660575096,
 0.0,
 0.002360114961410764,
 0.0,
 0.005731193308037612,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0015509822235447188,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0029762347594176593,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.014173216635001724,
 0.0,
 0.0,
 0.0,
 0.003832984245628901,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0032870851788124027,
 0.0,
 0.017700028873988916,
 0.0,
 0.0

In [27]:
merged_data.isnull().sum()

global_company_key                                                                        0
acceptances_outstanding                                                                   0
acceptances_outstanding_data_code                                                         0
accounting_changes_-_cumulative_effect                                                    0
accounting_changes_-_cumulative_effect_data_code                                          0
accounting_changes_-_cumulative_effect_footnote                                           0
accounts_payable/creditors_-_brokers,_dealers,_and_clearing_organizations                 0
accounts_payable/creditors_-_customer                                                     0
accounts_payable/creditors_-_other_-_fs                                                   0
accounts_payable/creditors_-_other_-_fs_data_code                                         0
accounts_payable_-_trade_data_code                                              