In [1]:
!pip install openpyxl



In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

In [22]:
file_path = "data/ESG_raw_data_07_02.xlsx"
file_sect = "data/secteurs.csv"
corresp_secteurs = pd.read_csv(file_sect, sep=";")
data = pd.read_excel(file_path)
data = data.join(corresp_secteurs.set_index('primary_industry'), on = "primary_industry")

In [23]:
secteurs = pd.unique(data["secteur"])

In [24]:
variables_env = ["scope_1", "scope_2", "scope_3", "waste_production", "waste_recycling", 
                 "water_consumption", "water_withdrawal", "energy_consumption"]

data = data.dropna(subset = ["employees"])
data = data.loc[data["employees"] != 0]
for v in variables_env :
    data[v] = data[v]/data["employees"]

In [40]:
variables_num = ["market_cap", "employees", "revenue", "scope_1", "scope_2", "scope_3",
                         "waste_production", "waste_recycling", "water_consumption",
                         "water_withdrawal", "energy_consumption", "hours_of_training",
                         "gender_pay_gap", "independent_board_members_percentage",
                         "legal_costs_paid_for_controversies", "ceo_compensation"]

for v in variables_num :
    data[v] = pd.to_numeric(data[v])

## Modèle Tobit de type 2

In [69]:
# Créer des variables dummies pour les variables catégorielles
df = pd.get_dummies(data, columns=['secteur', 'region'])
df = df.drop(["region_United States and Canada", "secteur_Communication Services"], axis = 1)

# Définir la liste des variables explicatives
column_names = df.columns
fixed_column_names = [name.replace(" ", "_").replace("_/", "").replace(",", "") for name in column_names]
df.columns = fixed_column_names

variables = df.columns[11:]

# Définir la variable "observé"
df['is_observed'] = ~df['gender_pay_gap'].isna()

In [70]:
print(variables)

Index(['revenue', 'scope_1', 'scope_2', 'scope_3', 'waste_production',
       'waste_recycling', 'water_consumption', 'water_withdrawal',
       'energy_consumption', 'hours_of_training',
       'independent_board_members_percentage',
       'legal_costs_paid_for_controversies', 'ceo_compensation',
       'gender_pay_gap', 'secteur_Consumer_Discretionary',
       'secteur_Consumer_Staples', 'secteur_Energy', 'secteur_Financials',
       'secteur_Health_Care', 'secteur_Industrials_capital_goods',
       'secteur_Industrials_commercial_and_transportation',
       'secteur_Information_Technology', 'secteur_Materials',
       'secteur_Real_Estate', 'secteur_Utilities', 'region_Africa_Middle_East',
       'region_Asia_Pacific', 'region_Europe',
       'region_Latin_America_and_Caribbean'],
      dtype='object')


In [58]:
df = df.dropna(subset=variables)

In [72]:
df["gender_pay_gap"] = df["gender_pay_gap"].fillna(df["gender_pay_gap"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["gender_pay_gap"] = df["gender_pay_gap"].fillna(df["gender_pay_gap"].median(), inplace=True)


In [53]:
columns_to_fill = df.columns[11:27] 

# Remplir les valeurs manquantes avec la médiane pour ces colonnes
df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].median(), inplace=True)

  df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].median(), inplace=True)


In [73]:
# Créer un modèle probit pour modéliser le mécanisme de sélection
selection_eq = 'is_observed ~ ' + ' + '.join(variables)

# Créer un modèle de régression pour modéliser le résultat
outcome_eq = 'gender_pay_gap ~ ' + ' + '.join(variables)

# Modèle probit pour la censure
probit_model = sm.Probit.from_formula(selection_eq, df)
selection_results = probit_model.fit()

print(selection_results.summary())

# Utilisation d'un modèle IV (Instrumental Variable) pour la deuxième étape (régression)
iv2sls = IV2SLS.from_formula(outcome_eq, df,
                            endog='gender_pay_gap',
                            instruments='ceo_compensation')

iv_results = iv2sls.fit()

print(iv_results.summary())

ValueError: negative dimensions are not allowed