In [44]:
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif
import numpy as np

In [45]:
df = pd.read_csv("./remote_mental_health_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Employee_ID                        5000 non-null   object
 1   Age                                5000 non-null   int64 
 2   Gender                             5000 non-null   object
 3   Job_Role                           5000 non-null   object
 4   Industry                           5000 non-null   object
 5   Years_of_Experience                5000 non-null   int64 
 6   Work_Location                      5000 non-null   object
 7   Hours_Worked_Per_Week              5000 non-null   int64 
 8   Number_of_Virtual_Meetings         5000 non-null   int64 
 9   Work_Life_Balance_Rating           5000 non-null   int64 
 10  Stress_Level                       5000 non-null   object
 11  Mental_Health_Condition            3804 non-null   object
 12  Access

In [49]:
print("----Before----")
print(df["Satisfaction_with_Remote_Work"].value_counts())

df['Satisfaction_with_Remote_Work'] = df['Satisfaction_with_Remote_Work'].replace('Neutral', 'Satisfied')
print("----After----")
print(df["Satisfaction_with_Remote_Work"].value_counts())

----Before----
Satisfaction_with_Remote_Work
Unsatisfied    1677
Satisfied      1675
Neutral        1648
Name: count, dtype: int64
----After----
Satisfaction_with_Remote_Work
Satisfied      3323
Unsatisfied    1677
Name: count, dtype: int64


In [53]:
# 設定目標變數
target_var = 'Satisfaction_with_Remote_Work'

# 初始化結果表格
results = []

# 對每個特徵進行卡方檢定（僅限於類別型變數）
for col in df.columns:
    if col != target_var and df[col].dtype == 'object':  # 排除目標變數和非類別型變數
        # 建立列聯表
        contingency_table = pd.crosstab(df[col], df[target_var])
        
        # 執行卡方檢定
        chi2, p, dof, _ = chi2_contingency(contingency_table)
        
        # 收集檢定結果
        results.append({
            'Feature': col,
            'Chi2 Statistic': chi2,
            'p-value': p,
            'Degrees of Freedom': dof,
            'Significant': p < 0.05  # 判斷是否顯著
        })

# 將結果轉換為 DataFrame
chi2_results = pd.DataFrame(results)

# 按照 p-value 排序
chi2_results = chi2_results.sort_values(by='p-value', ascending=True)

# 印出結果
print(chi2_results)

                              Feature  Chi2 Statistic   p-value  \
9                   Physical_Activity        1.395995  0.237395   
1                              Gender        3.496855  0.321170   
10                      Sleep_Quality        1.902950  0.386171   
0                         Employee_ID     5000.000000  0.493351   
6             Mental_Health_Condition        1.330426  0.514164   
4                       Work_Location        1.180480  0.554194   
11                             Region        2.814612  0.728541   
2                            Job_Role        2.979534  0.811411   
8                 Productivity_Change        0.321107  0.851672   
5                        Stress_Level        0.303807  0.859071   
7   Access_to_Mental_Health_Resources        0.011328  0.915239   
3                            Industry        0.970336  0.986712   

    Degrees of Freedom  Significant  
9                    1        False  
1                    3        False  
10            

In [51]:
# 計算信息增益
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features.extend(df.select_dtypes(include=["int64"]).columns.tolist())
target_var = 'Satisfaction_with_Remote_Work'

X = pd.get_dummies(df[categorical_features])  # 將類別型變數轉為數值型
y = df[target_var]

info_gain = mutual_info_classif(X, y, discrete_features=True)

# 計算分裂信息
def split_information(data, feature):
    values = data[feature].value_counts(normalize=True)
    return -np.sum(values * np.log2(values + 1e-9))  # 避免 log(0)

split_info = [split_information(df, col) for col in categorical_features]

# 計算 Gain Ratio
gain_ratios = [ig / si if si != 0 else 0 for ig, si in zip(info_gain, split_info)]

# 結果整理
gain_ratio_df = pd.DataFrame({
    'Feature': categorical_features,
    'Gain Ratio': gain_ratios
}).sort_values(by='Gain Ratio', ascending=False)

print(gain_ratio_df)

                              Feature  Gain Ratio
1                              Gender    0.001831
2                            Job_Role    0.001247
6             Mental_Health_Condition    0.000429
4                       Work_Location    0.000417
0                         Employee_ID    0.000306
3                            Industry    0.000296
5                        Stress_Level    0.000251
9       Satisfaction_with_Remote_Work    0.000237
10                  Physical_Activity    0.000219
7   Access_to_Mental_Health_Resources    0.000219
11                      Sleep_Quality    0.000138
12                             Region    0.000085
8                 Productivity_Change    0.000052
15              Hours_Worked_Per_Week    0.000041
18            Social_Isolation_Rating    0.000035
19    Company_Support_for_Remote_Work    0.000035
17           Work_Life_Balance_Rating    0.000035
16         Number_of_Virtual_Meetings    0.000020
14                Years_of_Experience    0.000016
