In [None]:
import pandas as pd
import numpy as np
from typing import List, Literal
from scipy import stats

In [None]:
# Load the preprocessed dataset and discard the columns that are not used
# anywhere below, in a single chained expression.
data = (
    pd.read_csv('daxuly_v3.csv')
    .drop(columns=['Unnamed: 0', 'mssv', 'soquyetdinh', 'ngaycapvb'])
)
# data.head()

In [None]:
# Print per-column dtype and non-null counts to see which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7826 entries, 0 to 7825
Data columns (total 55 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   namsinh         7826 non-null   float64
 1   gioitinh        7826 non-null   float64
 2   noisinh         7826 non-null   object 
 3   lopsh           7826 non-null   object 
 4   khoa            7826 non-null   object 
 5   hedt            7826 non-null   object 
 6   khoahoc         7826 non-null   float64
 7   chuyennganh2    7826 non-null   object 
 8   tinhtrang       7826 non-null   float64
 9   diachi_tinhtp   7824 non-null   object 
 10  diemnamhoc1     7798 non-null   float64
 11  diemnamhoc2     7407 non-null   float64
 12  diemnamhoc3     7087 non-null   float64
 13  diemnamhoc4     6788 non-null   float64
 14  diemnamhoc5     3322 non-null   float64
 15  diemnamhoc6     787 non-null    float64
 16  diem_3_5        6756 non-null   float64
 17  dtb_toankhoa    7801 non-null   f

In [None]:
def cramers_v(confusion_matrix):
    """Compute Cramér's V for a two-way contingency table.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where chi2 is the statistic
    returned by ``scipy.stats.chi2_contingency`` and n is the total count.

    Parameters
    ----------
    confusion_matrix : array-like, 2-D
        Observed frequencies. Accepts a NumPy array, a pandas DataFrame or a
        nested list; it is converted with ``np.asarray`` so that ``sum()``
        always yields the grand total (a DataFrame's ``sum()`` would instead
        return per-column totals).

    Returns
    -------
    float
        Cramér's V, or ``nan`` when the table has a single row or a single
        column (the statistic is undefined there).

    Raises
    ------
    ValueError
        If the input is not 2-D, or if ``chi2_contingency`` rejects the table.

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so SciPy's Yates
    continuity correction is applied to 2x2 tables.
    """
    table = np.asarray(confusion_matrix)
    if table.ndim != 2:
        raise ValueError(
            f"confusion_matrix must be 2-dimensional, got {table.ndim} dimension(s)"
        )

    chi2 = stats.chi2_contingency(table)[0]
    n = table.sum()
    k1, k2 = table.shape
    min_dim = min(k1, k2) - 1
    if min_dim == 0:
        # Only one row or one column: the denominator is zero, so report the
        # statistic as undefined instead of triggering a divide-by-zero warning.
        return float('nan')
    return float(np.sqrt(chi2 / (n * min_dim)))

In [None]:
from sklearn.feature_selection import f_classif #thư viện tính anova của sklearn

def Numeric_check(df:pd.DataFrame, cols, target):
    """Score numeric features against a categorical target with ANOVA F-tests.

    Missing values in each feature column are replaced by that column's
    median on a copy of ``df`` (the caller's frame is left untouched), then
    ``sklearn.feature_selection.f_classif`` is run on the filled features.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the feature columns and the target column.
    cols : sequence of column labels
        Feature columns to test.
    target : column label
        Column holding the class labels.

    Returns
    -------
    pd.DataFrame
        One row per feature with columns 'Feature', 'F-statistic', 'p-value'.
    """
    def _fill_with_median(series):
        # Impute NaN with the column's own median.
        return series.fillna(series.median())

    work = df.copy()
    work[cols] = work[cols].apply(_fill_with_median)

    f_values, p_values = f_classif(work[cols], work[target])
    return pd.DataFrame(
        {'Feature': cols, 'F-statistic': f_values, 'p-value': p_values}
    )

In [None]:
def Categories_check(df:pd.DataFrame, cols, target) -> dict:
    """Measure the association of categorical features with the target.

    For every column in ``cols`` a contingency table against ``target`` is
    built with ``pd.crosstab`` and passed (as a NumPy array) to ``cramers_v``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the feature columns and the target column.
    cols : iterable of column labels
        Categorical feature columns to evaluate.
    target : column label
        Column holding the class labels.

    Returns
    -------
    dict
        Maps each feature name to its Cramér's V value.
    """
    snapshot = df.copy()
    labels = snapshot[target]
    return {
        name: cramers_v(pd.crosstab(snapshot[name], labels).values)
        for name in cols
    }

In [None]:
def TimBienPhanLoai(df:pd.DataFrame, target: str, type: Literal['numeric', 'categories']):
    """Rank features of one kind by their association with ``target``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the features and the target column.
    target : str
        Name of the target (class label) column.
    type : {'numeric', 'categories'}
        Which group of features to evaluate:
        'numeric' selects every non-object column and delegates to
        ``Numeric_check``; 'categories' selects every object-dtype column and
        delegates to ``Categories_check``.

    Returns
    -------
    pd.DataFrame or dict
        Whatever the selected checker returns.

    Raises
    ------
    ValueError
        If ``type`` is neither 'numeric' nor 'categories'.
    """
    if type == 'numeric':
        # errors='ignore': the target is removed from the features when it is
        # itself numeric, and nothing happens when it is not in this group.
        numeric_cols = (
            df.select_dtypes(exclude='object').columns.drop(target, errors='ignore')
        )
        return Numeric_check(df, numeric_cols, target)

    if type == 'categories':
        # errors='ignore': a non-object target is simply absent from this
        # group, which must not raise KeyError.
        cate_cols = (
            df.select_dtypes(include='object').columns.drop(target, errors='ignore')
        )
        return Categories_check(df, cate_cols, target)

    # Fail loudly instead of printing and returning None, which would only
    # surface later as an unrelated error on the missing result.
    raise ValueError(f"type must be 'numeric' or 'categories', got {type!r}")

In [None]:
# Evaluate the object-dtype columns against the 'label' target; the result is
# a dict mapping each column name to its Cramér's V.
cate_result = TimBienPhanLoai(data,'label','categories')
# NOTE(review): leftover snippet below; df_num is only assigned in a later cell.
# df_num.sort_values(by='p-value', ascending=False)

In [None]:
# Show the categorical features ordered from strongest to weakest association.
{
    feature: cate_result[feature]
    for feature in sorted(cate_result, key=cate_result.get, reverse=True)
}

{'ghichu': 0.5599534336631046,
 'lopsh': 0.4323229702444928,
 'tienganh': 0.3809301637557494,
 'chuyennganh2': 0.18040669057567552,
 'hedt': 0.1613653883105353,
 'dien_tt': 0.13396704737670412,
 'khoa': 0.09008849421557732,
 'noisinh': 0.037289139838560165,
 'diachi_tinhtp': 0.03673167502720362}

In [None]:
# Run the ANOVA F-test for every non-object column against the 'label' target.
df_num = TimBienPhanLoai(data,'label','numeric')
# Display the features with the largest F-statistic first.
df_num.sort_values(by='F-statistic', ascending=False)

Unnamed: 0,Feature,F-statistic,p-value
41,socc_tienganh,4137.306959,0.0
2,khoahoc,2248.542589,0.0
0,namsinh,1943.004209,0.0
44,tb_drl,575.023905,0.0
14,drlnam1,455.317656,0.0
43,tc_yeucau,432.124453,0.0
37,tctichluy4,368.201986,0.0
16,drlnam3,330.040697,0.0
15,drlnam2,319.40647,0.0
3,tinhtrang,311.789281,0.0


In [None]:
# num = ['namsinh', 'gioitinh', 'khoahoc', 'tinhtrang', 'diemnamhoc1',
#        'diemnamhoc2', 'diemnamhoc3', 'diemnamhoc4', 'diemnamhoc5',
#        'diemnamhoc6', 'diem_3_5', 'dtb_toankhoa', 'dtb_tichluy',
#        'sotc_tichluy', 'drlnam1', 'drlnam2', 'drlnam3', 'drlnam4', 'drlnam5',
#        'drlnam6', 'drl_3_5', 'diem_tt', 'lop12_matinh', 'lop12_matruong',
#        'sotc_rot1', 'sotc_rot2', 'sotc_rot3', 'sotc_rot4', 'sotc_rot5',
#        'sotc_rot6', 'sotc_rot7', 'sotc_rot8', 'tc_rot_tong', 'rotmon_3_5',
#        'tctichluy1', 'tctichluy2', 'tctichluy3', 'tctichluy4', 'tctichluy5',
#        'tctichluy6', 'tc_tichluy_3_5', 'socc_tienganh', 'canhcao', 'tc_yeucau',
#        'du_dktn', 'tb_drl']

# cat = ['noisinh', 'lopsh', 'khoa', 'hedt',
#        'chuyennganh2', 'diachi_tinhtp', 'ghichu', 'dien_tt', 'tienganh']