In [1]:
import pandas as pd
import numpy as np
from scipy import stats
# Raise pandas' display limits so wide / long DataFrames are not truncated
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

# t検定

## 1.読み込み

In [2]:
# Load the raw results CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/0617_result_question.csv')

## 2.前処理

In [3]:

def preprocess(df):
    """Clean the raw frame and code the "3-1" outcome columns as 0/1.

    Steps:
      1. Drop the ``クラス``, ``人`` and ``MBTI`` columns.
      2. Drop every row with a missing value in any remaining column.
      3. Reverse-score selected items (``5 - x``) so that factors associated
         with not doing homework get the higher score.
      4. Recode each column whose name contains ``"3-1"`` to 1 when the
         answer is ``"土"`` or ``"未"`` and to 0 otherwise.
      5. Cast all numeric columns to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is left untouched; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame that keeps the original row index of the
        surviving rows.
    """
    # Items whose scale is reversed.
    change_score_columns = ["2-1", "2-3", "2-7", "3-4.国", "3-5.国", "3-4.英", "3-5.英", "3-4.数", "3-5.数"]

    # Remove unneeded columns and incomplete rows first. MBTI is already
    # dropped at this point, so every remaining column must be non-missing.
    # The explicit copy guarantees the caller's frame is never written to
    # and that the column assignments below act on an independent object.
    cleaned = df.drop(columns=['クラス', '人', 'MBTI']).dropna().copy()

    # Reverse scoring is element-wise, so doing it after the row filter gives
    # the same values as doing it before.
    cleaned[change_score_columns] = 5 - cleaned[change_score_columns]

    # Dependent variables: 1 for "土" / "未", 0 for anything else.
    for col in [name for name in cleaned.columns if "3-1" in name]:
        cleaned[col] = cleaned[col].isin(["土", "未"]).astype(int)

    # Integer conversion of every numeric column.
    numeric_cols = cleaned.select_dtypes(include=[np.number]).columns
    cleaned = cleaned.astype({name: int for name in numeric_cols})

    return cleaned
# Replace the raw frame with its preprocessed version
df = df.pipe(preprocess)

## 3.t検定

In [4]:
# Normality check (Shapiro-Wilk test)
def check_normality(data, alpha=0.05):
    """Return whether ``data`` passes the Shapiro-Wilk normality test.

    Parameters
    ----------
    data : array-like
        Sample handed unchanged to ``scipy.stats.shapiro``.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    bool
        ``True`` when the p-value is strictly greater than ``alpha``
        (normality is not rejected), ``False`` otherwise.
    """
    _, p_value = stats.shapiro(data)
    # A NaN p-value compares as False, so such a sample is reported as
    # not meeting the assumption.
    return bool(p_value > alpha)

# Equal-variance check (Bartlett's test)
def check_homoscedasticity(data1, data2, alpha=0.05):
    """Return whether two samples pass Bartlett's test for equal variances.

    Parameters
    ----------
    data1, data2 : array-like
        The two samples handed unchanged to ``scipy.stats.bartlett``.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    bool
        ``True`` when the p-value is strictly greater than ``alpha``
        (equal variances are not rejected), ``False`` otherwise.
    """
    _, p_value = stats.bartlett(data1, data2)
    # A NaN p-value compares as False, so such a pair is reported as
    # not meeting the assumption.
    return bool(p_value > alpha)

# Check normality (per group) and equality of variances for each explanatory
# column, with the rows split by the 0/1-coded '3-1.数学' column.
for col in df.columns:
    # Skip the "3-1" outcome columns. The grouping column itself is constant
    # inside each group (zero variance), which leaves Bartlett's statistic
    # undefined, and the t-test / ANOVA cells exclude these columns as well.
    if "3-1" in col:
        continue

    group1 = df.loc[df['3-1.数学'] == 1, col]
    group2 = df.loc[df['3-1.数学'] == 0, col]

    # Same report for both groups; only the label differs.
    for label, sample in (('group 1', group1), ('group 2', group2)):
        if check_normality(sample):
            print(f'{col} of {label} meets the assumption of normality.')
        else:
            print(f'{col} of {label} does not meet the assumption of normality.')

    if check_homoscedasticity(group1, group2):
        print(f'{col} meets the assumption of homoscedasticity.')
    else:
        print(f'{col} does not meet the assumption of homoscedasticity.')



学年 of group 1 does not meet the assumption of normality.
学年 of group 2 does not meet the assumption of normality.
学年 meets the assumption of homoscedasticity.
性別 of group 1 does not meet the assumption of normality.
性別 of group 2 does not meet the assumption of normality.
性別 meets the assumption of homoscedasticity.
2-1 of group 1 does not meet the assumption of normality.
2-1 of group 2 meets the assumption of normality.
2-1 meets the assumption of homoscedasticity.
2-2 of group 1 does not meet the assumption of normality.
2-2 of group 2 does not meet the assumption of normality.
2-2 meets the assumption of homoscedasticity.
2-3 of group 1 does not meet the assumption of normality.
2-3 of group 2 does not meet the assumption of normality.
2-3 meets the assumption of homoscedasticity.
2-4 of group 1 does not meet the assumption of normality.
2-4 of group 2 meets the assumption of normality.
2-4 meets the assumption of homoscedasticity.
2-5 of group 1 does not meet the assumption of nor

  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)


### t検定ビミョい　正規性や分散の均一性の仮定を満たせていない

In [8]:
# t-test for a single column
def t(df, col, subject):
    """Compare ``col`` between the two outcome groups of ``subject``.

    Rows are split by the subject's "3-1" outcome column (value 1 vs. value 0)
    and the two samples of ``col`` are compared with
    ``scipy.stats.ttest_ind(..., equal_var=False)`` (Welch's t-test).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col`` and the subject's outcome column.
    col : str
        Column whose values are compared.
    subject : str
        One of ``"国語"``, ``"英語"``, ``"数学"``.

    Returns
    -------
    tuple
        ``(t_statistic, p_value)`` as returned by ``ttest_ind``.

    Raises
    ------
    ValueError
        If ``subject`` is not a known subject.
    """
    # Subject name -> name of its "3-1" outcome column
    subject_map = {
        "国語": '3-1.国',
        "英語": '3-1.英語',
        "数学": '3-1.数学',
    }

    column_name = subject_map.get(subject)
    if column_name is None:
        # Fail with a clear message instead of hitting unbound locals below.
        raise ValueError(
            f"unknown subject {subject!r}; expected one of {list(subject_map)}"
        )

    outcome = df[column_name]
    sample_1 = df.loc[outcome == 1, col]
    sample_0 = df.loc[outcome == 0, col]

    # equal_var=False: do not assume the two groups share a variance.
    t_stat, p_value = stats.ttest_ind(sample_1, sample_0, equal_var=False)
    return t_stat, p_value

# Significance level for the t-tests
alpha = 0.1

# Run the t-test on every candidate column
def t_test(df, columns_to_test, subject, alpha=alpha):
    """Return the columns whose t-test p-value is below ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through to ``t``.
    columns_to_test : iterable of str
        Columns to test, in the order they should be reported.
    subject : str
        Subject name passed through to ``t``.
    alpha : float, optional
        Significance threshold. The default is the value of the module-level
        ``alpha`` at the time this function is defined, so rebinding the
        global ``alpha`` afterwards does not change the result of this test.

    Returns
    -------
    list of str
        Columns with ``p_value < alpha``; a NaN p-value never qualifies.
    """
    significant_cols = []
    for col in columns_to_test:
        _, p_val = t(df, col, subject)
        if p_val < alpha:
            significant_cols.append(col)
    return significant_cols

# Every column except the "3-1" outcome columns is a candidate
columns_to_test = [name for name in df.columns if "3-1" not in name]

# Report the significant columns subject by subject
print("有意のある列")
for s in ("国語", "数学", "英語"):
    significant = t_test(df, columns_to_test, s)
    print(f"{s} : {significant}")
    print("")

有意のある列
国語 : []

数学 : ['2-4', '2-6', '3-2.国']

英語 : ['2-3', '3-3.国']



## 4.分散分析

In [12]:
# Reload the raw results CSV so preprocessing starts from unmodified data
df = pd.read_csv(filepath_or_buffer='../data/0617_result_question.csv')
def preprocess(df):
    """Clean the raw frame and code the "3-1" outcome columns as 0/1/2.

    This redefines the earlier ``preprocess`` of this notebook; the only
    difference is the three-level outcome coding used for the ANOVA.

    Steps:
      1. Drop the ``クラス``, ``人`` and ``MBTI`` columns.
      2. Drop every row with a missing value in any remaining column.
      3. Reverse-score selected items (``5 - x``) so that factors associated
         with not doing homework get the higher score.
      4. Recode each column whose name contains ``"3-1"``:
         ``"土"`` -> 1, ``"未"`` -> 2, anything else -> 0.
      5. Cast all numeric columns to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is left untouched; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame that keeps the original row index of the
        surviving rows.
    """
    # Items whose scale is reversed.
    change_score_columns = ["2-1", "2-3", "2-7", "3-4.国", "3-5.国", "3-4.英", "3-5.英", "3-4.数", "3-5.数"]

    # Remove unneeded columns and incomplete rows first. MBTI is already
    # dropped at this point, so every remaining column must be non-missing.
    # The explicit copy guarantees the caller's frame is never written to
    # and that the column assignments below act on an independent object.
    cleaned = df.drop(columns=['クラス', '人', 'MBTI']).dropna().copy()

    # Reverse scoring is element-wise, so doing it after the row filter gives
    # the same values as doing it before.
    cleaned[change_score_columns] = 5 - cleaned[change_score_columns]

    # Dependent variables: three levels, with 0 as the fallback code.
    outcome_codes = {"土": 1, "未": 2}
    for col in [name for name in cleaned.columns if "3-1" in name]:
        cleaned[col] = cleaned[col].map(lambda answer: outcome_codes.get(answer, 0))

    # Integer conversion of every numeric column.
    numeric_cols = cleaned.select_dtypes(include=[np.number]).columns
    cleaned = cleaned.astype({name: int for name in numeric_cols})

    return cleaned
# Replace the raw frame with its preprocessed version
df = df.pipe(preprocess)

In [14]:
# One-way ANOVA for a single column
def anova(df, col, subject):
    """One-way ANOVA of ``col`` across the three outcome groups of ``subject``.

    Rows are split by the subject's "3-1" outcome column into the groups
    coded 0, 1 and 2, and the three samples of ``col`` are compared with
    ``scipy.stats.f_oneway``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col`` and the subject's outcome column.
    col : str
        Column whose values are compared.
    subject : str
        One of ``"国語"``, ``"英語"``, ``"数学"``.

    Returns
    -------
    tuple
        ``(f_statistic, p_value)`` as returned by ``f_oneway``.

    Raises
    ------
    ValueError
        If ``subject`` is not a known subject.
    """
    # Subject name -> name of its "3-1" outcome column
    subject_map = {
        "国語": '3-1.国',
        "英語": '3-1.英語',
        "数学": '3-1.数学',
    }

    column_name = subject_map.get(subject)
    if column_name is None:
        # Fail with a clear message instead of hitting unbound locals below.
        raise ValueError(
            f"unknown subject {subject!r}; expected one of {list(subject_map)}"
        )

    outcome = df[column_name]
    samples = [df.loc[outcome == level, col] for level in (0, 1, 2)]

    f_stat, p_value = stats.f_oneway(*samples)
    return f_stat, p_value

# Significance level for the ANOVA
alpha = 0.05

# Collect the columns whose p-value falls below the significance level
def anova_test(df, columns_to_test, subject, alpha=alpha):
    """Return the columns whose ANOVA p-value is below ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through to ``anova``.
    columns_to_test : iterable of str
        Columns to test, in the order they should be reported.
    subject : str
        Subject name passed through to ``anova``.
    alpha : float, optional
        Significance threshold. The default is the value of the module-level
        ``alpha`` at the time this function is defined, so rebinding the
        global ``alpha`` afterwards does not change the result of this test.

    Returns
    -------
    list of str
        Columns with ``p_value < alpha``; a NaN p-value never qualifies.
    """
    significant_cols = []
    for col in columns_to_test:
        _, p_val = anova(df, col, subject)
        if p_val < alpha:
            significant_cols.append(col)
    return significant_cols


# Every column except the "3-1" outcome columns is a candidate
columns_to_test = [name for name in df.columns if "3-1" not in name]

# Report the significant columns subject by subject
print("有意のある列")
for s in ("国語", "数学", "英語"):
    significant = anova_test(df, columns_to_test, s)
    print(f"{s} : {significant}")
    print("")

有意のある列
国語 : ['3-3.英']

数学 : ['2-4', '2-6', '3-3.英']

英語 : ['3-2.国']

