In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import re

from sklearn.linear_model import LinearRegression
import seaborn
import pyperclip


In [10]:
pd.set_option('display.max_columns', 50)

### ここだけ手動で設定。変更がないか毎度確認すること

In [11]:
subject_name_list = ["kumakura","kim","souma","fujii","tubota","toki"]
phase_name_list = ["rest", "practice", "boredom", "flow", "ultra", "overload"]
target_columns = ["bpm", "ibi", "lf", "hf","lf/hf","kubios_lf","kubios_hf","kubios_lf/hf","sdsd","rmssd","pnn20","pnn50","concentration_FC3","concentration_FC4","concentration_FCz","concentration_mean","FC3","FC4","FCz","mean_3ch","lf_diff", "hf_diff","lfhf_diff","hr_mad","sd1","sd2","s","sd1/sd2","breathingrate","lf_nu","hf_nu"]
target_phases = ["boredom","flow","ultra","overload"]

folder_name = "999_2_for_paper"
pyperclip.copy(f"{folder_name}")


### 分析対象のファイルのパスを正規表現で取得

In [12]:
pathes = glob.glob("/Users/miyakooti/repositories/arai_MATLAB_program/csv/?_*/HRV_and_PLI.csv")
pathes.sort()
pathes

['/Users/miyakooti/repositories/arai_MATLAB_program/csv/0_kumakura/HRV_and_PLI.csv',
 '/Users/miyakooti/repositories/arai_MATLAB_program/csv/1_kim/HRV_and_PLI.csv',
 '/Users/miyakooti/repositories/arai_MATLAB_program/csv/2_souma/HRV_and_PLI.csv',
 '/Users/miyakooti/repositories/arai_MATLAB_program/csv/3_fujii/HRV_and_PLI.csv',
 '/Users/miyakooti/repositories/arai_MATLAB_program/csv/4_tubota/HRV_and_PLI.csv',
 '/Users/miyakooti/repositories/arai_MATLAB_program/csv/5_toki/HRV_and_PLI.csv']

# StatsModelsを利用した分析

- [公式ドキュメント](https://www.statsmodels.org/stable/index.html#citation)
- [github](https://github.com/statsmodels/statsmodels)

Seabold, Skipper, and Josef Perktold. “statsmodels: Econometric and statistical modeling with python.” Proceedings of the 9th Python in Science Conference. 2010.

In [13]:
import statsmodels.api as sm

### フェーズごとの分析

In [14]:
for i,target_phase in enumerate(target_phases):
    
    for j,path in enumerate(pathes):

        df = pd.read_csv(path,index_col=0).fillna(0)
        # column
        # row
        df = df.loc[[target_phase]] # seriesとして取り出したいときはこっち
        if j == 0:
            flow_dataset = df
        else:
            flow_dataset = pd.concat([flow_dataset, df], axis=0)
    
    export_data = {    
        "target": target_columns,
        "linear-p": [],
        "linear-rsquared": [],
        "linear-rsquared_adj": [],
        "linear-coef": [],
        "nonlinear-p": [],
        "nonlinear-rsquared": [],
        "nonlinear-rsquared_adj": [],
        "nonlinear-coef": [],
    }

    for target_column in target_columns:
        
        x = flow_dataset[target_column]
        y = flow_dataset[["questionnaire_average"]]
        
        for k in ["linear", "nonlinear"]:
            if k=="linear":
                # 線形単回帰
                X = sm.add_constant(x)
                model = sm.OLS(y, X)
                results = model.fit()

                export_data["linear-p"].append(results.pvalues[target_column])
                export_data["linear-rsquared"].append(results.rsquared)
                export_data["linear-rsquared_adj"].append(results.rsquared_adj)
                export_data["linear-coef"].append(results.params[target_column])
            if k=="nonlinear":
                # ２次単回帰
                x = x**2
                
                X = sm.add_constant(x)
                model = sm.OLS(y, X)
                results = model.fit()

                export_data["nonlinear-p"].append(results.pvalues[target_column])
                export_data["nonlinear-rsquared"].append(results.rsquared)
                export_data["nonlinear-rsquared_adj"].append(results.rsquared_adj)
                export_data["nonlinear-coef"].append(results.params[target_column])
    
    print(f"\nsaving {target_phase} phase regression results...\n")
    df = pd.DataFrame(export_data)   
    print(df)
    save_path = f"/Users/miyakooti/repositories/arai_MATLAB_program/csv/{folder_name}/{i}_{target_phase}_regression.csv"
    df.to_csv(save_path)  


saving boredom phase regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.567642         0.088190            -0.139762   
1                  ibi  0.492830         0.124428            -0.094465   
2                   lf  0.231969         0.331296             0.164120   
3                   hf  0.339592         0.226850             0.033562   
4                lf/hf  0.955636         0.000875            -0.248906   
5            kubios_lf  0.377719         0.197169            -0.003538   
6            kubios_hf  0.352013         0.216827             0.021033   
7         kubios_lf/hf  0.874940         0.006984            -0.241271   
8                 sdsd  0.075808         0.586558             0.483198   
9                rmssd  0.073207         0.593049             0.491311   
10               pnn20  0.034444         0.712969             0.641212   
11               pnn50  0.134152         0.467607             0.334


saving ultra phase regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.174448         0.404755             0.255944   
1                  ibi  0.197450         0.373450             0.216813   
2                   lf  0.766378         0.024661            -0.219174   
3                   hf  0.631649         0.062915            -0.171357   
4                lf/hf  0.949022         0.001156            -0.248555   
5            kubios_lf  0.842088         0.011166            -0.236043   
6            kubios_hf  0.610416         0.070754            -0.161558   
7         kubios_lf/hf  0.518347         0.111197            -0.111004   
8                 sdsd  0.372721         0.200881             0.001101   
9                rmssd  0.442292         0.153557            -0.058054   
10               pnn20  0.418511         0.168725            -0.039094   
11               pnn50  0.590558         0.078570            -0.15178

# 被験者ごとの分析

In [15]:
def findSubjectName(path):
    for subject_name in subject_name_list:
        if subject_name in path:
            return subject_name

In [20]:
for i,path in enumerate(pathes):

    df = pd.read_csv(path,index_col=0).fillna(0)
    
    # ここだけ設定してください。ごめんね
    flow_dataset = df.iloc[2:6]
    df = df[["bpm", "ibi", "lf", "hf","lf/hf","FC3","FC4","FCz","mean_3ch","concentration_FC3","concentration_FC4","concentration_FCz","questionnaire_average"]]
    
    export_data = {
        "target": target_columns,
        "linear-p": [],
        "linear-rsquared": [],
        "linear-rsquared_adj": [],
        "linear-coef": [],
        "nonlinear-p": [],
        "nonlinear-rsquared": [],
        "nonlinear-rsquared_adj": [],
        "nonlinear-coef": [],
    }
    
    for target_column in target_columns:

            x = flow_dataset[target_column]
            y = flow_dataset[["questionnaire_average"]]

            for k in ["linear", "nonlinear"]:
                if k=="linear":
                    # 線形単回帰
                    X = sm.add_constant(x)
                    model = sm.OLS(y, X)
                    results = model.fit()

                    export_data["linear-p"].append(results.pvalues[target_column])
                    export_data["linear-rsquared"].append(results.rsquared)
                    export_data["linear-rsquared_adj"].append(results.rsquared_adj)
                    export_data["linear-coef"].append(results.params[target_column])
                if k=="nonlinear":
                    # ２次単回帰
                    x = x**2

                    X = sm.add_constant(x)
                    model = sm.OLS(y, X)
                    results = model.fit()

                    export_data["nonlinear-p"].append(results.pvalues[target_column])
                    export_data["nonlinear-rsquared"].append(results.rsquared)
                    export_data["nonlinear-rsquared_adj"].append(results.rsquared_adj)
                    export_data["nonlinear-coef"].append(results.params[target_column])
                    
    print(f"\nsaving {findSubjectName(path)} regression results...\n")    
    df = pd.DataFrame(export_data)   
    print(df)

    save_path = f"/Users/miyakooti/repositories/arai_MATLAB_program/csv/{folder_name}/subject/{i}_{findSubjectName(path)}_regression.csv"
    df.to_csv(save_path)  


saving kumakura regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.123084         0.768982             0.653473   
1                  ibi  0.121437         0.771873             0.657809   
2                   lf  0.131243         0.754739             0.632109   
3                   hf  0.026761         0.947193             0.920790   
4                lf/hf  0.194089         0.649493             0.474239   
5            kubios_lf  0.492039         0.258025            -0.112963   
6            kubios_hf  0.000353         0.999293             0.998940   
7         kubios_lf/hf  0.107461         0.796625             0.694938   
8                 sdsd  0.714842         0.081315            -0.378028   
9                rmssd  0.997796         0.000005            -0.499993   
10               pnn20  0.557148         0.196118            -0.205824   
11               pnn50  0.733817         0.070853            -0.393720  


saving souma regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.572850         0.182457            -0.226315   
1                  ibi  0.546675         0.205503            -0.191745   
2                   lf  0.267899         0.535972             0.303958   
3                   hf  0.602669         0.157872            -0.263192   
4                lf/hf  0.097006         0.815398             0.723097   
5            kubios_lf  0.157514         0.709783             0.564674   
6            kubios_hf  0.459079         0.292595            -0.061107   
7         kubios_lf/hf  0.403882         0.355356             0.033034   
8                 sdsd  0.710337         0.083904            -0.374143   
9                rmssd  0.569561         0.185278            -0.222083   
10               pnn20  0.688104         0.097279            -0.354082   
11               pnn50  0.455827         0.296124            -0.055814   
1


saving tubota regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.483799         0.266463            -0.100305   
1                  ibi  0.491448         0.258625            -0.112062   
2                   lf  0.254110         0.556352             0.334527   
3                   hf  0.116388         0.780770             0.671155   
4                lf/hf  0.959812         0.001615            -0.497577   
5            kubios_lf  0.033010         0.935069             0.902603   
6            kubios_hf  0.115503         0.782335             0.673503   
7         kubios_lf/hf  0.553919         0.198988            -0.201518   
8                 sdsd  0.723365         0.076527            -0.385209   
9                rmssd  0.949511         0.002549            -0.496176   
10               pnn20  0.946435         0.002869            -0.495696   
11               pnn50  0.848407         0.022981            -0.465529   


https://atmarkit.itmedia.co.jp/ait/articles/2109/14/news024.html

# すべてのフェーズ，すべての被験者のデータをまとめて分析

In [17]:
# 使用するデータセットは一つなので，ここでまとめて作っちゃいましょう

for i,target_phase in enumerate(target_phases):
    
    for j,path in enumerate(pathes):

        df = pd.read_csv(path,index_col=0).fillna(0)
        # column
        # row
        df = df.loc[[target_phase]] # seriesとして取り出したいときはこっち
        if j == 0 and i == 0:
            print("flow_dataset初期化")
            flow_dataset = df
        else:
            flow_dataset = pd.concat([flow_dataset, df], axis=0)
        

for i,target_phase in enumerate(target_phases):


    export_data = {    
        "target": target_columns,
        "linear-p": [],
        "linear-rsquared": [],
        "linear-rsquared_adj": [],
        "linear-coef": [],
        "nonlinear-p": [],
        "nonlinear-rsquared": [],
        "nonlinear-rsquared_adj": [],
        "nonlinear-coef": [],
    }

    for target_column in target_columns:
        
        x = flow_dataset[target_column]
        y = flow_dataset[["questionnaire_average"]]
        
        for k in ["linear", "nonlinear"]:
            if k=="linear":
                # 線形単回帰
                X = sm.add_constant(x)
                model = sm.OLS(y, X)
                results = model.fit()

                export_data["linear-p"].append(results.pvalues[target_column])
                export_data["linear-rsquared"].append(results.rsquared)
                export_data["linear-rsquared_adj"].append(results.rsquared_adj)
                export_data["linear-coef"].append(results.params[target_column])
            if k=="nonlinear":
                # ２次単回帰
                x = x**2
                
                X = sm.add_constant(x)
                model = sm.OLS(y, X)
                results = model.fit()

                export_data["nonlinear-p"].append(results.pvalues[target_column])
                export_data["nonlinear-rsquared"].append(results.rsquared)
                export_data["nonlinear-rsquared_adj"].append(results.rsquared_adj)
                export_data["nonlinear-coef"].append(results.params[target_column])
    
    print(f"\nsaving {target_phase} phase regression results...\n")
    df = pd.DataFrame(export_data)   
    print(df)
    save_path = f"/Users/miyakooti/repositories/arai_MATLAB_program/csv/{folder_name}/all_regression.csv"
    df.to_csv(save_path)  

flow_dataset初期化

saving boredom phase regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.695298         0.007108            -0.038024   
1                  ibi  0.726114         0.005690            -0.039506   
2                   lf  0.095575         0.121146             0.081198   
3                   hf  0.443696         0.026911            -0.017320   
4                lf/hf  0.845112         0.001773            -0.043601   
5            kubios_lf  0.033105         0.190255             0.153448   
6            kubios_hf  0.334639         0.042357            -0.001172   
7         kubios_lf/hf  0.664704         0.008697            -0.036362   
8                 sdsd  0.781002         0.003588            -0.041704   
9                rmssd  0.701989         0.006785            -0.038361   
10               pnn20  0.539649         0.017340            -0.027327   
11               pnn50  0.684032         0.007672  


saving ultra phase regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.695298         0.007108            -0.038024   
1                  ibi  0.726114         0.005690            -0.039506   
2                   lf  0.095575         0.121146             0.081198   
3                   hf  0.443696         0.026911            -0.017320   
4                lf/hf  0.845112         0.001773            -0.043601   
5            kubios_lf  0.033105         0.190255             0.153448   
6            kubios_hf  0.334639         0.042357            -0.001172   
7         kubios_lf/hf  0.664704         0.008697            -0.036362   
8                 sdsd  0.781002         0.003588            -0.041704   
9                rmssd  0.701989         0.006785            -0.038361   
10               pnn20  0.539649         0.017340            -0.027327   
11               pnn50  0.684032         0.007672            -0.03743

# overloadを抜いたやつで分析してみる

In [18]:
# 使用するデータセットは一つなので，ここでまとめて作っちゃいましょう

for i,target_phase in enumerate(target_phases):
    
    if target_phase == "overload":
        continue
    
    for j,path in enumerate(pathes):

        df = pd.read_csv(path,index_col=0).fillna(0)
        # column
        # row
        df = df.loc[[target_phase]] # seriesとして取り出したいときはこっち
        if j == 0 and i == 0:
            print("flow_dataset初期化")
            flow_dataset = df
        else:
            flow_dataset = pd.concat([flow_dataset, df], axis=0)
        

for i,target_phase in enumerate(target_phases):


    export_data = {    
        "target": target_columns,
        "linear-p": [],
        "linear-rsquared": [],
        "linear-rsquared_adj": [],
        "linear-coef": [],
        "nonlinear-p": [],
        "nonlinear-rsquared": [],
        "nonlinear-rsquared_adj": [],
        "nonlinear-coef": [],
    }

    for target_column in target_columns:
        
        x = flow_dataset[target_column]
        y = flow_dataset[["questionnaire_average"]]
        
        for k in ["linear", "nonlinear"]:
            if k=="linear":
                # 線形単回帰
                X = sm.add_constant(x)
                model = sm.OLS(y, X)
                results = model.fit()

                export_data["linear-p"].append(results.pvalues[target_column])
                export_data["linear-rsquared"].append(results.rsquared)
                export_data["linear-rsquared_adj"].append(results.rsquared_adj)
                export_data["linear-coef"].append(results.params[target_column])
            if k=="nonlinear":
                # ２次単回帰
                x = x**2
                
                X = sm.add_constant(x)
                model = sm.OLS(y, X)
                results = model.fit()

                export_data["nonlinear-p"].append(results.pvalues[target_column])
                export_data["nonlinear-rsquared"].append(results.rsquared)
                export_data["nonlinear-rsquared_adj"].append(results.rsquared_adj)
                export_data["nonlinear-coef"].append(results.params[target_column])
    
    print(f"\nsaving {target_phase} phase regression results...\n")
    df = pd.DataFrame(export_data)   
    print(df)
    save_path = f"/Users/miyakooti/repositories/arai_MATLAB_program/csv/{folder_name}/all_regressiton_without_overload.csv"
    df.to_csv(save_path)  

flow_dataset初期化

saving boredom phase regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.524440         0.025785            -0.035103   
1                  ibi  0.538384         0.024112            -0.036881   
2                   lf  0.055602         0.210307             0.160951   
3                   hf  0.225226         0.090458             0.033612   
4                lf/hf  0.527944         0.025358            -0.035558   
5            kubios_lf  0.013959         0.322459             0.280113   
6            kubios_hf  0.179430         0.109688             0.054043   
7         kubios_lf/hf  0.827226         0.003067            -0.059241   
8                 sdsd  0.839520         0.002641            -0.059694   
9                rmssd  0.913602         0.000759            -0.061694   
10               pnn20  0.877578         0.001529            -0.060876   
11               pnn50  0.872264         0.001666  


saving overload phase regression results...

                target  linear-p  linear-rsquared  linear-rsquared_adj  \
0                  bpm  0.524440         0.025785            -0.035103   
1                  ibi  0.538384         0.024112            -0.036881   
2                   lf  0.055602         0.210307             0.160951   
3                   hf  0.225226         0.090458             0.033612   
4                lf/hf  0.527944         0.025358            -0.035558   
5            kubios_lf  0.013959         0.322459             0.280113   
6            kubios_hf  0.179430         0.109688             0.054043   
7         kubios_lf/hf  0.827226         0.003067            -0.059241   
8                 sdsd  0.839520         0.002641            -0.059694   
9                rmssd  0.913602         0.000759            -0.061694   
10               pnn20  0.877578         0.001529            -0.060876   
11               pnn50  0.872264         0.001666            -0.06

# 線形重回帰分析（マルチこに気をつけよう）

In [None]:
# x = flow_dataset[["bpm", "ibi","mean_3ch"]]
# y = flow_dataset[["questionnaire_average"]]

# #全要素が1の列を説明変数の先頭に追加,切片をつけるために必ず必要
# X = sm.add_constant(x)
 
# #モデルの設定
# model = sm.OLS(y, X)
 
# #回帰分析の実行
# results = model.fit()
 
# #結果の詳細を表示
# print(results.summary())

## いい結果のように思えるが、多重共線性により偽の有意性が出てしまっている

### 参考
- https://takacity.blog.fc2.com/blog-entry-305.html
- https://self-development.info/%E3%80%90%E5%88%9D%E5%BF%83%E8%80%85%E8%84%B1%E5%87%BA%E3%80%91statsmodels%E3%81%AB%E3%82%88%E3%82%8B%E9%87%8D%E5%9B%9E%E5%B8%B0%E5%88%86%E6%9E%90%E7%B5%90%E6%9E%9C%E3%81%AE%E8%A6%8B%E6%96%B9/
- https://teratail.com/questions/256310

In [None]:
# 決定係数
# 0.9以上	非常によい
# 0.7以上0.9未満	よい
# 0.5以上0.7未満	あまりよくない
# 0.5未満	悪い

# Dep. Variable:     questionnaire_average   R-squared:                       0.915（決定係数。説明変数が目的変数をどれくらい説明できるか）
# Model:                               OLS   Adj. R-squared:                  0.893（自由度調整済み決定係数）
# Method:                    Least Squares   F-statistic:                     42.82（F値）
# Date:                   Fri, 13 Jan 2023   Prob (F-statistic):            0.00282（F値の現れる確率）
# Time:                           12:17:03   Log-Likelihood:                 2.8630
# No. Observations:           データの行数   AIC:                            -1.726
# Df Residuals:               残差の自由度   BIC:                            -2.142
# Df Model:                   要因の自由度                                        
# Covariance Type:  nonrobust（変数間の相関関係）


#       coef（回帰係数。傾き）    std err（標準誤差）  t      P>|t|      [0.025  0.975]
# ------------------------------------------------------------------------------
# const          7.3625      0.174     42.246      0.000       6.879       7.846
# lf         -6.729e-06   1.03e-06     -6.544      0.003   -9.58e-06   -3.87e-06
# ==============================================================================
# Omnibus:                          nan   Durbin-Watson:                   1.765
# Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.156
# Skew:                          -0.179   Prob(JB):                        0.925
# Kurtosis:                       2.296   Cond. No.                     3.93e+05
# ==============================================================================

In [None]:
dir(results)

In [None]:
print(results.__doc__)

In [None]:
dir(results.params)

- [pandas dictionary出力](https://note.nkmk.me/python-pandas-to-csv/)
- [pandas dictionary出力](https://atmarkit.itmedia.co.jp/ait/articles/2109/14/news024.html)