# データ準備

## Excelシートからcsv形式で保存

In [30]:
import pandas as pd

path="C:\Code\Python_code\_git\\voice\音声データ.xlsx"

input_file=pd.ExcelFile(path)
sheet=input_file.sheet_names

for i in range(0,len(sheet)):

        data=pd.read_excel(path,i,index_col=None)
        data.to_csv(sheet[i]+".csv",encoding="shift_jis")

## csvからデータを取り出す。

In [31]:
male_train=pd.read_csv("C:\Code\Python_code\_git\\voice\A-男性.csv")
female_train=pd.read_csv("C:\Code\Python_code\_git\\voice\A-女性.csv")

In [32]:
male_train=male_train[["f0","f1","f2","f3","f4"]]
female_train=female_train[["f0","f1","f2","f3","f4"]]
male_train["Sex"]=True
female_train["Sex"]=False

In [33]:
train_data=pd.concat([male_train,female_train],axis=0)
train_data.reset_index(drop=True,inplace=True)

## データの分割

In [34]:
from sklearn.model_selection import train_test_split

X=train_data.drop("Sex",axis=1)
y=train_data["Sex"]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## 標準化

In [35]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)

# ランダムフォレスト

## モデル作成と予測

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model=RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(X_train,y_train)

y_pred=model.predict(X_test)

accuracy=accuracy_score(y_test,y_pred)
print("accuracy:",accuracy)
print("正答率(%):",accuracy*100,"%")

accuracy: 0.9047619047619048
正答率(%): 90.47619047619048 %


次にベイズ最適化を行い、精度が上がるのか検証を行う。

# ランダムフォレスト（ベイズ最適化）

## ベイズ最適化

    **gp_params: ガウシアンプロセス（GP）に関するハイパーパラメータを指定します。alphaはガウシアンプロセスの正則化パラメータで、ここでは1e-5（0.00001）に設定されています。ガウシアンプロセスは、ベイズ最適化アルゴリズムの一部として使用され、目的関数の不確かさをモデル化するのに役立ちます。

In [37]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

def randomforest_cv(n_estimators, min_samples_split, max_features):
    val = cross_val_score(
        RandomForestClassifier(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max_features,
            random_state=42
        ),
        X_train, y_train,
        scoring = 'accuracy',
        cv = 3, # 3-fold
        n_jobs = -1 # use all CPUs
    ).mean()
    return val

randomforest_cv_bo = BayesianOptimization(
    randomforest_cv,
    {'n_estimators': (10, 250),
    'min_samples_split': (2, 25),
    'max_features': (0.1, 0.999)}
)

gp_params = {"alpha": 1e-5}
randomforest_cv_bo.maximize(n_iter=50, **gp_params)

best_params=randomforest_cv_bo.max["params"]
print(best_params)


|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------


Passing acquisition function parameters or gaussian process parameters to maximize
is no longer supported, and will cause an error in future releases. Instead,
please use the "set_gp_params" method to set the gp params, and pass an instance
 of bayes_opt.util.UtilityFunction using the acquisition_function argument

  randomforest_cv_bo.maximize(n_iter=50, **gp_params)


| [0m1        [0m | [0m0.9277   [0m | [0m0.8308   [0m | [0m3.253    [0m | [0m239.9    [0m |
| [0m2        [0m | [0m0.9277   [0m | [0m0.5572   [0m | [0m11.97    [0m | [0m29.93    [0m |
| [0m3        [0m | [0m0.9039   [0m | [0m0.393    [0m | [0m6.724    [0m | [0m168.2    [0m |
| [95m4        [0m | [95m0.9396   [0m | [95m0.8548   [0m | [95m6.869    [0m | [95m158.2    [0m |
| [0m5        [0m | [0m0.7844   [0m | [0m0.1846   [0m | [0m15.64    [0m | [0m43.28    [0m |
| [0m6        [0m | [0m0.9396   [0m | [0m0.7189   [0m | [0m8.17     [0m | [0m159.1    [0m |
| [0m7        [0m | [0m0.9277   [0m | [0m0.8938   [0m | [0m8.609    [0m | [0m17.62    [0m |
| [0m8        [0m | [0m0.9277   [0m | [0m0.975    [0m | [0m15.43    [0m | [0m147.3    [0m |
| [0m9        [0m | [0m0.9277   [0m | [0m0.999    [0m | [0m24.27    [0m | [0m18.22    [0m |
| [0m10       [0m | [0m0.9277   [0m | [0m0.999    [0m | [0m25.0     [

## モデル作成と予測

In [38]:
best_model=RandomForestClassifier(
    n_estimators=int(best_params["n_estimators"]),
    min_samples_split=int(best_params["min_samples_split"]),
    max_features=best_params["max_features"]
)

best_model.fit(X_train,y_train)

y_pred=best_model.predict(X_test)

accuracy=accuracy_score(y_test,y_pred)
print("accuracy:",accuracy)
print("正答率(%):",accuracy*100,"%")

accuracy: 1.0
正答率(%): 100.0 %
