# データ準備

## Excelシートからcsv形式で保存

In [163]:
import pandas as pd

path="C:\Code\Python_code\machine_learning\音声データ.xlsx"

input_file=pd.ExcelFile(path)
sheet=input_file.sheet_names

for i in range(0,len(sheet)):

        data=pd.read_excel(path,i,index_col=None)
        data.to_csv(sheet[i]+".csv",encoding="shift_jis")

## csvからデータを取り出す。

In [164]:
male_train=pd.read_csv("C:\Code\Python_code\machine_learning\A-男性.csv")
female_train=pd.read_csv("C:\Code\Python_code\machine_learning\A-女性.csv")

In [165]:
male_train=male_train[["f0","f1","f2","f3","f4"]]
female_train=female_train[["f0","f1","f2","f3","f4"]]
male_train["Sex"]=True
female_train["Sex"]=False
print(male_train,"\n",female_train)

     f0   f1    f2    f3    f4   Sex
0   133  734  1196  4322  5327  True
1   152  794  1367  3628  5497  True
2   222  784  1467  3518  5397  True
3   115  693  1256  3196  4784  True
4   108  613  1216  3357  5065  True
..  ...  ...   ...   ...   ...   ...
69  113  744  1052  3754  6658  True
70  103  599  1016  3267  6222  True
71  153  671  1197  3048  6150  True
72  118  808  1150  2676  3596  True
73  109  753  1282  2878  3847  True

[74 rows x 6 columns] 
      f0    f1    f2    f3    f4    Sex
0   240   740  1678  2663  6000  False
1   308   623  1400  3588  6000  False
2   318   573  1307  3467  6121  False
3   177   764  1407  3166  4392  False
4   176   704  1467  3367  4452  False
5   231   667  1346  3044  3987  False
6   175   654  1233  3736  5623  False
7   171   692  1472  3874  5736  False
8   220   843  1285  3409  5175  False
9   224   579  1208  3157  5057  False
10  204   818  1409  3308  4289  False
11  292  1082  1761  3396  4314  False
12  292   843  1459  338

In [166]:
train_data=pd.concat([male_train,female_train],axis=0)
train_data.reset_index(drop=True,inplace=True)
train_data

Unnamed: 0,f0,f1,f2,f3,f4,Sex
0,133,734,1196,4322,5327,True
1,152,794,1367,3628,5497,True
2,222,784,1467,3518,5397,True
3,115,693,1256,3196,4784,True
4,108,613,1216,3357,5065,True
...,...,...,...,...,...,...
99,217,822,1464,3008,5153,False
100,203,842,1664,2827,4434,False
101,265,519,1341,3514,4843,False
102,284,519,1302,3090,4529,False


## データの分割

In [167]:
from sklearn.model_selection import train_test_split

X=train_data.drop("Sex",axis=1)
y=train_data["Sex"]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=40)

## 標準化

In [168]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)

In [181]:
y_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 21 entries, 17 to 96
Series name: Sex
Non-Null Count  Dtype
--------------  -----
21 non-null     bool 
dtypes: bool(1)
memory usage: 189.0 bytes


# ランダムフォレスト

## モデル作成と予測

In [169]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model=RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(X_train,y_train)

y_pred=model.predict(X_test)

accuracy=accuracy_score(y_test,y_pred)
print("accuracy:",accuracy)
print("正答率(%):",accuracy*100,"%")

accuracy: 0.9523809523809523
正答率(%): 95.23809523809523 %


# ランダムフォレスト（ベイズ最適化）

## ベイズ最適化

    **gp_params: ガウシアンプロセス（GP）に関するハイパーパラメータを指定します。alphaはガウシアンプロセスの正則化パラメータで、ここでは1e-5（0.00001）に設定されています。ガウシアンプロセスは、ベイズ最適化アルゴリズムの一部として使用され、目的関数の不確かさをモデル化するのに役立ちます。

In [170]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

def randomforest_cv(n_estimators, min_samples_split, max_features):
    val = cross_val_score(
        RandomForestClassifier(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max_features,
            random_state=42
        ),
        X_train, y_train,
        scoring = 'accuracy',
        cv = 3, # 3-fold
        n_jobs = -1 # use all CPUs
    ).mean()
    return val

randomforest_cv_bo = BayesianOptimization(
    randomforest_cv,
    {'n_estimators': (10, 250),
    'min_samples_split': (2, 25),
    'max_features': (0.1, 0.999)}
)

gp_params = {"alpha": 1e-5}
randomforest_cv_bo.maximize(n_iter=50, **gp_params)

best_params=randomforest_cv_bo.max["params"]
print(best_params)


|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------


Passing acquisition function parameters or gaussian process parameters to maximize
is no longer supported, and will cause an error in future releases. Instead,
please use the "set_gp_params" method to set the gp params, and pass an instance
 of bayes_opt.util.UtilityFunction using the acquisition_function argument

  randomforest_cv_bo.maximize(n_iter=50, **gp_params)


| [0m1        [0m | [0m0.9158   [0m | [0m0.7328   [0m | [0m15.79    [0m | [0m82.84    [0m |
| [0m2        [0m | [0m0.832    [0m | [0m0.3209   [0m | [0m22.37    [0m | [0m200.0    [0m |
| [0m3        [0m | [0m0.9039   [0m | [0m0.8649   [0m | [0m2.564    [0m | [0m63.15    [0m |
| [0m4        [0m | [0m0.9039   [0m | [0m0.4343   [0m | [0m16.33    [0m | [0m248.7    [0m |
| [95m5        [0m | [95m0.9277   [0m | [95m0.4879   [0m | [95m6.051    [0m | [95m54.44    [0m |
| [0m6        [0m | [0m0.8078   [0m | [0m0.3291   [0m | [0m21.63    [0m | [0m47.11    [0m |
| [0m7        [0m | [0m0.8911   [0m | [0m0.1831   [0m | [0m2.31     [0m | [0m48.37    [0m |
| [0m8        [0m | [0m0.8792   [0m | [0m0.3284   [0m | [0m4.718    [0m | [0m54.35    [0m |
| [0m9        [0m | [0m0.9039   [0m | [0m0.6654   [0m | [0m7.318    [0m | [0m229.5    [0m |
| [0m10       [0m | [0m0.9158   [0m | [0m0.8043   [0m | [0m20.36    [

## 

## モデル作成と予測

In [171]:
best_model=RandomForestClassifier(
    n_estimators=int(best_params["n_estimators"]),
    min_samples_split=int(best_params["min_samples_split"]),
    max_features=best_params["max_features"]
)

best_model.fit(X_train,y_train)

y_pred=best_model.predict(X_test)

accuracy=accuracy_score(y_test,y_pred)
print("accuracy:",accuracy)
print("正答率(%):",accuracy*100,"%")

accuracy: 0.9523809523809523
正答率(%): 95.23809523809523 %


# K近傍法

## モデル作成と予測

In [172]:
from sklearn.neighbors import KNeighborsClassifier

knn=KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train,y_train)

y_pred_knn=knn.predict(X_test)
print(accuracy_score(y_test,y_pred_knn))

0.9047619047619048


## モデル最適化と予測

In [173]:
list_nn=[]
list_score=[]

for k in range(1,31):
    knc=KNeighborsClassifier(n_neighbors=k)
    knc.fit(X_train,y_train)

    y_pred_knc=knc.predict(X_test)

    
    score=accuracy_score(y_test,y_pred_knc)
    list_score.append(score)
    #print(score)

max_score = max(list_score)
best_k = list_score.index(max_score) + 1
print("k:",best_k)
print("accuracy:",max_score)


k: 1
accuracy: 0.9523809523809523


# XGboost(ベイズ最適化)

In [176]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

xgtrain = xgb.DMatrix(X_train, label=y_train)

def xgboost_cv(
        learning_rate,
        max_depth,
        subsample,
        colsample_bytree,
        min_child_weight,
        gamma,
        alpha):

        params = {
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'min_child_weight': min_child_weight,
            'gamma': gamma,
            'alpha': alpha,
            'objective': 'binary:logistic',
            'eval_metric': 'error'
        }

        cv_result = xgb.cv(
            params,
            xgtrain,
            num_boost_round=10, 
            nfold=3,
            seed=0
        )

        return 1.0 - cv_result['test-error-mean'].values[-1]

xgboost_cv_bo = BayesianOptimization(xgboost_cv, 
                            {
                                'learning_rate': (0.1, 0.9),
                                'max_depth': (5, 15),
                                'subsample': (0.5, 1),
                                'colsample_bytree': (0.1, 1),
                                'min_child_weight': (1, 20),
                                'gamma': (0, 10),
                                'alpha': (0, 10),
                            })

xgboost_cv_bo.maximize(n_iter=50)

best_params_xg=xgboost_cv_bo.max["params"]
print(best_params_xg)


|   iter    |  target   |   alpha   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7227   [0m | [0m2.989    [0m | [0m0.745    [0m | [0m7.137    [0m | [0m0.5481   [0m | [0m6.713    [0m | [0m4.829    [0m | [0m0.7314   [0m |
| [0m2        [0m | [0m0.7227   [0m | [0m6.856    [0m | [0m0.7048   [0m | [0m2.071    [0m | [0m0.2154   [0m | [0m12.53    [0m | [0m19.5     [0m | [0m0.9099   [0m |
| [0m3        [0m | [0m0.7227   [0m | [0m3.247    [0m | [0m0.7844   [0m | [0m8.121    [0m | [0m0.5149   [0m | [0m5.462    [0m | [0m14.14    [0m | [0m0.9917   [0m |
| [95m4        [0m | [95m0.9506   [0m | [95m3.49     [0m | [95m0.8318   [0m | [95m0.2654   [0m | [95m0.5995   [0m | [95m9.886    [0m | [95m3.337    [0m | [95m0.9062   [0m |
| [0m5        [0m | [0m0.7227   [0m | 

In [179]:
best_model_xg=XGBClassifier(
    learning_rate=best_params_xg["learning_rate"],
    max_depth=int(best_params_xg["max_depth"]),
    subsample=best_params_xg["subsample"],
    colsample_bytree=best_params_xg["colsample_bytree"],
    min_child_weight=best_params_xg["min_child_weight"],
    gamma=best_params_xg["gamma"],
    alpha=best_params_xg["alpha"]
)

best_model_xg.fit(X_train,y_train)

y_pred_xg=best_model_xg.predict(X_test)

accuracy=accuracy_score(y_test,y_pred_xg)
print("accuracy:",accuracy)
print("正答率(%):",accuracy*100,"%")

accuracy: 0.9047619047619048
正答率(%): 90.47619047619048 %


# SVM

In [180]:
from sklearn import svm

model_svm=svm.SVC(kernel="linear",C=1.0)
model_svm.fit(X_train,y_train)

y_pred_svm=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred_svm)
print("accuracy:",accuracy)

accuracy: 0.9523809523809523
