## AutoMLライブラリのAutogluonを使用

## import関係

In [45]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

## path関係

In [46]:
citrus_path = "./citrus.csv"
healthcare_path = "./healthcare-dataset-stroke-data.csv"

## 課題1　グレープとオレンジの識別

In [47]:
citrus = pd.read_csv(citrus_path)
print(citrus.info())

train, test = train_test_split(citrus, test_size=0.1, shuffle=True, random_state=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      10000 non-null  object 
 1   diameter  10000 non-null  float64
 2   weight    10000 non-null  float64
 3   red       10000 non-null  int64  
 4   green     10000 non-null  int64  
 5   blue      10000 non-null  int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 468.9+ KB
None


In [48]:
label = 'name'
print("Summary of class variable: \n", train[label].describe())

Summary of class variable: 
 count       9000
unique         2
top       orange
freq        4518
Name: name, dtype: object


## train

In [49]:
save_path = 'citrus-predictClass'  # specifies folder to store trained models
predictor = TabularPredictor(label=label, path=save_path).fit(train)

Beginning AutoGluon training ...
AutoGluon will save models to "citrus-predictClass/"
AutoGluon Version:  0.5.2
Python Version:     3.8.13
Operating System:   Linux
Train Data Rows:    9000
Train Data Columns: 5
Label Column: name
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['orange', 'grapefruit']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = orange, class 0 = grapefruit
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (orange) vs negative (grapefruit) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...

[1000]	valid_set's binary_error: 0.0455556
[2000]	valid_set's binary_error: 0.0366667


	0.9633	 = Validation score   (accuracy)
	2.29s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: LightGBM ...
	0.9922	 = Validation score   (accuracy)
	0.94s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.96	 = Validation score   (accuracy)
	2.84s	 = Training   runtime
	0.58s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.9589	 = Validation score   (accuracy)
	3.02s	 = Training   runtime
	0.56s	 = Validation runtime
Fitting model: CatBoost ...
	0.9911	 = Validation score   (accuracy)
	9.11s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.9567	 = Validation score   (accuracy)
	3.13s	 = Training   runtime
	0.6s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.96	 = Validation score   (accuracy)
	3.16s	 = Training   runtime
	0.63s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	1.0	 = Validation score   (accuracy)
	18.94s	 = Training   runtime
	0.05s	 = Val

## predction

In [50]:
y_test = test[label]
test_data_nolab = test.drop(columns=[label])
test_data_nolab.head()

Unnamed: 0,diameter,weight,red,green,blue
8034,11.8,202.88,134,63,2
7122,11.22,193.76,156,66,12
7571,11.5,198.09,158,64,14
7347,11.35,196.08,143,75,17
3246,8.94,160.01,174,81,2


In [51]:
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.995
Evaluations on test data:
{
    "accuracy": 0.995,
    "balanced_accuracy": 0.9948132780082988,
    "mcc": 0.990033079788606,
    "f1": 0.9947862356621481,
    "precision": 1.0,
    "recall": 0.9896265560165975
}


Predictions:  
 8034    grapefruit
7122    grapefruit
7571    grapefruit
7347    grapefruit
3246        orange
           ...    
7973    grapefruit
7264    grapefruit
8307    grapefruit
9868    grapefruit
6244    grapefruit
Name: name, Length: 1000, dtype: object


## 評価結果
accuracyが0.993と非常に高い精度であった
precision, recallも0.99以上であったので精度として問題ないと考える

In [52]:
predictor.leaderboard(test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetTorch,0.996,0.995556,0.026806,0.026531,101.324986,0.026806,0.026531,101.324986,1,True,12
1,NeuralNetFastAI,0.995,1.0,0.057089,0.045088,18.940271,0.057089,0.045088,18.940271,1,True,10
2,WeightedEnsemble_L2,0.995,1.0,0.063728,0.047333,19.933544,0.006639,0.002245,0.993273,2,True,14
3,XGBoost,0.991,0.992222,0.017426,0.021219,1.184178,0.017426,0.021219,1.184178,1,True,11
4,LightGBMLarge,0.991,0.992222,0.040867,0.02235,2.943447,0.040867,0.02235,2.943447,1,True,13
5,LightGBM,0.988,0.992222,0.010515,0.010699,0.938281,0.010515,0.010699,0.938281,1,True,4
6,CatBoost,0.987,0.991111,0.007676,0.005901,9.105427,0.007676,0.005901,9.105427,1,True,7
7,RandomForestEntr,0.952,0.958889,0.593511,0.560535,3.015427,0.593511,0.560535,3.015427,1,True,6
8,RandomForestGini,0.952,0.96,0.602823,0.58047,2.838186,0.602823,0.58047,2.838186,1,True,5
9,ExtraTreesEntr,0.948,0.96,0.640574,0.625536,3.162647,0.640574,0.625536,3.162647,1,True,9


## 課題2 脳卒中の発症の有無

In [53]:
healthcare = pd.read_csv(healthcare_path)
print(healthcare.info())

train, test = train_test_split(healthcare, test_size=0.1, shuffle=True, random_state=1000)

print(train["stroke"].value_counts())
print(test["stroke"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None
0    4381
1     218
Name: stroke, dtype: int64
0    480
1     31
Name: stroke, dtype: int64


In [54]:
label = 'stroke'
print("Summary of class variable: \n", train[label].describe())

Summary of class variable: 
 count    4599.000000
mean        0.047402
std         0.212519
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: stroke, dtype: float64


## train

In [55]:
save_path = 'healthcare-predictClass'
predictor = TabularPredictor(label=label, path=save_path).fit(train)

Beginning AutoGluon training ...
AutoGluon will save models to "healthcare-predictClass/"
AutoGluon Version:  0.5.2
Python Version:     3.8.13
Operating System:   Linux
Train Data Rows:    4599
Train Data Columns: 11
Label Column: stroke
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10888.86 MB
	Train Data (Original)  Memory Usage: 1.68 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the

## prediction

In [56]:
y_test = test[label]
test_data_nolab = test.drop(columns=[label])
test_data_nolab.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
147,62861,Female,78.0,0,0,Yes,Private,Urban,67.29,24.6,never smoked
963,4083,Female,30.0,0,0,No,Private,Rural,73.69,17.3,never smoked
1544,51020,Female,55.0,0,0,Yes,Private,Rural,87.78,25.2,formerly smoked
1178,55775,Female,59.0,0,0,Yes,Private,Rural,226.11,32.8,formerly smoked
4891,18636,Female,26.0,0,0,Yes,Govt_job,Urban,72.56,35.4,never smoked


In [57]:
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.9334637964774951
Evaluations on test data:
{
    "accuracy": 0.9334637964774951,
    "balanced_accuracy": 0.496875,
    "mcc": -0.019529404440363142,
    "f1": 0.0,
    "precision": 0.0,
    "recall": 0.0
}


Predictions:  
 147     0
963     0
1544    0
1178    0
4891    0
       ..
2880    0
2862    0
360     0
520     0
2641    0
Name: stroke, Length: 511, dtype: int64


## 評価結果
accuracyは0.937と非常に高い精度であった  
しかし、precisionやrecallが非常に低く偽陰性が高いと考えられる  
医療現場で使用されることを考えると偽陽性より偽陰性の方が問題になると考えられるためモデルを改良してprecision, recallが1に近づくようにモデルを改良するかデータの前処理をする必要がある  

In [58]:
predictor.leaderboard(test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM,0.941292,0.954,0.012458,0.011793,0.854557,0.012458,0.011793,0.854557,1,True,4
1,LightGBMLarge,0.939335,0.952,0.008245,0.012793,1.925025,0.008245,0.012793,1.925025,1,True,13
2,LightGBMXT,0.939335,0.954,0.009866,0.0111,0.983306,0.009866,0.0111,0.983306,1,True,3
3,KNeighborsUnif,0.939335,0.952,0.034587,0.040053,0.026536,0.034587,0.040053,0.026536,1,True,1
4,NeuralNetTorch,0.939335,0.952,0.042679,0.034555,11.235062,0.042679,0.034555,11.235062,1,True,12
5,NeuralNetFastAI,0.939335,0.952,0.046505,0.043019,10.063351,0.046505,0.043019,10.063351,1,True,10
6,ExtraTreesEntr,0.939335,0.952,0.601437,0.491566,2.894496,0.601437,0.491566,2.894496,1,True,9
7,ExtraTreesGini,0.939335,0.952,0.608016,0.471864,2.962943,0.608016,0.471864,2.962943,1,True,8
8,CatBoost,0.937378,0.954,0.013578,0.011005,4.3252,0.013578,0.011005,4.3252,1,True,7
9,KNeighborsDist,0.937378,0.948,0.039125,0.043575,0.031065,0.039125,0.043575,0.031065,1,True,2
