<a href="https://colab.research.google.com/github/notice4/google-colab-study/blob/main/lecture_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from  sklearn.ensemble import  RandomForestClassifier
from sklearn.model_selection import  train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [2]:
medicine = pd.read_csv("https://raw.githubusercontent.com/notice4/google-colab-study/refs/heads/main/Breast_cancer_data.csv")
medicine

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0
1,20.57,17.77,132.90,1326.0,0.08474,0
2,19.69,21.25,130.00,1203.0,0.10960,0
3,11.42,20.38,77.58,386.1,0.14250,0
4,20.29,14.34,135.10,1297.0,0.10030,0
...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0
565,20.13,28.25,131.20,1261.0,0.09780,0
566,16.60,28.08,108.30,858.1,0.08455,0
567,20.60,29.33,140.10,1265.0,0.11780,0


In [4]:
medicine['diagnosis'].value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
1,357
0,212


In [5]:
medicine.shape

(569, 6)

In [6]:
medicine.isnull().sum()

Unnamed: 0,0
mean_radius,0
mean_texture,0
mean_perimeter,0
mean_area,0
mean_smoothness,0
diagnosis,0


In [13]:
scaler = MinMaxScaler()
model = RandomForestClassifier()
y = medicine['diagnosis']
X = medicine.drop('diagnosis',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [14]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [32]:
parameters = {
    'n_estimators': [20, 40, 60, 80, 100, 200, 300]
}

# GridSearchCV is a "combination of options".
#
# For each variant, it: 1) splits the data into parts (this is called CV—cross-validation)
#                       2) Trains the model
#                       3) Checks the accuracy
#                       4) Stores the result
#
# Selects the best combination of parameters
#

grid_search = GridSearchCV (
    estimator = model, # model we are working with (RandomForestClassifier)
    param_grid = parameters, # this is a dictionary with parameter values ​​that need to be iterated through
    scoring = 'accuracy', # what we are searching (best accuracy for model)
    n_jobs = -1, # -1 means "use all cpu-s"
    cv = 5 # this is the number of folds in cross-validation (4 for train 1 for test)
    )

grid_search.fit(X_train, y_train)

In [33]:
print(f'Best parameter found by GridSearch: {grid_search.best_params_}')
print(f'Best accuracy score with best parameter: {grid_search.best_score_}')

Best parameter found by GridSearch: {'n_estimators': 80}
Best accuracy score with best parameter: 0.9318681318681319


In [34]:
medicine.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [35]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [36]:
print(357 * 0.85) # over = 357 -> 303
print(303 / 0.86) # under = 212 -> 353

303.45
352.3255813953488


In [37]:
pipeline = Pipeline([
    ('over', SMOTE(sampling_strategy=0.85)),
    ('under', RandomUnderSampler(sampling_strategy=0.86)),
    ('model', RandomForestClassifier())
])

pipeline.fit(X_train, y_train)

In [38]:
pipeline.score(X_test, y_test)

0.9122807017543859