# **Machine Learning Guide With Optuna**

In [1]:
# Import 
import pandas as pd 
import numpy as np 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split

In [2]:
# Some Basic Setting
N_SPLIT = 10
SEED = 42

# Load Data

In [3]:
# Load Data 
df = pd.read_csv('/kaggle/input/iris/Iris.csv')

# Head 
print(df.head())

# Target Count 
print('----------------------------------------------')
print(df['Species'].value_counts())
print('----------------------------------------------')

# Encode The Target as in PReprocessing

# Label Encoder ko Intlize krien
LE = LabelEncoder()

# Es ko Fit krien 
df['Species'] = LE.fit_transform(df['Species'])
print('Target Encoded ')

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
----------------------------------------------
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64
----------------------------------------------
Target Encoded 


In [4]:
# Tips 
tip = sns.load_dataset('tips')

print(tip.head())

print('----------------------------------------------')
print(tip['sex'].value_counts())
print('----------------------------------------------')

# Encode The Target as in PReprocessing
# Label Encoder ko Intlize krien
LE = LabelEncoder()

# Es ko Fit krien 
tip['sex'] = LE.fit_transform(tip['sex'])
tip['smoker'] = LE.fit_transform(tip['smoker'])
tip['day'] = LE.fit_transform(tip['day'])
tip['time'] = LE.fit_transform(tip['time'])

print('Data Encoded ')
print('----------------------------------------------')

# Scale Krien Data ko 
# Scaler Ko Intilize krn 
sc = StandardScaler()

tip[['tip','total_bill']] = sc.fit_transform(tip[['tip','total_bill']])
print('Data Scaled ')
print('----------------------------------------------')

print(tip.head())

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
----------------------------------------------
sex
Male      157
Female     87
Name: count, dtype: int64
----------------------------------------------
Data Encoded 
----------------------------------------------
Data Scaled 
----------------------------------------------
   total_bill       tip  sex  smoker  day  time  size
0   -0.314711 -1.439947    0       0    2     0     2
1   -1.063235 -0.969205    1       0    2     0     3
2    0.137780  0.363356    1       0    2     0     3
3    0.438315  0.225754    1       0    2     0     2
4    0.540745  0.443020    0       0    2     0     4


# <p style="background-color: #00A4CCFF; font-family:Pacifico; color:#E6E6FA; font-size:200%; text-align:center; border-radius:0%;">Machine Learning</p>

I Will Show you How To Train Model and , Which Things Effect the Model , How to Get Accurate Model Each and Everthing Will Be Discussed Here.

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">Simple Training</p>

In [7]:
# Sab se phly ham libraries ko import kr lyn gye Jo ke Important hain Hamaray Liye , Meny Idhr Data Bilkul Small Liye hy Maybe Idhr Apko
# Optuna ke Results me itna Farak Show na ho Liken Optuna is Best For Large Datsets.
# Me idhr bs 4 Models ko hi Import krn ga ap Apni Problem ke Hisab se Mukhtalif Models ko Use kr Sktay Hain 

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# CatBoost
from catboost import CatBoostClassifier

# LightGBM
from lightgbm import LGBMClassifier

# XGBoost
from xgboost import XGBClassifier

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Import The Metrix 
from sklearn.metrics import *


# # X and Y ko Alg Kr Lyn Ek br 
X_iris = df.drop('Species',axis=1)
y_iris = df['Species']

# Simple Train Test Split 
X_train_iris,X_test_iris,y_train_iris,y_test_iris = train_test_split(X_iris,y_iris,test_size=0.2,random_state=42)

# # X and Y ko Alg Kr Lyn Ek br 
X_tip = tip.drop('sex',axis=1)
y_tip = tip['sex']

# Simple Train Test Split 
X_train_tip,X_test_tip,y_train_tip,y_test_tip = train_test_split(X_tip,y_tip,test_size=0.2,random_state=42)

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">1. Iris</p>

In [8]:
# Acha to Ab hamaray models import ho gye hain ab ham sb se phly , in models me se best model ko select krien gyen
# Jo Model sb se acha ho ga us pa ham further Parameter tunning krien gye.

# Best Model ko Kesy Find Kia Jaa Skta hy , Ager Data Set Bht Large ho to tb kesy Best Model ko Nikalna hy , Chalien Sb Se Phly Dekhtay
# Hain How to Get Best Model
def Check_All(X_train,X_test,y_train,y_test):
    # Sb se Phly Ham ek Dic Bana Lyn Gye Models ki With Their names.
    models = {
        'RF' : RandomForestClassifier(random_state=SEED),
        'DT' : DecisionTreeClassifier(random_state=SEED),
        'XGB': XGBClassifier(random_state=SEED),
        'CAT': CatBoostClassifier(verbose=0,random_state=SEED), # Idhr me Verbose 0 or -1 is liye pass kr rha hn ku ke Output ko Truncate kr rha 
        'LGBM': LGBMClassifier(verbose=-1,random_state=SEED) # hn CatBoost aur LGBM ki 
    }

    # Ab Hamara Next Step ho ga Ham ek Loop Run Krien Gye Jo ke Hamarai Dic me se Model aur un ke Names ko Pick Kry ga aur Phr One 
    # by one Un Models ko Train kr Ky Un ke Resultsn ko Store kry ga Then ham un Results ko ek Dataframe me Store Krwaien Gye. 

    # Idhr ab ek List bhi Intilize krien gye Jis me Hamary Scores aur Model ka Name Store Hota Rahe 
    Scores_of_Models = []

    # Lets Run The Loop.
    for name,model in models.items():  # .items() use karna zaroori hai
        M = model

        # Train Model 
        M.fit(X_train,y_train)

        # Predict 
        y_Pred = M.predict(X_test)

        # Idhr me Bs Accuracy is Check krn ga , Ap Apni Problem ke Hisab se Kr Sktay Hain Kuch aur Matrix add
        acc = accuracy_score(y_test,y_Pred)

        Scores_of_Models.append((name,acc))

        # Print The Score 
    #     print(f'Accuracy of The {name} is {acc}')

    # Ab Jo Scores Ayen Hain Ham Un Ko Ek Data Frame me Ly Jayen gye aur Show Krwaien Gye 
    scores_df = pd.DataFrame(Scores_of_Models, columns=['Model', 'Accuracy'])
    return scores_df

Check_All(X_train_iris,X_test_iris,y_train_iris,y_test_iris)

Unnamed: 0,Model,Accuracy
0,RF,1.0
1,DT,1.0
2,XGB,1.0
3,CAT,1.0
4,LGBM,1.0


- **train_test_split ko use krty huway hamary sab model ne 100% acc achieve kii hy , dekhnay me to yeh bht Acha lag rha hy , now I have a Question Kia Yeh Sb models Accurate hain? Kiya inhon ne jo score dia hy woh asal me bhi itna hi hy? Kiye yeh model 100% sahi jawab hi deyn gyen ? Ab In Question Ko Ly ke Chaltay hain aur me explain krn ga ke yeh results sahi nahi , hamien es pa focus krna ho ga aur inhien acha krna ho ga.**

- Ab ager Ek Non-Expereience band Yeh Result Dekhay Ga to Woh to Inhie ko Best Score man ly gye aur in Future us ka Model sahi predictions nahi dey ga.

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">2. Tip</p>

In [9]:
Check_All(X_train_tip,X_test_tip,y_train_tip,y_test_tip)

Unnamed: 0,Model,Accuracy
0,RF,0.591837
1,DT,0.591837
2,XGB,0.632653
3,CAT,0.612245
4,LGBM,0.673469


# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">More Effiecient Trainig</p>

# Cross Validation Split

Model ko train krny ke 2 Tareeqay hain ek to Simple tareeqa hy ke ham direct model ko Train kr deyn... Without any Cross Validation, matlb Model ko Cross validate kiya hi na Jaye woh jo ek br me Results Show kr rha hy usi pa yaqeen kr lia jaye..

ek tareeeqa hota hy ke model ko cross_validate kia Jaye... Cross_validation ek achi approch hoti hy es se hamay accurate and best models miltay hain. 

Ab Main Sawal Yeh hy ke Kia Hamay Hr Bar hi Cross_Validation Krni Chaiye ? Ya Kuch Special case hain Jahan Es ko use krna Bht Zarori Smjha Jata hy..

Chalo Ab Yeh Smjhtay Hain ke Cross_Validation kia hy aur es ka use Ku Zarori hy.

# What is Cross Validation / CV
Cross-validation (CV) ek technique hai jo machine learning model ki performance ko evaluate aur test karne ke liye istemal hoti hai. Ye technique aam tor par applied ML tasks mein istemal hoti hai. Iska maqsad kisi specific predictive modeling problem ke liye behtareen model ko compare aur select karna hota hai.

CV samajhne mein asaan hai, implement karna asaan hai, aur yeh doosri methods se kam bias rakhti hai jo model ki efficiency ko count karne mein istemal hoti hain. Yeh sab qualities cross-validation ko specific tasks ke liye behtareen model select karne ke liye ek taqatwar tool banati hain.

# Kab Cross Validation ko use krna Chahiye 
Ager to Short Awnser chahiye to CV ko tb use krna chahiye jab data kam ho apke pass.

Cross-validation ko tab use karna chahiye jab aap machine learning model ki performance ko sahi tareeqe se evaluate karna chahte hain aur model ko robust banane ki zaroorat hai. Jab aap ek predictive modeling problem ke liye behtareen model select karna chahte hain, to cross-validation ek zaroori tool hai.

Is technique ka istemal tab kiya jata hai jab aapko model ki accuracy, precision, recall, aur doosre performance metrics ko sahi tareeqe se measure karna hai. Cross-validation models ke bias aur variance ko assess karne mein madadgar hoti hai aur yeh ensure karta hai ke model generalized hai aur overfitting se bacha hua hai.

Aam tor par, jab data limited ho ya model complexity zyada ho, tab cross-validation ka istemal zyada faydemand hota hai. Yeh technique data ke sahi tareeqe se use aur model ki sahi parameter tuning mein bhi madad deti hai.

liken ap big datasets pa bhi good results ke liye esy use kr skty hain.

- Ager ap Interest rkhtay hain es ko detail me smjhanay kii to es link pa jayen aur blog read krien ::
    - **Cross Validation Blog Link :: https://neptune.ai/blog/cross-validation-in-machine-learning-how-to-do-it-right**
    
Kuch Smjh me na aye , So ap Dm kr ky Poch Sktay hain , Me Explain kr dn ga

In [10]:
# Es se Bhi Phly Yeh Smjhna Zarori hy ke Data ko Split Kesy Kia Jaye , So meny Ek Achi Website Dhoondhi hy Jis ka Link me Attach kar
# Rha hn Idhr Ap ne Es Website pa Jaa ke Cross Validation ke Tareeqay Dekh Lenay Hain . Beacuse Idhr me Explain krn ga to 
# Session Lamba ho Jaye ga , Ager Apko Smjh Nahi aye gyii , So Ap Dm me Question krien , me Ap ko Complete Guide kr dn ga.

# i am Using StratifiedKFold in My Session , StratifiedKFold is Much Better Than Train Test Split , You Can Use any other According to you Problem,
# Link me Complete Guide se Batya Huwa hy ke Kab Kn sa Cross Validation Method Use krna hy.

### StratifiedKFold Istemaal Karna

"""So StratifiedKFold hy Kia ? 

Stratified k-Fold ek variation hai standard k-Fold CV technique ka jo target imbalance cases me effective hota hai. 

Iska kaam kuch is tarah hota hai: Stratified k-Fold dataset ko k folds me is tarah se split karta hai ke har fold me
approximately same percentage of samples hoti hain har target class ki, jaise ke complete set me hoti hain. Regression
ke case me, Stratified k-Fold yeh ensure karta hai ke har fold me mean target value approximately equal ho.

Stratified k-Fold algorithm kuch is tarah hai:

1. Folds ki tadaad ka intekhab karein – k
2. Dataset ko k folds me split karein. Har fold me approximately same percentage of samples hone chahiyein har target
class ki jo ke complete set me hoti hain.
3. k – 1 folds ko training set ke liye use karein. Baqi ek fold test set hoga.
4. Training set pe model ko train karein. Har iteration me ek naya model train hona chahiye.
5. Test set pe validate karein.
6. Validation ka result save karein.
7. Steps 3 – 6 ko k martaba repeat karein. Har dafa baqi fold ko test set ke taur pe use karein. Aakhir me aapne har
fold pe model validate kar liya hoga.
8. Final score hasil karne ke liye step 6 ke results ko average karein."""

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=N_SPLIT) # Idhr n_Splits ki Value Ham khud Define kr Sktay hain Apni Mrzi se 

for train_index, test_index in skf.split(X_iris, y_iris):
    X_train_iris, X_test_iris = X_iris.iloc[train_index], X_iris.iloc[test_index]
    y_train_iris, y_test_iris = y_iris.iloc[train_index], y_iris.iloc[test_index]
    
for train_index, test_index in skf.split(X_tip, y_tip):
    X_train_tip, X_test_tip = X_tip.iloc[train_index], X_tip.iloc[test_index]
    y_train_tip, y_test_tip = y_tip.iloc[train_index], y_tip.iloc[test_index]

# ### Smjhana :

# 1. StratifiedKFold Object Banane:
#    `StratifiedKFold` ek tarah ka KFold hai jo data ko split karta hai, lekin yeh ensure karta hai ke har fold mein har class ke samples ki proportional distribution ho.

# 2. Data Split Karna:
#    skf.split(X, y)` function `X` (features) aur `y` (labels) ko lekar data ko split karta hai.
#    Har iteration mein, `train_index` aur `test_index` milte hain jo training aur test sets ke indexes hote hain.

# 3. Training aur Test Data Banane:
#    X_train` aur `X_test` features ke liye training aur test data hain jo `train_index` aur `test_index` ke basis par select kiya jata hai.
#    y_train` aur `y_test` labels ke liye training aur test data hain jo unhi indexes ke basis par select kiya jata hai.

# Yeh approach istemaal kar ke, aap data ko stratified manner mein split kar sakte hain, jisse har fold mein har class ke samples ki proportional distribution maintain hoti hai.

# How to Get The Best Model
To find Best Model , Make a Dic Containing all The Models You want to evaluate and the run and loop and train all the model one by one , predict the results , evluate the scores and make a dataframe.
Now You have all the models with their scores. now you can easily Select the Best Model.

Maan Lo ke hamaray pass dataset bht large hy, ab es case me kia krna chahiye , es case me best and optimal solution hy ke ap data ka sample lyn aur us ko achay se split krien random_seed ke sth , phr us data set pa same upper wala method apply krien aur best model select kr lyn.

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">1st Way If Data is Small</p>

## Iris 

In [11]:
# Acha to Ab hamaray models import ho gye hain ab ham sb se phly , in models me se best model ko select krien gyen
# Jo Model sb se acha ho ga us pa ham further Parameter tunning krien gye.

# Best Model ko Kesy Find Kia Jaa Skta hy , Ager Data Set Bht Large ho to tb kesy Best Model ko Nikalna hy , Chalien Sb Se Phly Dekhtay
# Hain How to Get Best Model

# Sb se Phly Ham ek Dic Bana Lyn Gye Models ki With Their names.
models = {
    'RF' : RandomForestClassifier(random_state=SEED),
    'DT' : DecisionTreeClassifier(random_state=SEED),
    'XGB': XGBClassifier(random_state=SEED),
    'CAT': CatBoostClassifier(verbose=0,random_state=SEED), # Idhr me Verbose 0 or -1 is liye pass kr rha hn ku ke Output ko Truncate kr rha 
    'LGBM': LGBMClassifier(verbose=-1,random_state=SEED) # hn CatBoost aur LGBM ki 
}

# Ab Hamara Next Step ho ga Ham ek Loop Run Krien Gye Jo ke Hamarai Dic me se Model aur un ke Names ko Pick Kry ga aur Phr One 
# by one Un Models ko Train kr Ky Un ke Resultsn ko Store kry ga Then ham un Results ko ek Dataframe me Store Krwaien Gye. 

# Idhr ab ek List bhi Intilize krien gye Jis me Hamary Scores aur Model ka Name Store Hota Rahe 
Scores_of_Models = []

# Lets Run The Loop.
for name,model in models.items():  # .items() use karna zaroori hai
    M = model
    
    # Train Model 
    M.fit(X_train_iris,y_train_iris)
    
    # Predict 
    y_Pred = M.predict(X_test_iris)
    
    # Idhr me Bs Accuracy is Check krn ga , Ap Apni Problem ke Hisab se Kr Sktay Hain Kuch aur Matrix add
    acc = accuracy_score(y_test_iris,y_Pred)
    
    Scores_of_Models.append((name,acc))
    
    # Print The Score 
#     print(f'Accuracy of The {name} is {acc}')
    
# Ab Jo Scores Ayen Hain Ham Un Ko Ek Data Frame me Ly Jayen gye aur Show Krwaien Gye 
scores_df = pd.DataFrame(Scores_of_Models, columns=['Model', 'Accuracy'])
scores_df

Unnamed: 0,Model,Accuracy
0,RF,0.933333
1,DT,0.866667
2,XGB,1.0
3,CAT,1.0
4,LGBM,0.733333


- **Ab,Farak dekhien , Cross Validation ko Use krty Huway Model ko Evaluate krny pa Ap dekh sektay hain Results me kitna change aa gya hy. Jo Model Phly 100% acc de rhy thy ab woh Kitni Kam accuracy de rhy Hain.**

- **Jab Apke Pass Dataset Small ho to usko Cross-Validate Lazmi krien.**

## Tips

In [12]:
# Acha to Ab hamaray models import ho gye hain ab ham sb se phly , in models me se best model ko select krien gyen
# Jo Model sb se acha ho ga us pa ham further Parameter tunning krien gye.

# Best Model ko Kesy Find Kia Jaa Skta hy , Ager Data Set Bht Large ho to tb kesy Best Model ko Nikalna hy , Chalien Sb Se Phly Dekhtay
# Hain How to Get Best Model

# Sb se Phly Ham ek Dic Bana Lyn Gye Models ki With Their names.
models = {
    'RF' : RandomForestClassifier(random_state=SEED),
    'DT' : DecisionTreeClassifier(random_state=SEED),
    'XGB': XGBClassifier(random_state=SEED),
    'CAT': CatBoostClassifier(verbose=0,random_state=SEED), # Idhr me Verbose 0 or -1 is liye pass kr rha hn ku ke Output ko Truncate kr rha 
    'LGBM': LGBMClassifier(verbose=-1,random_state=SEED) # hn CatBoost aur LGBM ki 
}

# Ab Hamara Next Step ho ga Ham ek Loop Run Krien Gye Jo ke Hamarai Dic me se Model aur un ke Names ko Pick Kry ga aur Phr One 
# by one Un Models ko Train kr Ky Un ke Resultsn ko Store kry ga Then ham un Results ko ek Dataframe me Store Krwaien Gye. 

# Idhr ab ek List bhi Intilize krien gye Jis me Hamary Scores aur Model ka Name Store Hota Rahe 
Scores_of_Models = []

# Lets Run The Loop.
for name,model in models.items():  # .items() use karna zaroori hai
    M = model
    
    # Train Model 
    M.fit(X_train_tip,y_train_tip)
    
    # Predict 
    y_Pred = M.predict(X_test_tip)
    
    # Idhr me Bs Accuracy is Check krn ga , Ap Apni Problem ke Hisab se Kr Sktay Hain Kuch aur Matrix add
    acc = accuracy_score(y_test_tip,y_Pred)
    
    Scores_of_Models.append((name,acc))
    
    # Print The Score 
#     print(f'Accuracy of The {name} is {acc}')
    
# Ab Jo Scores Ayen Hain Ham Un Ko Ek Data Frame me Ly Jayen gye aur Show Krwaien Gye 
scores_df = pd.DataFrame(Scores_of_Models, columns=['Model', 'Accuracy'])
scores_df

Unnamed: 0,Model,Accuracy
0,RF,0.541667
1,DT,0.458333
2,XGB,0.583333
3,CAT,0.583333
4,LGBM,0.666667


# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">2nd Way If Data is Large</p>

In [13]:
"""Es ke Liye sab se phly ham ek data load krien gye Jo ke diamonds ka data hy , woh large Dataset hy , Jo ke Zayda 
# Models pa Kaffi Zayda Time Ly Jaye ga So Us ko Handle Krien gye Ham Idhr aur Us Same code pa dekhien gye ke kesy
ham problem handle krty hain"""
diamond = sns.load_dataset('diamonds')
print(diamond.head())
print("------------------------------------")
"""Ab Es data ki Shape bhi dekh letay hain ek br"""
print(f'The Shape Of Data is {diamond.shape}')
print(f'The No Of Rows in Our Data is {diamond.shape[0]}')

"""Jesa ka ab hamara data kaafi large hy , ab ager ham es pa wohi code run krien gye to , yeh Time bht Zayda Consume kry ga,
so ab es Situation me kia krna chahiye, Ab Es situation me ham data ka sample ly ln gye randomseed ke sth takay data 
sahi se shuffle ho ke aye aur modeling effect na ho

Us se bhi Phly Hamy data ko preprocess krna ho ga , takay modeling me koi error na aye.

How to Preprocess Data for Machine Learning. es me Kuch setps hain Jo Ham one by One Idhr Dekhien gye.

A. Sab se Phly Ham Data me Check krien Gye Ke Koi Null Value to Nahi Naw?"""
print('--------------------------------------------------')
print(f"Null Values in Data is \n {diamond.isnull().sum()}")
print('--------------------------------------------------')

"""So hamara data null Values se to bacha huwa hy mean ek step clear hy , chalien Sochien ager yahan pa null values 
hoti to ham unhien kesy handle krty ? 

Null Values ko Handle krny ke liye bht se different methods hain un me se kuch Advance hain Kuch Beginners hain

Null Values ko Bht dehan se handle krna parta hy , yeh Machine learning pa bht effect krti hain. Ager Ham Null Values ko Sahi se 
Handle nahi krty to hamara model glt predictions bhi de skta hy ya model achi performence tk nahi ponchta.

Chalien Sb se Phly Kuch Beginners Level Ke Method ko Discuss krty hain

1. Mean
2. Median
3. Mode

Yeh 3 Method hain Jo Basically Beginner level pa use hotay, Like Yen Methods Sahi nahi ager data me ek large scale pa 
data missing hai
Large scale pa jab in methods ko use kia jaye to yeh data ko bht bura affect krty hain aur may be model ki performence
bht krb ho jaye

Ab Kuch Advance Methods ki Baat Kr Letay hain..

1. Iterative Imputer for Numerical Values :: Iterative Imputer ek behtareen method hai numerical null values ko handle
karne ka. Agar data mein large scale par null values hain, to aap is method se impute kar sakte hain. Is method mein, 
missing values ko multiple regression models ke through predict karke fill kiya jata hai. Har feature ko doosre features 
ke basis par estimate kiya jata hai. 

    Algorithm ka overview Iterative Imputer ek iterative process use karta hai. Pehle, missing values ko kuch initial
    guesses se fill kiya jata hai (jaise mean se). Phir, har feature ko ek regression model banaya jata hai jisme baaki
    features predictors ke tor par use hote hain. Ye process tab tak repeat hota hai jab tak imputations stabilize nahi
    ho jati.

    Advantage Ye method missing values ko zyada accurately fill karta hai kyunki har feature ko baaki features ke context
    mein consider kiya jata hai. Ye method zyada tar situations mein accurate predictions provide karta hai aur model 
    performance ko improve karta hai.

    Experience Mere experience ke mutabiq, ye best method hai agar aapke data mein large scale par numerical null values
    hain. Ye method data ke pattern ko achi tarah capture karta hai aur realistic imputations provide karta hai.

Note: Aap yeh method use karne se pehle ensure karen ke aapke paas sufficient computational resources hain, kyunki yeh
method computationally intensive ho sakta hai, especially large datasets ke liye

Also You can use other Mehtod instead of iteravtive imputer , like rf , other tree methods , Woh bhi Null Values ko achay
se predict kr ky impute krty hain


2. Agar data mein ek large scale par Categorical Null Values hain :: to sab se best option hai ke aap usko domain
knowledge ki base par impute karein. Agar aapke paas domain knowledge nahi hai, to phir aapko kuch advanced methods 
ki taraf jana chahiye, jaise model predictions.

    Domain Knowledge: Pehle step mein, domain knowledge ko use karte hue missing values ko fill karna ideal hota hai. 
    Example ke tor par, agar aapke pass customer data hai aur gender missing hai, to aap industry trends ya specific 
    business context ke basis par guess kar sakte hain.

    Model Predictions**: Agar domain knowledge available nahi hai, to aap advanced methods use kar sakte hain jaise 
    tree models (decision trees, random forests) ya gradient boosting models. Ye models categorical values ko accurately
    predict kar sakte hain. Iska ek complete code bhi mein aapke saath share kar dunga. However, ye method time-consuming hai agar data large ho. Lekin agar aapke paas achi computational resources hain, to aap is method ko try kar sakte hain.

    Constants se Imputation: Ek aur best way imputation ka yeh hai ke aap constants use karein jaise "No", "None", 
    "No record", etc. Ye method maine kaafi dafa use kiya hai aur isse model ki performance bhi achi hui hai. 
    Ye method simple hai aur computationally efficient bhi.

    Advice : Hamesha ek hi flow par rely nahi karna chahiye. Aapko mukhtalif methods se testing karni chahiye
    aur jo best lage usi ko aage leke jana chahiye. Ek best model hasil karne ke liye har technique ko try out
    karna zaroori hai.

Yeh kuch tareeqay hain jo ke null values ko impute karne ke liye zaroori hain.

---------------------------------------------------------------------------------------------------------------------------
B. Outliers

Null values ke baad hum data mein outliers ko dekhenge. Outliers data ko bohot bura effect karte hain aur saath hi saath
models ko bhi effect karte hain. Outliers kabhi kabhi faida bhi dete hain aur nuqsan bhi.

Outliers ko handle karna zaroori hai aur kuch models itne robust hote hain jo ke outliers ko khud hi deal kar lete hain.
In models ke names hain:

Decision Trees :: Decision trees naturally segment data based on feature values aur ye outliers se kam effect hote hain.
Random Forests :: Random forests bhi decision trees ka ensemble hote hain, isliye ye bhi outliers ke liye robust hote hain.
Gradient Boosting Machines (GBM) :: GBMs bhi outliers ko kuch had tak handle kar sakte hain apni iterative nature aur
misclassified examples par focus karne ki wajah se.
Support Vector Machines (SVM) :: SVMs bhi outliers ke liye robust ho sakte hain agar appropriate kernel aur parameters 
choose kiye jayen.
Robust Regression Models :: Models jaise Huber Regression aur RANSAC (Random Sample Consensus) specifically outliers ke 
liye robust design kiye gaye hain.

Outliers ko handle karne ke liye aap different techniques use kar sakte hain, jaise:

1. Trimming/Removing Outliers: Outliers ko data se nikal dena.
2. Transforming Data: Log transformation ya other transformations use karna.
3. Capping: Outliers ko upper ya lower bounds par cap kar dena. Capping me IQR , Z_Score Jesy Method use Hotay hain
4. Imputation: Outliers ko mean, median, ya mode se replace karna.

Kuch specific use cases mein outliers ko maintain karna bhi beneficial ho sakta hai, especially agar wo data ke specific
characteristics ko represent karte hain.

Baqi Ager Kisi ne es ko Detail ne krna ho to, You can check this Video :: 1. https://youtu.be/Lln1PKgGr_M?si=DGi1huwW1q0njFYj
2. https://youtu.be/OnPE-Z8jtqM?si=eFMKekI9dIFdRaNf
3. https://youtu.be/Ccv1-W5ilak?si=k6WNgh_dlhgGhywb
4. https://youtu.be/bcXA4CqRXvM?si=ifJJqYBmnNiTPnuG

Yeh 4 Videos Dekh Sktay Hain Ager Mazeed Information Leni hy to.

-----------------------------------------------------------------------------------------------------------------------------
C. Scaling

Scaling machine learning preprocessing ka important part hai. Lekin kuch models hain jo ke scaling se 
robust hain aur un par scaling ka baghair bhi kuch zyada effect nahi hota. Yeh models hain:

Decision Trees: Decision trees scaling se mutasir nahi hote.
Random Forests: Random forests bhi scaling se robust hain.
Gradient Boosting Machines (GBM): GBMs bhi scaling se itna effect nahi hote.
Naive Bayes: Naive Bayes scaling ka muhtaaj nahi hota.
K-Nearest Neighbors (KNN): KNN scaling se directly mutasir ho sakta hai, lekin distance metric ko modify kar ke isko 
robust bana sakte hain.
Support Vector Machines (SVM) with RBF or Polynomial Kernels: SVMs with certain kernels scaling se kam effect hoti hain.
Tree-based Ensemble Methods: Bagging aur boosting methods jaise AdaBoost scaling se robust hain.

Scaling ke Different Methods hain 

Min-Max Scaling: Is method mein data ko ek specific range (usually 0 se 1) mein scale kiya jata hai. Yeh method outliers
ke liye sensitive hota hai.

Standard Scaling: Is method mein data ko aise scale kiya jata hai ke uska mean 0 aur standard deviation 1 ho jaye. 
Yeh method zyada tar cases mein use hota hai.

Robust Scaling: Is method mein data ko aise scale kiya jata hai ke uske median aur interquartile range (IQR) ke basis 
par scaling ho. Yeh method outliers ke liye zyada robust hota hai.

Kn se Method kab Chose krna hy Es ki Koi ek Specific se Limit nahi ap as an Experiment bhi esy kr skty ho.
Baqi For More Deep Knwodlge Me Kuch Links Share kr rha hn Inhien Dekh lena 
1. https://youtu.be/eBrGyuA2MIg?si=6XxDh8HU3Y55zlbK
2. https://youtu.be/1Yw9sC0PNwY?si=TIA92CiBgGnO2y0I

Note :: Wesy Jahan hamy ek value ki Limit Pata ho wahan ham MinMax ko use krty hain aur Jahan Value ki Range ka Andaza na ho to Ham
Standarad Scaler ko Use krty Hain

Chalo ab Data ko Scale Krty hain. So Meny Ek Code Likh ke Rkha huwa hy Jis ne Scaling ki hr Command hy Bs ap ne Apni Choice 
Provide krni hy aur data ne scale ho jana hy. 
---------------------------------------------------------------------------------------------------------------------------
"""
# Import The Scalers 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer

# Function to Scale Data
def apply_scaling(data, columns, scaler_type):
    # Check the type of scaler and initialize the appropriate scaler object
    if scaler_type == 'S':
        scaler = StandardScaler()  # Initialize StandardScaler
    elif scaler_type == 'M':
        scaler = MinMaxScaler()  # Initialize MinMaxScaler
    elif scaler_type == 'R':
        scaler = RobustScaler()  # Initialize RobustScaler
    elif scaler_type == 'A':
        scaler = MaxAbsScaler()  # Initialize MaxAbsScaler
    elif scaler_type == 'Q':
        scaler = QuantileTransformer(output_distribution='normal')  # Initialize QuantileTransformer
    elif scaler_type == 'P':
        scaler = PowerTransformer()  # Initialize PowerTransformer
    else:
        raise ValueError("Invalid scaler type. Choose 'S' for StandardScaler, 'M' for MinMaxScaler, 'R' for RobustScaler, 'A' for MaxAbsScaler,'Q' for QuantileTransformer, or 'P' for PowerTransformer.")

    # Create a copy of the input data to avoid modifying the original data
    scaled_data = data.copy()

    # Loop through each column to be scaled
    for col in columns:
        # Apply the scaler to the current column and update the data with the scaled values
        scaled_data[col] = scaler.fit_transform(scaled_data[[col]])

    # Return the scaled data
    return scaled_data

# Specify columns and scaler type
columns_to_scale = [col for col in diamond.columns if diamond[col].dtype == 'float']
scaler_type = 'S' 

# Apply scaling data
diamond = apply_scaling(diamond, columns_to_scale, scaler_type)
print('Training data scaled successfully.')
print('--------------------------------------------------')


"""
---------------------------------------------------------------------------------------------------------------------------
D. Encoding
Data ko encode karna bhi bohot important step hai kyunki computers sirf numeric form mein data samajhte hain. Data ko
encode karne ke different methods hain:

One-Hot Encoding (OHE): Yeh method har category ka ek alag column bana deta hai. Yeh tab useful hota hai jab categories 
mein kisi tarah ka order nahi hota.

Label Encoding: Yeh method categories ko unique numeric values assign karta hai. Yeh tab use hota hai jab categories mein
kisi tarah ka natural order hota hai.

Ordinal Encoding: Yeh method bhi categories ko numeric values assign karta hai, lekin aap khud se order decide kar sakte
hain. Yeh tab useful hota hai jab categories ke beech mein ek specific order hota hai jo aap define karna chahte hain.

meny es ka bhi function bana rkha hy , jo ke Apke bht Kam aa skta hy. 
"""
# Function to Encode Data
def encode(data, columns, encoding_type='label'):
    try:
        # Initialize encoders
        label_encoder = LabelEncoder()
        one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
        
        # Idhr ham Data ki Copy bana Lyn Gye
        encoded_data = data.copy()
        
        for col in columns:
            if encoding_type == 'label':
                # Apply Label Encoding
                encoded_data[col] = label_encoder.fit_transform(data[col])
            elif encoding_type == 'onehot':
                # Idhr Ham Ohe Ko Apply krien gye Sth Sth New Transform Cols aur Dusray Cols ko Combine krien gye
                one_hot_encoded = one_hot_encoder.fit_transform(data[[col]])
                one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=[f"{col}_{category}" for category in label_encoder.fit(data[col]).classes_[1:]])
                encoded_data = pd.concat([encoded_data.drop(col, axis=1), one_hot_encoded_df], axis=1)
            else:
                raise ValueError("Invalid encoding type. Choose 'label' for Label Encoding or 'onehot' for One-Hot Encoding.")
        
        return encoded_data

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Call 
columns_to_encode = ['color','clarity','cut']
encoding_type = 'label'

# Apply encoding to the data
diamond = encode(diamond, columns_to_encode, encoding_type)
print('Data Encoded')
print('--------------------------------------------------')


#--------------------------------------------------------------------------------------------------------------------------
"""
Ap Hamara Data Machine Learning ke Liye Bilkul Ready hy , ap ham Apnay Models ko apply kr Sktay Hain but Main Question , Data 
bht Bhara hy , Bht time ly gyii Training to Kia Kiya Jaye , Ab Ham Data ka Random seed ke Sth Sample Lyn Gye 

"""
# Ap Kitnay Sample lena Chahtay hain Us ke Mutaqib Idhr Number add Kr Lyn, Jesy me Ihdr bs 10,000 Sample le rha hn
n_samples = 2000
# Ab Data ko Sample kr Lyn gye
diamond_sampled = diamond.sample(n=n_samples,random_state=SEED)
print(f"The Shape Of Sampled Df is {diamond_sampled.shape}")
print('--------------------------------------------------')


"""
Ab ham apnay data ko again X and y me divide krien gye Then Us ko Train and Test me Cross validate Split krien gye 
and phr Model Build krien gye
"""
X = diamond_sampled.drop('cut',axis=1)
y = diamond_sampled['cut']

"""Es se Bhi Phly Yeh Smjhna Zarori hy ke Data ko Split Kesy Kia Jaye , So meny Ek Achi Website Dhoondhi hy Jis ka Link 
me Attach karRha hn Idhr Ap ne Es Website pa Jaa ke Cross Validation ke Tareeqay Dekh Lenay Hain . Beacuse Idhr 
me Explain krn ga to Session Lamba ho Jaye ga , Ager Apko Smjh Nahi aye gyii , So Ap Dm me Question krien , me Ap ko 
Complete Guide kr dn ga.

# i am Using StratifiedKFold in My Session , StratifiedKFold is Much Better Than Train Test Split ,
You Can Use any other According to you Problem,Here Our Problem is Multiclass that Why I use This
# Link me Complete Guide se Batya Huwa hy ke Kab Kn sa Cross Validation Method Use krna hy.

### StratifiedKFold Istemaal Karna

"""

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=N_SPLIT) # Idhr n_Splits ki Value Ham khud Define kr Sktay hain Apni Mrzi se 

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

"""Smjhana :

1. StratifiedKFold Object Banane:
    `StratifiedKFold` ek tarah ka KFold hai jo data ko split karta hai, lekin yeh ensure karta hai ke har 
    fold mein har class ke samples ki proportional distribution ho.

2. Data Split Karna:
    skf.split(X, y)` function `X` (features) aur `y` (labels) ko lekar data ko split karta hai.
    Har iteration mein, `train_index` aur `test_index` milte hain jo training aur test sets ke indexes hote hain.

3. Training aur Test Data Banane:
    X_train` aur `X_test` features ke liye training aur test data hain jo `train_index` aur `test_index` ke basis par 
    select kiya jata hai.
    y_train` aur `y_test` labels ke liye training aur test data hain jo unhi indexes ke basis par select kiya jata hai.

# Yeh approach istemaal kar ke, aap data ko stratified manner mein split kar sakte hain, jisse har fold mein har class ke
samples ki proportional distribution maintain hoti hai.

_________________________________________________________________________________________________________________________

Acha to Ab hamaray models import ho gye hain ab ham sb se phly , in models me se best model ko select krien gyen
Jo Model sb se acha ho ga us pa ham further Parameter tunning krien gye.

Best Model ko Kesy Find Kia Jaa Skta hy , Ager Data Set Bht Large ho to tb kesy Best Model ko Nikalna hy , Chalien Sb Se Phly Dekhtay
Hain How to Get Best Model
"""

# Sb se Phly Ham ek Dic Bana Lyn Gye Models ki With Their names.
models = {
    'RF' : RandomForestClassifier(random_state=SEED),
    'DT' : DecisionTreeClassifier(random_state=SEED),
    'XGB': XGBClassifier(random_state=SEED),
    'CAT': CatBoostClassifier(verbose=0,random_state=SEED), # Idhr me Verbose 0 or -1 is liye pass kr rha hn ku ke Output ko Truncate kr rha 
    'LGBM': LGBMClassifier(verbose=-1,random_state=SEED) # hn CatBoost aur LGBM ki 
}

# Ab Hamara Next Step ho ga Ham ek Loop Run Krien Gye Jo ke Hamarai Dic me se Model aur un ke Names ko Pick Kry ga aur Phr One 
# by one Un Models ko Train kr Ky Un ke Resultsn ko Store kry ga Then ham un Results ko ek Dataframe me Store Krwaien Gye. 

# Idhr ab ek List bhi Intilize krien gye Jis me Hamary Scores aur Model ka Name Store Hota Rahe 
Scores_of_Models = []

# Lets Run The Loop.
for name,model in models.items():  # .items() use karna zaroori hai
    M = model
    
    # Train Model 
    M.fit(X_train,y_train)
    
    # Predict 
    y_Pred = M.predict(X_test)
    
    # Idhr me Bs Accuracy is Check krn ga , Ap Apni Problem ke Hisab se Kr Sktay Hain Kuch aur Matrix add
    acc = accuracy_score(y_test,y_Pred)
    
    Scores_of_Models.append((name,acc))
    
    # Print The Score 
#     print(f'Accuracy of The {name} is {acc}')
    
# Ab Jo Scores Ayen Hain Ham Un Ko Ek Data Frame me Ly Jayen gye aur Show Krwaien Gye 
scores_df = pd.DataFrame(Scores_of_Models, columns=['Model', 'Accuracy'])
scores_df

   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
------------------------------------
The Shape Of Data is (53940, 10)
The No Of Rows in Our Data is 53940
--------------------------------------------------
Null Values in Data is 
 carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64
--------------------------------------------------
Training data scaled successfully.
--------------------------------------------------
Data Encoded
--------------------------------------------------
The Shape Of Sampled Df is (2000, 10)
----------

Unnamed: 0,Model,Accuracy
0,RF,0.73
1,DT,0.615
2,XGB,0.765
3,CAT,0.74
4,LGBM,0.76


# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">How to Tune Model || Optuna Introduction</p>

Parametron ka sahi tashkeel model ke liye bohat ahmiyat rakhti hain. Agar hum model ke behtareen parametron ko dhoondh lein to hum ek behtareen model hasil kar sakte hain. Ab sawal yeh hai ke hum behtareen parametron ko kaise hasil kar sakte hain? Behtareen parametron ko dhoondhne ko hum hyperparameter tuning kehte hain.

Hyperparameter tuning ek aisi prakriya hai jisme hum model ke parametron ko optimize karne ka tareeqa tajweez karte hain. Is mein Optuna naam ka ek framework bohat zyada mashhoor hai. Optuna machine learning engineers aur data scientists ke darmiyan popular hai kyunki yeh ek automatic hyperparameter optimization framework hai.

Optuna ke zariye, hum model ke liye behtareen hyperparametron ko dhoondhne ke liye algorithmic search ka istemal karte hain. Is mein hum bas kuch initial parameters (jaise ke learning rate, batch size, ya tree depth) provide karte hain, aur Optuna khud baqi kaam karta hai. Yeh har ek parameter ko ek ek karke try karta hai, har experiment ke natijay mein model ki performance ko evaluate karta hai, aur sab se behtareen hyperparametron ko suggest karta hai jo model ke liye optimize hotay hain.

Optuna ki flexibility aur automation ki wajah se, yeh aam tor par machine learning projects mein istemal kiya jata hai, jahan kehli wajah se is ke zariye complex models ko bhi efficiently optimize kiya ja sakta hai.

Ager Ap Optune ko Mazeed study krna chahien to ap optune ki Documentation check kr skty hain link me attach kr dn ga.
- Optuna :: https://optuna.org/

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">RF Params Tunning</p>

**`n_trials` Optuna ko batata hai ke kitne alag-alag trials (experiments) run karne hain. Har trial mein, Optuna alag-alag parameter combinations ko evaluate karega aur best parameters ko dhundega.**

**`trial` ek individual trial ko represent karta hai jo objective function ke andar pass hota hai. trial object ke through, aap parameters ko suggest (recommend) kar sakte hain jo us specific trial mein use hone hain. Yeh trial object parameters ko sample karta hai specified ranges aur distributions ke andar.**

**Iska matlab hai ke n_trials=20 ka matlab hai ke Optuna 20 trials run karega, aur har trial mein ek trial object pass kiya jayega jo naye parameters suggest karega.**

In [None]:
# Kuch Model ki Performence bht Achy hy Kuch ki Bs Normal hy , Zayda achi nahi ab ham esy krien gye RF ko Lyn gye aur Optuna se 
# Params Tunned krien Gye aur Check Krien Gye ke Kia Koi Improvement aa rhi hy Score me ? 

# Import kr lyn Optuna ko 
import optuna

# Ap Optuna ka Code Likhtay Hain aur Usy Use Krty Hain , Ham bs 20 Trails pa Results ko Check Krien Gye aur Dekhien Gye 
# Params ko Optimized Krny ke Baad Kia Score ata hy 

def objective(trial):
    
    """Objective function ke andar, sabse pehle parameters define kiye jate hain jo optimize hone hain.
    Yeh parameters trial object ke methods se suggest kiye jate hain, jo ki search space ko define karte hain."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),  # Number of trees in the forest.
        'max_depth': trial.suggest_int('max_depth', 2, 50),  # Maximum depth of each tree.
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),  # Minimum samples required to split an internal node.
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),  # Minimum samples required to be a leaf node.
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),  # Number of features to consider when looking for the best split.
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),  # Whether bootstrap samples are used when building trees.
        'random_state': SEED,
    }
    
    val_scores = []
    
    """Ek StratifiedKFold object create kiya jata hai jisse data ko n splits mein divide kiya jata hai aur ensure
    kiya jata hai ki har split mein class distribution consistent ho."""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    """For loop ke through, har split par training aur validation data define kiya jata hai. Phir model train kiya jata 
    hai aur validation set par predict kiya jata hai. Har trial ke validation accuracy ko store kiya jata hai."""
    for train_index, test_index in skf.split(X_tip, y_tip):
        X_train_tip, X_test_tip = X_tip.iloc[train_index], X_tip.iloc[test_index]
        y_train_tip, y_test_tip = y_tip.iloc[train_index], y_tip.iloc[test_index]

        model = RandomForestClassifier(**params)
        model.fit(X_train_tip, y_train_tip)

        y_test_pred = model.predict(X_test_tip)

        val_acc = accuracy_score(y_test_tip, y_test_pred)
        val_scores.append(val_acc)
        
    """Har split ke baad, mean validation accuracy calculate ki jati hai aur return ki jati hai. 
    Optuna is value ko use karta hai parameter set ke performance ko evaluate karne ke liye."""
    mean_val_acc = np.mean(val_scores)
    return mean_val_acc

    """Optimize method ko 20 trials ke liye call kiya jata hai. Har trial mein Optuna alag parameter 
    values try karta hai aur objective function ko call karta hai. Objective function
    return karta hai accuracy score, jo Optuna use karta hai best parameter set ko identify karne ke liye."""
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
print("Best parameters:", best_params)

In [14]:
# After 20 trails , ham ne Yeh Paramsters Hasil Kiye Hain Hain 
# {'n_estimators': 827, 'max_depth': 20, 'min_samples_split': 18, 'min_samples_leaf': 18, 'max_features': 0.615820169162841, 'bootstrap': True}

# Aur in Params pa Hamaray Model ka Score aa rha hy 1 , Jo ke ek Bht Amazing Change aya hy , Its means Optuna is Very 
# Good Option for Params Tunnings.

# Chalien ek br Phr se apna Upper Wala Code Run KRty Hain , Aur Dekhaty Hain Kia Improvement ati hy 


# Idhr ham Tunned Params ko Use kRien Gye 
T_Params = {'n_estimators': 363, 'max_depth': 15, 'min_samples_split': 2,
            'min_samples_leaf': 15, 'max_features': 0.9398700877827628, 'bootstrap': False}

# Yeh Params Tips ke 20 Trials ki Improvement ke Hain 
Tips_P =  {'n_estimators': 81, 'max_depth': 31, 'min_samples_split': 16, 
           'min_samples_leaf': 15, 'max_features': 0.16076546012392506, 'bootstrap': True}

# Idhr ham ek Code Define Krien Gye Jo Ke Fold Me Check Kry ga aur end pa Mean Score Return kr dey ga 
def Train_ML(model,Type):
    
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    train_scores = []
    val_scores = []
    
    if Type == 'I':
        
        # # X and Y ko Alg Kr Lyn Ek br 
        X = df.drop('Species',axis=1)
        y = df['Species']

        for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
            X_train, X_val = X.iloc[train_index], X.iloc[test_index]
            y_train, y_val = y.iloc[train_index], y.iloc[test_index]

            # Train the model
            model.fit(X_train, y_train)

            # Make predictions on training data
            y_train_pred = model.predict(X_train)
            train_accuracy = accuracy_score(y_train, y_train_pred)
            train_scores.append(train_accuracy)

            # Make predictions on validation data
            y_val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, y_val_pred)
            val_scores.append(val_accuracy)

            # Print accuracy for this fold
            print(f"Fold {fold}: Train Accuracy = {train_accuracy:.4f}, Validation Accuracy = {val_accuracy:.4f}")

        # Calculate mean accuracy
        mean_train_accuracy = np.mean(train_scores)
        mean_val_accuracy = np.mean(val_scores)

        print(f"\nMean Train Accuracy: {mean_train_accuracy:.4f}")
        print(f"Mean Validation Accuracy: {mean_val_accuracy:.4f}")
        
    if Type == 'T':
        
        # # X and Y ko Alg Kr Lyn Ek br 
        X = tip.drop('sex',axis=1)
        y = tip['sex']
        
        for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
            X_train, X_val = X.iloc[train_index], X.iloc[test_index]
            y_train, y_val = y.iloc[train_index], y.iloc[test_index]

            # Train the model
            model.fit(X_train, y_train)

            # Make predictions on training data
            y_train_pred = model.predict(X_train)
            train_accuracy = accuracy_score(y_train, y_train_pred)
            train_scores.append(train_accuracy)

            # Make predictions on validation data
            y_val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, y_val_pred)
            val_scores.append(val_accuracy)

            # Print accuracy for this fold
            print(f"Fold {fold}: Train Accuracy = {train_accuracy:.4f}, Validation Accuracy = {val_accuracy:.4f}")

        # Calculate mean accuracy
        mean_train_accuracy = np.mean(train_scores)
        mean_val_accuracy = np.mean(val_scores)

        print(f"\nMean Train Accuracy: {mean_train_accuracy:.4f}")
        print(f"Mean Validation Accuracy: {mean_val_accuracy:.4f}")

    return model, train_scores, val_scores

# Ab Ham Apnay Model ko Check krty hain ke Kia , Params Ke Tunned krny se Kuch Faida Huwa ya nahi
model = RandomForestClassifier(random_state=SEED,**T_Params)
model2 = RandomForestClassifier(random_state=SEED,**Tips_P)

print('Model Training Iris After Tunning')
print("------------------------------------------------------")
trained_model_I, train_scores, val_scores = Train_ML(model,'I')
print("------------------------------------------------------")

print('\n')

print('Model Training Tips After Tunning')
print("------------------------------------------------------")
trained_model_T, train_scores, val_scores = Train_ML(model2,"T")

Model Training Iris After Tunning
------------------------------------------------------
Fold 1: Train Accuracy = 1.0000, Validation Accuracy = 0.9333
Fold 2: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 3: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 4: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 5: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 6: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 7: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 8: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 9: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 10: Train Accuracy = 1.0000, Validation Accuracy = 1.0000

Mean Train Accuracy: 1.0000
Mean Validation Accuracy: 0.9933
------------------------------------------------------


Model Training Tips After Tunning
------------------------------------------------------
Fold 1: Train Accuracy = 0.6575, Validation Accuracy = 0.7200
Fold 2: Train Accura

Now As we See , With Out Tunning The Params , We Are Getting the Accuracy of 0.93 and After tunning we get 100%. Which Is Very Good Score , also We Cross validate Our Model 10 Times Its Means Our Accurcy is Very good and Cross-Validate

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">LGBM Params Tunning</p>

- **If you want to Study The LGBM Parameters :: https://lightgbm.readthedocs.io/en/stable/Parameters.html#objective-parameters**

- **Parameter Tunning Guide :: https://lightgbm.readthedocs.io/en/stable/Parameters-Tuning.html**

- **CatBoost Parameters :: https://catboost.ai/en/docs/references/training-parameters/**

- **CatBoost Overfitting Detection :: https://catboost.ai/en/docs/references/training-parameters/overfitting-detection**

In [None]:
# # Ab ham Same Kam LGBM ke Liye Bhi Krien Gye aur Check Krien Gye es ke Score me Kia Improvements ati hain.

# def objective(trial):
    
#     # Parameters to be optimized
#     params = {
#         'objective': 'multiclass',  # Yeh batata hai ke humari problem multiple classes pe based hai.
#         'metric': 'multi_logloss',  # Evaluation metric jo multi-class log loss use karta hai.
#         'verbosity': -1,  # Output ki details ko kam karta hai (silence mode).
#         'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree ko use karta hai.
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),  # L1 regularization parameter, Optuna se suggest hota hai.
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),  # L2 regularization parameter, Optuna se suggest hota hai.
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),  # Decision tree ke leaves ki maximum count.
#         'max_depth': trial.suggest_int('max_depth', 1, 50),  # Tree ki maximum depth.
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),  # Model ki learning rate, Optuna se suggest hota hai.
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),  # Har iteration mein use hone wale features ka fraction.
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),  # Bagging mein use hone wale data ka fraction.
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),  # Bagging ke intervals ki frequency.
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),  # Minimum samples jo ek leaf mein hone chahiye.
#         'n_estimators': trial.suggest_int('n_estimators', 50, 1000),  # Boosting rounds ki total count.
# #         'num_classes' : 3,
# #         'device': 'gpu'
#     # 'gpu_platform_id': 0,
#     # 'gpu_device_id': 0
#     }
    
#     val_scores = []
#     n_splits = 10
#     kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

#     for train_index, test_index in kf.split(X, y):
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#         model = LGBMClassifier(**params, random_state=SEED)
#         model.fit(X_train, y_train)

#         y_test_pred = model.predict(X_test)

#         val_acc = accuracy_score(y_test, y_test_pred)
#         val_scores.append(val_acc)

#     mean_val_acc = np.mean(val_scores)
#     return mean_val_acc

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# best_params = study.best_params
# print("Best parameters:", best_params)

In [17]:
"""# After 20 trails , ham ne Yeh Paramsters Hasil Kiye Hain Hain 
#  {'lambda_l1': 0.0026239141876874686, 'lambda_l2': 0.30759140476770286, 'num_leaves': 163, 'max_depth': 17,
# 'learning_rate': 0.006675339914806045, 'feature_fraction': 0.6166615017267428, 
# 'bagging_fraction': 0.5213145800455292, 'bagging_freq': 4, 'min_child_samples': 35, 'n_estimators': 636}"""

"""# Aur in Params pa Hamaray Model ka Score aa rha hy 1 , Jo ke ek Bht Amazing Change aya hy , Its means Optuna is Very 
# Good Option for Params Tunnings.

# Chalien ek br Phr se apna Upper Wala Code Run Krty Hain , Aur Dekhaty Hain Kia Improvement ati hy """

# Idhr ham Tunned Params ko Use kRien Gye 
T_Params_LGBM =  {'lambda_l1': 0.0026239141876874686, 'lambda_l2': 0.30759140476770286, 'num_leaves': 163,
                  'max_depth': 17, 'learning_rate': 0.006675339914806045, 
             'feature_fraction': 0.6166615017267428, 'bagging_fraction': 0.5213145800455292, 'bagging_freq': 4,
                  'min_child_samples': 35, 'n_estimators': 636,'verbose':-1}

# Ab Ham Apnay Model ko Check krty hain ke Kia , Params Ke Tunned krny se Kuch Faida Huwa ya nahi
print('Model Training Iris After Tunning')
print("------------------------------------------------------")
model = LGBMClassifier(**T_Params_LGBM,random_state=SEED)

avg_accuracy = Train_ML(model,"I")

Model Training Iris After Tunning
------------------------------------------------------
Fold 1: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 2: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 3: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 4: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 5: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 6: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 7: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 8: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 9: Train Accuracy = 1.0000, Validation Accuracy = 1.0000
Fold 10: Train Accuracy = 1.0000, Validation Accuracy = 1.0000

Mean Train Accuracy: 1.0000
Mean Validation Accuracy: 1.0000


Now As we See , With Out Tunning The Params Of LGBM , We Are Getting the Accuracy of 0.73 and After tunning we get 1.0. Which Is Very Good Score , also We Cross validate Our Model 10 Times Its Means Our Accurcy is Very good and Cross-Validate

# <p style="background-color: #00A4CCFF; font-family: Pacifico, cursive; color: #E6E6FA; font-size: 100%; text-align: left; border-radius: 5px; padding: 10px 30px; display: inline-block;">Use Full Functions</p>

In [None]:
# '''             Scaling                         '''
# # Function to Scale Data
# def apply_scaling(data, columns, scaler_type):
#     # Check the type of scaler and initialize the appropriate scaler object
#     if scaler_type == 'S':
#         scaler = StandardScaler()  # Initialize StandardScaler
#     elif scaler_type == 'M':
#         scaler = MinMaxScaler()  # Initialize MinMaxScaler
#     elif scaler_type == 'R':
#         scaler = RobustScaler()  # Initialize RobustScaler
#     elif scaler_type == 'A':
#         scaler = MaxAbsScaler()  # Initialize MaxAbsScaler
#     elif scaler_type == 'Q':
#         scaler = QuantileTransformer(output_distribution='normal')  # Initialize QuantileTransformer
#     elif scaler_type == 'P':
#         scaler = PowerTransformer()  # Initialize PowerTransformer
#     else:
#         raise ValueError("Invalid scaler type. Choose 'S' for StandardScaler, 'M' for MinMaxScaler, 'R' for RobustScaler, 'A' for MaxAbsScaler,'Q' for QuantileTransformer, or 'P' for PowerTransformer.")

#     # Create a copy of the input data to avoid modifying the original data
#     scaled_data = data.copy()

#     # Loop through each column to be scaled
#     for col in columns:
#         # Apply the scaler to the current column and update the data with the scaled values
#         scaled_data[col] = scaler.fit_transform(scaled_data[[col]])

#     # Return the scaled data
#     return scaled_data

# # Specify columns and scaler type
# columns_to_scale = [col for col in diamond.columns if diamond[col].dtype == 'float']
# scaler_type = 'S' 

# # Apply scaling data
# diamond = apply_scaling(diamond, columns_to_scale, scaler_type)


# '''             Encoding                         '''
# # Function to Encode Data
# def encode(data, columns, encoding_type='label'):
#     try:
#         # Initialize encoders
#         label_encoder = LabelEncoder()
#         one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
        
#         # Idhr ham Data ki Copy bana Lyn Gye
#         encoded_data = data.copy()
        
#         for col in columns:
#             if encoding_type == 'label':
#                 # Apply Label Encoding
#                 encoded_data[col] = label_encoder.fit_transform(data[col])
#             elif encoding_type == 'onehot':
#                 # Idhr Ham Ohe Ko Apply krien gye Sth Sth New Transform Cols aur Dusray Cols ko Combine krien gye
#                 one_hot_encoded = one_hot_encoder.fit_transform(data[[col]])
#                 one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=[f"{col}_{category}" for category in label_encoder.fit(data[col]).classes_[1:]])
#                 encoded_data = pd.concat([encoded_data.drop(col, axis=1), one_hot_encoded_df], axis=1)
#             else:
#                 raise ValueError("Invalid encoding type. Choose 'label' for Label Encoding or 'onehot' for One-Hot Encoding.")
        
#         return encoded_data

#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return None

# # Call 
# columns_to_encode = add you Columns 
# encoding_type = Add Encoding Type Here

# # Apply encoding to the data
# diamond = encode(diamond, columns_to_encode, encoding_type)


# '''             Train ML                         '''
# def Train_ML(model,X,y):
    
#     skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
#     train_scores = []
#     val_scores = []

#     for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
#         X_train, X_val = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[test_index]

#         # Train the model
#         model.fit(X_train, y_train)

#         # Make predictions on training data
#         y_train_pred = model.predict(X_train)
#         train_accuracy = accuracy_score(y_train, y_train_pred)
#         train_scores.append(train_accuracy)

#         # Make predictions on validation data
#         y_val_pred = model.predict(X_val)
#         val_accuracy = accuracy_score(y_val, y_val_pred)
#         val_scores.append(val_accuracy)

#         # Print accuracy for this fold
#         print(f"Fold {fold}: Train Accuracy = {train_accuracy:.4f}, Validation Accuracy = {val_accuracy:.4f}")

#     # Calculate mean accuracy
#     mean_train_accuracy = np.mean(train_scores)
#     mean_val_accuracy = np.mean(val_scores)

#     print(f"\nMean Train Accuracy: {mean_train_accuracy:.4f}")
#     print(f"Mean Validation Accuracy: {mean_val_accuracy:.4f}")

#     return model, train_scores, val_scores

# model = Here Add your Model With Params 
# trained_model_I, train_scores, val_scores = Train_ML(model,X,y)


# # Ab ham Same Kam LGBM ke Liye Bhi Krien Gye aur Check Krien Gye es ke Score me Kia Improvements ati hain.

# def objective(trial):
    
#     # Parameters to be optimized
#     params = """Add params According to you Model and Problem"""
    
#     val_scores = []
    
#     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED) # Idhr Ap Koi aur bhi chose kr skty ho Apni Mrzi ka

#     for train_index, test_index in kf.split(X, y):
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#         model = """Model With Params"""
#         model.fit(X_train, y_train)

#         y_test_pred = model.predict(X_test)

#         val_acc = accuracy_score(y_test, y_test_pred)
#         val_scores.append(val_acc)

#     mean_val_acc = np.mean(val_scores)
#     return mean_val_acc

# study = optuna.create_study(direction='maximize') # Ager to Accuracy Like Metrix hain to Maximun 
#                                                     # Ager Loss hy to minimize
# study.optimize(objective, n_trials=100) # Trials ki Values Bhi Apni Choice pa Experiments kr skty hain

# best_params = study.best_params
# print("Best parameters:", best_params)