<a href="https://colab.research.google.com/github/nikhilanirudh01/scikit-learn-pipeline-for-census-income-data/blob/main/mlPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Name: NIKHIL ANIRUDH NAISHADHAM
#ASU Id: 1229454124
#Date: November 14th, 2024

**PART 1**: Baseline ML Pipeline to create an estimator with cross validation.

In [None]:
#Import the required libraries:
from sklearn.datasets import load_svmlight_file as load_SVM
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline as MPL
from sklearn.model_selection import cross_val_score as CVS
from scipy.sparse import csr_matrix as CSR
import numpy as npy

#Load the a9a dataset (svmlight/LIBSVM format). The features are returned
#as a SciPy sparse matrix and the labels as a 1-D array:
(x_train, y_train) = load_SVM('/content/a9a.txt')

#Convert the sparse feature matrix into a dense ndarray in one step.
#StandardScaler with its default mean-centering does not accept sparse input:
xTrain_dense = x_train.toarray()

#Create a standard scaler object:
SS = StandardScaler()

#Create the decision tree object. random_state is fixed so that the
#reported score is reproducible when the notebook is re-run:
DT = DecisionTreeClassifier(random_state = 42)

#Create a pipeline. Because the scaler lives inside the pipeline it is
#re-fitted on the training part of every fold, so no test-fold statistics leak:
na_pipe = MPL(SS, DT)

#5-fold cross validation; yields one accuracy value per fold:
crossVal_Score = CVS(na_pipe, xTrain_dense, y_train, cv = 5)

print("The Average Classification Score is: ", crossVal_Score.mean())

The Average Classification Score is:  0.7933726904535288


**PART 2**: Hyperparameter Tuning with GridSearchCV

In [None]:
#Import the required libraries:
from sklearn.datasets import load_svmlight_file as load_SVM
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline as MPL
from sklearn.model_selection import cross_val_score as CVS
from scipy.sparse import csr_matrix as CSR
from sklearn.model_selection import GridSearchCV as GSCV
import numpy as npy

#Load the a9a dataset (svmlight/LIBSVM format). The features are returned
#as a SciPy sparse matrix and the labels as a 1-D array:
(x_train, y_train) = load_SVM('/content/a9a.txt')

#Convert the sparse feature matrix into a dense ndarray in one step.
#StandardScaler with its default mean-centering does not accept sparse input:
xTrain_dense = x_train.toarray()

#Create a standard scaler object:
SS = StandardScaler()

#Create the decision tree object. random_state is fixed so that the
#selected hyperparameters and best score are reproducible across runs:
DT = DecisionTreeClassifier(random_state = 42)

#Create a pipeline:
na_pipe = MPL(SS, DT)

#Hyperparameter grid. make_pipeline names each step after its lowercased
#class name, hence the 'decisiontreeclassifier__' prefix:
gridParameters = {
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
    'decisiontreeclassifier__max_depth': [5, 10, 15, 20]
}

#Exhaustive grid search: every combination is scored with 5-fold cross validation:
gridSearch = GSCV(na_pipe, gridParameters, cv = 5)
gridSearch.fit(xTrain_dense, y_train)

print("Best Parameter is: ", gridSearch.best_params_)
print("Best Score is: ", gridSearch.best_score_)

Best Parameter is:  {'decisiontreeclassifier__criterion': 'entropy', 'decisiontreeclassifier__max_depth': 10}
Best Score is:  0.8351710536590776


**PART 3**: Train-Test Split with Tuned Model Evaluation

In [None]:
#Import the required libraries:
from sklearn.datasets import load_svmlight_file as load_SVM
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline as MPL
from sklearn.model_selection import cross_val_score as CVS
from sklearn.model_selection import train_test_split as TTSPlit
from sklearn.model_selection import GridSearchCV as GSCV
import numpy as npy

#Load the a9a dataset (svmlight/LIBSVM format). The features are returned
#as a SciPy sparse matrix and the labels as a 1-D array:
(x, y) = load_SVM('/content/a9a.txt')

#Convert the sparse feature matrix into a dense ndarray in one step.
#StandardScaler with its default mean-centering does not accept sparse input:
x = x.toarray()

#Hold out 20% of the samples as a test set (seeded for reproducibility):
(x_train, x_test, y_train, y_test) = TTSPlit(x, y, test_size = 0.2, random_state = 42)

#Create a standard scaler object:
SS = StandardScaler()

#Create the decision tree object. random_state is fixed so that the
#selected model and its test accuracy are reproducible across runs:
DT = DecisionTreeClassifier(random_state = 42)

#Create a pipeline:
na_pipe = MPL(SS, DT)

#Hyperparameter grid. make_pipeline names each step after its lowercased
#class name, hence the 'decisiontreeclassifier__' prefix:
gridParameters = {
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
    'decisiontreeclassifier__max_depth': [5, 10, 15, 20]
}

#Grid search with 5-fold cross validation, run on the training split only:
gridSearch = GSCV(na_pipe, gridParameters, cv = 5)
gridSearch.fit(x_train, y_train)

#Get the best model. With the default refit=True this is the complete
#pipeline (scaler + tree) re-fitted on all of x_train:
bestMod = gridSearch.best_estimator_

#Obtain accuracy on the held-out test set.
#The raw test features are passed in on purpose: bestMod already contains
#its own fitted StandardScaler, so scaling x_test beforehand would
#standardise the data twice and distort the inputs seen by the tree:
tesAcc = bestMod.score(x_test, y_test)

print("Test Accuracy is: ", tesAcc)


Test Accuracy is:  0.7660064486411792
