In [1]:
# EDA libraries 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# sampling libraby
from sklearn.model_selection import train_test_split

# Data Transformation libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# custom Transformations
from sklearn.base import BaseEstimator, TransformerMixin

# Data Pipelines 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Machine learning Models and evaluation Metrices
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Custom Evaluation Metric
def rmse(y, y_hat):
    return np.sqrt(mean_squared_error(y, y_hat))



    sklearn model classes are of two types
    
        Estimators -> ML models  fit, predict, score
        
        Transformers -> fit, transform, fit_transform
        
        these functions can take input from the previous step and give output to the next step

In [2]:
# load data_set
def get_data(value):
    return sns.load_dataset(value)

In [3]:
mpg = get_data("mpg")

In [4]:
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


    features -> cylinders, displacement, horsepower, weight, accelaration , model_year, origin, name
    
    target -> mpg

In [5]:
X = mpg.drop("mpg", axis =1) #  2D array
y = mpg["mpg"]  # 1D array

In [6]:
X.head()  # features 

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [7]:
y.head()  # target / labels 

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

## Linear Regression Model

    HYPOTHESIS FUNCTION:-
    mpg = bias + w1*displacement + w2*horsepower + w3*weight + w4*accelaration + w5*model_year w6*origin 
    + w7*name

name -> company name extract --> ordinal encoder(because there are more company) 

raw_data -> pipeline -> data_moel --> machine learning model 

In [8]:
# sampling 
X_train, X_test, y_train , y_test = train_test_split(X, y,  test_size=0.05, random_state = 145 )
# 10% of the value are picked for testing and rest are used for training purpose 

In [9]:
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac phoenix
68,8,350.0,155.0,4502,13.5,72,usa,buick lesabre custom
32,4,98.0,,2046,19.0,71,usa,ford pinto
195,4,85.0,52.0,2035,22.2,76,usa,chevrolet chevette
216,4,98.0,68.0,2045,18.5,77,japan,honda accord cvcc


In [10]:
y_train.head()

372    27.0
68     13.0
32     25.0
195    29.0
216    31.5
Name: mpg, dtype: float64

     we need to build custom transform which will convert name column
     into company name  
     
     ford torino -> company name extractor(Tranformer) -> ford 
             
## Custom Transformation 
    1. Create a class by inherting BaseEstimator and TransforMixin
    2. create a method fit which will calculate all statistic needed to transform data using training data
    3. create a method transform which will do transform using statistics learning in previous step
    4. create a method called fit_transform which will just call fit method first and than transform method 

In [11]:
class ComapanyNameExtracter(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.copy()
        #  X -> DataFrame -> X["name"] which needs to be replaced 
        X.loc[:,"name"] = X["name"].apply(self.process_string)
        return X
    @staticmethod    # to specify that process_string is a normal function and not a method of class 
    def process_string(value):
        map_d = {'maxda': 'mazda', 
        "toyouta":  "toyota",
        "vokswagen":  "vw",
        "volkswagen": "vw"}
        result = value.lower().strip().split(" ")
        name = result[0]
        if name in map_d.keys():
            name = map_d[name]
        return name
    

In [12]:
trans = ComapanyNameExtracter()
temp_tr = trans.fit_transform(X_train)
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac phoenix
68,8,350.0,155.0,4502,13.5,72,usa,buick lesabre custom
32,4,98.0,,2046,19.0,71,usa,ford pinto
195,4,85.0,52.0,2035,22.2,76,usa,chevrolet chevette
216,4,98.0,68.0,2045,18.5,77,japan,honda accord cvcc


In [13]:
temp_tr.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac
68,8,350.0,155.0,4502,13.5,72,usa,buick
32,4,98.0,,2046,19.0,71,usa,ford
195,4,85.0,52.0,2035,22.2,76,usa,chevrolet
216,4,98.0,68.0,2045,18.5,77,japan,honda


In [14]:
y_train.head()

372    27.0
68     13.0
32     25.0
195    29.0
216    31.5
Name: mpg, dtype: float64

In [15]:
num_features = ["displacement", "horsepower", "weight", "acceleration"]
ord_features = ["cylinders","model_year"]
nominal_features = ["origin"]
pass_through_cols = []
drop_cols = ["model_year"]

In [16]:
numerical_pipeline = Pipeline([("imputer", SimpleImputer()), ("std scaler", StandardScaler())])
ordinal_pipeline = Pipeline([ #("extract company name", ComapanyNameExtracter()), 
                         ("ordinal encode", OrdinalEncoder()),
                         ("std scaling", StandardScaler())])
nominal_pipeline = Pipeline([("one hot encoding", OneHotEncoder())])

In [17]:
pipeline = ColumnTransformer([
    ("numerical pipeline", numerical_pipeline, num_features), #["displacement", "horsepower", "weight", "acceleration"]
    ("ordinal pipeline", ordinal_pipeline, ord_features),     #["name"]
    ("nominal pipeline", nominal_pipeline, nominal_features), #["origin"]
    ("passing columns", "passthrough", pass_through_cols),    #["cylinders"]
    ("drop columns", "drop", drop_cols)                       #["model_year"]
])
output_cols = ["displacement", "horsepower", "weight", "acceleration",
                "cylinders", "model_year",  "europe", "japan", "usa", 
               ]

In [18]:
X_train.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac phoenix
68,8,350.0,155.0,4502,13.5,72,usa,buick lesabre custom


In [19]:
X_train_tr = pipeline.fit_transform(X_train)  # this will give 2d array 
X_train_tr = pd.DataFrame(X_train_tr, columns = output_cols)
X_train_tr.head(2)

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders,model_year,europe,japan,usa
0,-0.391983,-0.371907,-0.258306,0.890229,-0.871829,1.619269,0.0,0.0,1.0
1,1.493836,1.303962,1.824603,-0.732619,1.375112,-1.067665,0.0,0.0,1.0


In [20]:
X_test.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
333,6,168.0,132.0,2910,11.4,80,japan,datsun 280-zx
99,6,232.0,100.0,2945,16.0,73,usa,amc hornet
258,6,231.0,105.0,3380,15.8,78,usa,buick century special
122,4,121.0,110.0,2660,14.0,73,europe,saab 99le
191,6,225.0,100.0,3233,15.4,76,usa,plymouth valiant


In [21]:
X_test_tr = pipeline.transform(X_test)
X_test_tr = pd.DataFrame(X_test_tr, columns = output_cols)
X_test_tr.head(2)

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders,model_year,europe,japan,usa
0,-0.230882,0.710962,-0.052019,-1.489949,0.626132,1.081882,0.0,1.0,0.0
1,0.375612,-0.114081,-0.010762,0.168963,0.626132,-0.798972,0.0,0.0,1.0


In [22]:
#mL


In [23]:
def generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test):
    width = 116 
    print("_" * width)
    title = "|{:^30}|{:^20}|{:^20}|{:^20}|{:^20}|"
    print(title.format("Model Name", "Train Error", "Test Error", "Train Accuracy", "Test accuracy"))
    print("_"*width)
    row = "|{:^30}|{:^20.2f}|{:^20.2f}|{:^20.2f}|{:^20.2f}|"
    for model in models:
        model.fit(X_train_tr, y_train)
        y_hat_train = model.predict(X_train_tr)
        y_hat_test = model.predict(X_test_tr)
        train_error = rmse(y_train, y_hat_train)
        test_error = rmse(y_test, y_hat_test)
        train_accuracy = r2_score(y_train, y_hat_train)
        test_accuracy = r2_score(y_test, y_hat_test)
        model_name = str(model)
        print(row.format(model_name, train_error, test_error, train_accuracy*100, test_accuracy*100))
        print("-"*width)

In [24]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
models = [LinearRegression(), SGDRegressor(), SVR(), DecisionTreeRegressor(), RandomForestRegressor()]

In [25]:
generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test)

____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test accuracy    |
____________________________________________________________________________________________________________________
|      LinearRegression()      |        3.26        |        2.85        |       82.90        |       76.75        |
--------------------------------------------------------------------------------------------------------------------
|        SGDRegressor()        |        3.28        |        2.79        |       82.68        |       77.80        |
--------------------------------------------------------------------------------------------------------------------
|            SVR()             |        2.98        |        3.03        |       85.69        |       73.85        |
----------------------------------------------------------------

    overfiiting / underfitting
        hyper parameter tuning 


In [26]:
model = [RandomForestRegressor(n_estimators=200), RandomForestRegressor(max_depth=3) , RandomForestRegressor(max_depth = 5),
        RandomForestRegressor(max_depth= 7), RandomForestRegressor(max_depth= 4, min_samples_split=20)]

In [27]:
generate_models_report(model, X_train_tr, y_train, X_test_tr, y_test)

____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test accuracy    |
____________________________________________________________________________________________________________________
|RandomForestRegressor(n_estimators=200)|        1.00        |        3.49        |       98.37        |       65.29        |
--------------------------------------------------------------------------------------------------------------------
|RandomForestRegressor(max_depth=3)|        2.74        |        3.38        |       87.86        |       67.44        |
--------------------------------------------------------------------------------------------------------------------
|RandomForestRegressor(max_depth=5)|        1.90        |        3.46        |       94.18        |       65.75        |
-----------------------------------------------

In [28]:
def generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test):
    width = 116 
    print("_" * width)
    title = "|{:^30}|{:^20}|{:^20}|{:^20}|{:^20}|"
    print(title.format("Model Name", "Train Error", "Test Error", "Train Accuracy", "Test accuracy"))
    print("_"*width)
    row = "|{:^30}|{:^20.2f}|{:^20.2f}|{:^20.2f}|{:^20.2f}|"
    for model in models:
        model.fit(X_train_tr, y_train)
        y_hat_train = model.predict(X_train_tr)
        y_hat_test = model.predict(X_test_tr)
        train_error = rmse(y_train, y_hat_train)
        test_error = rmse(y_test, y_hat_test)
        train_accuracy = r2_score(y_train, y_hat_train)
        test_accuracy = r2_score(y_test, y_hat_test)
        model_name = str(model)
        print(model_name, end="\n\n")
        print(row.format("", train_error, test_error, train_accuracy*100, test_accuracy*100))
        print("-"*width)

In [29]:
model = [RandomForestRegressor(n_estimators=200), RandomForestRegressor(max_depth=3) , RandomForestRegressor(max_depth = 5),
        RandomForestRegressor(max_depth= 7), RandomForestRegressor(max_depth= 4, min_samples_split=20)]
generate_models_report(model, X_train_tr, y_train, X_test_tr, y_test)

____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test accuracy    |
____________________________________________________________________________________________________________________
RandomForestRegressor(n_estimators=200)

|                              |        0.99        |        3.41        |       98.42        |       66.83        |
--------------------------------------------------------------------------------------------------------------------
RandomForestRegressor(max_depth=3)

|                              |        2.74        |        3.20        |       87.85        |       70.73        |
--------------------------------------------------------------------------------------------------------------------
RandomForestRegressor(max_depth=5)

|                              |        1.91        |        3.39   

In [30]:
len(X_train.columns)

8

In [31]:
result = {
    "models":[],
    "train_error":[],
    "test_error":[],
    "train_acc":[],
    "test_acc":[],
    "parameters":[]
}

In [32]:
n_estimators = [10, 50, 100, 200, 300, 500] # 600 models 
max_depth = [1, 3, 6, 10] # 100
min_sample_split = [5, 10, 15, 20, 50] # 5 -> 20
max_features = [2, 3, 5, 7] # 4

c = 0
for n_est in n_estimators: 
    for dth in max_depth:
        for min_sm in min_sample_split:
            for max_fe in max_features: 
                model  = RandomForestRegressor(
                n_estimators = n_est,
                max_depth = dth,
                min_samples_split = min_sm,
                max_features = max_fe
                )
                model.fit(X_train_tr, y_train)
                y_hat_train = model.predict(X_train_tr)
                y_hat_test = model.predict(X_test_tr)
                train_error = rmse(y_train, y_hat_train)
                test_error = rmse(y_test, y_hat_test)
                train_accuracy = r2_score(y_train, y_hat_train)
                test_accuracy = r2_score(y_test, y_hat_test)
                
                result["models"].append(model)
                result["train_error"].append(train_error)
                result["test_error"].append(test_accuracy)
                result["train_acc"].append(train_accuracy)
                result["test_acc"].append(test_accuracy)
                result["parameters"].append({
                    "n_estimators":n_est,
                    "max_depth":dth,
                    "min_sample_split":min_sm,
                    "max_features":max_fe
                })
                c = c + 1
print("no of model trained: ", c)                

no of model trained:  480


In [33]:
result

{'models': [RandomForestRegressor(max_depth=1, max_features=2, min_samples_split=5,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=3, min_samples_split=5,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=5, min_samples_split=5,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=7, min_samples_split=5,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=2, min_samples_split=10,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=3, min_samples_split=10,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=5, min_samples_split=10,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=7, min_samples_split=10,
                        n_estimators=10),
  RandomForestRegressor(max_depth=1, max_features=2, min_s

In [34]:
result = pd.DataFrame(result)

In [35]:
result

Unnamed: 0,models,train_error,test_error,train_acc,test_acc,parameters
0,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.899035,0.608895,0.612840,0.608895,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
1,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.730164,0.389624,0.639071,0.389624,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
2,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.999149,0.368122,0.596855,0.368122,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
3,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.951262,0.415248,0.604541,0.415248,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
4,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.546160,0.503064,0.666605,0.503064,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
...,...,...,...,...,...,...
475,"(DecisionTreeRegressor(max_depth=10, max_featu...",2.116091,0.690005,0.927767,0.690005,"{'n_estimators': 500, 'max_depth': 10, 'min_sa..."
476,"(DecisionTreeRegressor(max_depth=10, max_featu...",3.049491,0.718485,0.849989,0.718485,"{'n_estimators': 500, 'max_depth': 10, 'min_sa..."
477,"(DecisionTreeRegressor(max_depth=10, max_featu...",2.946743,0.709888,0.859927,0.709888,"{'n_estimators': 500, 'max_depth': 10, 'min_sa..."
478,"(DecisionTreeRegressor(max_depth=10, max_featu...",2.829302,0.676678,0.870870,0.676678,"{'n_estimators': 500, 'max_depth': 10, 'min_sa..."


 ## Hyperparameter Tuning using sklearn
         GridSearch
         
          RandomizedSearch
    
    

In [37]:
result.sort_values(["test_acc", "train_acc"], ascending = False).head(5)

Unnamed: 0,models,train_error,test_error,train_acc,test_acc,parameters
42,"(DecisionTreeRegressor(max_depth=6, max_featur...",1.821857,0.81771,0.946458,0.81771,"{'n_estimators': 10, 'max_depth': 6, 'min_samp..."
23,"(DecisionTreeRegressor(max_depth=3, max_featur...",2.770189,0.787711,0.876209,0.787711,"{'n_estimators': 10, 'max_depth': 3, 'min_samp..."
44,"(DecisionTreeRegressor(max_depth=6, max_featur...",2.294213,0.785428,0.915094,0.785428,"{'n_estimators': 10, 'max_depth': 6, 'min_samp..."
120,"(DecisionTreeRegressor(max_depth=6, max_featur...",2.02437,0.783792,0.933893,0.783792,"{'n_estimators': 50, 'max_depth': 6, 'min_samp..."
125,"(DecisionTreeRegressor(max_depth=6, max_featur...",2.012043,0.778798,0.934695,0.778798,"{'n_estimators': 50, 'max_depth': 6, 'min_samp..."


In [40]:
from sklearn.model_selection import GridSearchCV

 ## estimators to be passed in GridSearchCV()
 
    estimator  --> Ml model
    
    parameter_grid --> hyper parameter you want to try 
    
    score --> (greatest is best) r2-score ,neag_mean_square_error

In [46]:
params = [
    {
        "n_estimators": [10, 50, 100, 200, 300, 500], # 600 models 
        "max_depth": [1, 3, 6, 10], # 100
        "min_samples_split": [ 5, 10, 15, 20, 50], # 5 -> 20
        "max_features": [ 2, 3, 5, 7] # 4 
    }
]

In [51]:
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=params, scoring="r2")
# grid_search = GridSearchCV(RandomForestRegressor(), param_grid=params, scoring="neg_mean_squared_error")

In [52]:
grid_search

In [54]:
grid_search.fit(X_train_tr, y_train)

ValueError: Invalid parameter 'min_sample_split' for estimator RandomForestRegressor(max_depth=1, max_features=2). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].