### Mileage Prediction Project 

    sklearn model classes
    
                        Estimators -> ML Models    fit, predict, score
                        
                        Transformers ->            fit, transform, fit_transform
                        
                        
                                   input -> T -> output 

In [1]:
# EDA Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sampling Library
from sklearn.model_selection import train_test_split

# Data Transformation Libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Custom Transformation 
from sklearn.base import BaseEstimator, TransformerMixin

# Data Pipelines 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 

# Machine Learning Models and Evaluation Metrices
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Custom Evaluation Metric
def rmse(y, y_hat):
    return np.sqrt(mean_squared_error(y, y_hat))

# Custom Transformer 
class CompanyNameExtracter(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # X -> DataFrame -> X["name"] which needs to be transformed
        X.loc[:, "name"] = X["name"].apply(self.process_string) 
        return X
    @staticmethod
    def process_string(value):
        map_d = {'maxda': 'mazda', 
        "toyouta":  "toyota",
        "vokswagen":  "vw",
        "volkswagen": "vw",}
        result = value.lower().strip().split(" ")
        name = result[0]
        if name in map_d.keys():
            name = map_d[name]
        return name

In [2]:
# load data
def get_data(dataset_name):
    return sns.load_dataset(dataset_name)

mpg = get_data("mpg")
mpg.head()
# features -> cylinders, displacement, horsepower, weight, acceleration, model_year, origin, name
# target  -> mpg

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [3]:
X = mpg.drop("mpg", axis=1) # 2-D
y = mpg["mpg"] # 1-D

In [4]:
X.head() # features

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [5]:
y.head() # target / labels

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

### Linear Regression Model 


    Hypothesis / Prediction

    mpg = bias+w1*displacement+w2*horsepower+w3*weight+w4*acceleration+w5*model_year+w6*origin+w7*name

    we need to build a custom transformer which will convert name column
    into company
    
        peugeot 504  -> CompanyNameExtracter -> peugeot
        fiat 124b    ->                      -> fiat
        
#### Custom Transformation

        1. Create a class by inherting BaseEstimator and TransformerMixin
        2. Create a Method fit which will calculate all statistics needed to transform data using training data
        3. Create a Method transform which will do the transformation using statistics learning in previous step
        4. Create a Method called fit_transform which will just call fit method first and than transform method
        
        
        name -> company name extract --> ordinal encoder
        
        raw_data -> pipeline -> data model --> machine learning model

In [6]:
# Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.05,
                                                   random_state=145)

In [7]:
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac phoenix
68,8,350.0,155.0,4502,13.5,72,usa,buick lesabre custom
32,4,98.0,,2046,19.0,71,usa,ford pinto
195,4,85.0,52.0,2035,22.2,76,usa,chevrolet chevette
216,4,98.0,68.0,2045,18.5,77,japan,honda accord cvcc


In [8]:
y_train.head()

372    27.0
68     13.0
32     25.0
195    29.0
216    31.5
Name: mpg, dtype: float64

In [9]:
num_features = ["displacement", "horsepower", "weight", "acceleration"]
ord_features = [ "cylinders", "model_year"]
nominal_features = ["origin"]
pass_through_cols = []
drop_cols = ["name",]

In [10]:
numerical_pipeline = Pipeline([ ("imputer", SimpleImputer()), ("std scaler", StandardScaler())])
ordinal_pipeline = Pipeline([ #("extract company name", CompanyNameExtracter()),
                         ("ordinal encoder", OrdinalEncoder()),
                         ("std scaling", StandardScaler())
                        ])
nominal_pipeline = Pipeline([ ("one hot encoding", OneHotEncoder() ) ])

In [11]:
pipeline = ColumnTransformer([
    ("numerical pipeline", numerical_pipeline, num_features), # ["displacement", "horsepower", "weight", "acceleration"]
    ("ordinal pipeline", ordinal_pipeline, ord_features), #  [ "cylinders", "model_year"]
    ("nominal pipeline", nominal_pipeline, nominal_features), # ["europe", "japan", "usa"]
    ("passing columns", "passthrough", pass_through_cols), # ["cylinders"]
    ("drop columns", "drop", drop_cols)# "name",
    
])

output_cols = ["displacement", "horsepower", "weight", "acceleration",
                "cylinders", "model_year",  "europe", "japan", "usa", 
               ]

In [12]:
X_train.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac phoenix
68,8,350.0,155.0,4502,13.5,72,usa,buick lesabre custom


In [13]:
X_train_tr = pipeline.fit_transform(X_train)
X_train_tr = pd.DataFrame(X_train_tr, columns=output_cols)
X_train_tr.head(2)

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders,model_year,europe,japan,usa
0,-0.391983,-0.371907,-0.258306,0.890229,-0.871829,1.619269,0.0,0.0,1.0
1,1.493836,1.303962,1.824603,-0.732619,1.375112,-1.067665,0.0,0.0,1.0


In [14]:
X_test.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
333,6,168.0,132.0,2910,11.4,80,japan,datsun 280-zx
99,6,232.0,100.0,2945,16.0,73,usa,amc hornet


In [15]:
X_test_tr = pipeline.transform(X_test)
X_test_tr = pd.DataFrame(X_test_tr, columns=output_cols)
X_test_tr.head(2)

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders,model_year,europe,japan,usa
0,-0.230882,0.710962,-0.052019,-1.489949,0.626132,1.081882,0.0,1.0,0.0
1,0.375612,-0.114081,-0.010762,0.168963,0.626132,-0.798972,0.0,0.0,1.0


In [16]:
# ML
model = LinearRegression()
model.fit(X_train_tr, y_train)
print(model.intercept_, model.coef_)

24.36795094312869 [ 3.51958174 -0.92657813 -5.77982665  0.31020559 -1.62418694  2.90843501
  0.8242213   0.8913853  -1.71560659]


In [17]:
# ML

def generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test):
    width = 116
    print("_"*width)
    title = "|{:^30}|{:^20}|{:^20}|{:^20}|{:^20}|"
    print(title.format("Model Name", "Train Error", "Test Error", "Train Accuracy", "Test Accuracy"))
    print("_"*width)
    row = "|{:^30}|{:^20.2f}|{:^20.2f}|{:^20.2f}|{:^20.2f}|"
    for model in models:
        model.fit(X_train_tr, y_train)
        y_hat_train = model.predict(X_train_tr)
        y_hat_test = model.predict(X_test_tr)
        train_error = rmse(y_train, y_hat_train)
        test_error = rmse(y_test, y_hat_test)
        train_accuracy = r2_score(y_train, y_hat_train)
        test_accuracy = r2_score(y_test, y_hat_test)
        model_name = str(model)
        print(row.format(model_name, train_error, test_error, train_accuracy*100, test_accuracy*100))
        print("-"*width)

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

models = [LinearRegression(), SGDRegressor(), SVR(), DecisionTreeRegressor(), RandomForestRegressor()]

In [19]:
generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test)
# latest

____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test Accuracy    |
____________________________________________________________________________________________________________________
|      LinearRegression()      |        3.26        |        2.85        |       82.90        |       76.75        |
--------------------------------------------------------------------------------------------------------------------
|        SGDRegressor()        |        3.29        |        2.78        |       82.59        |       77.88        |
--------------------------------------------------------------------------------------------------------------------
|            SVR()             |        2.98        |        3.03        |       85.69        |       73.85        |
----------------------------------------------------------------

In [20]:
generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test)
# previous # using all features

____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test Accuracy    |
____________________________________________________________________________________________________________________
|      LinearRegression()      |        3.26        |        2.85        |       82.90        |       76.75        |
--------------------------------------------------------------------------------------------------------------------
|        SGDRegressor()        |        3.27        |        2.79        |       82.76        |       77.74        |
--------------------------------------------------------------------------------------------------------------------
|            SVR()             |        2.98        |        3.03        |       85.69        |       73.85        |
----------------------------------------------------------------

In [21]:
generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test)
# previous previous # model_year drop 

____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test Accuracy    |
____________________________________________________________________________________________________________________
|      LinearRegression()      |        3.26        |        2.85        |       82.90        |       76.75        |
--------------------------------------------------------------------------------------------------------------------
|        SGDRegressor()        |        3.28        |        2.78        |       82.64        |       77.91        |
--------------------------------------------------------------------------------------------------------------------
|            SVR()             |        2.98        |        3.03        |       85.69        |       73.85        |
----------------------------------------------------------------

#### Overfitting / Underfitting

#### Hyper parameter tunning 

    California Housing Price Predictions 
    
            follow book 
            
                Chapter-2

In [22]:
models = [RandomForestRegressor(n_estimators=200), RandomForestRegressor(max_depth=3), RandomForestRegressor(max_depth=5),
         RandomForestRegressor(max_depth=4, min_samples_split=20), RandomForestRegressor(max_depth=6)]

In [23]:
# ML

def generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test):
    width = 116
    print("_"*width)
    title = "|{:^30}|{:^20}|{:^20}|{:^20}|{:^20}|"
    print(title.format("Model Name", "Train Error", "Test Error", "Train Accuracy", "Test Accuracy"))
    print("_"*width)
    row = "|{:^30}|{:^20.2f}|{:^20.2f}|{:^20.2f}|{:^20.2f}|"
    for model in models:
        model.fit(X_train_tr, y_train)
        y_hat_train = model.predict(X_train_tr)
        y_hat_test = model.predict(X_test_tr)
        train_error = rmse(y_train, y_hat_train)
        test_error = rmse(y_test, y_hat_test)
        train_accuracy = r2_score(y_train, y_hat_train)
        test_accuracy = r2_score(y_test, y_hat_test)
        model_name = str(model)
        print(model_name, end="\n\n")
        print(row.format("", train_error, test_error, train_accuracy*100, test_accuracy*100))
        print("-"*width)

In [24]:
generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test)


____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test Accuracy    |
____________________________________________________________________________________________________________________
RandomForestRegressor(n_estimators=200)

|                              |        1.01        |        3.33        |       98.35        |       68.32        |
--------------------------------------------------------------------------------------------------------------------
RandomForestRegressor(max_depth=3)

|                              |        2.73        |        3.18        |       87.97        |       71.17        |
--------------------------------------------------------------------------------------------------------------------
RandomForestRegressor(max_depth=5)

|                              |        1.89        |        3.33   

In [25]:
len(X_train.columns)

8

In [26]:
result = {
    "models": [],
    "train_error": [],
    "test_error": [],
    "train_acc": [],
    "test_acc": [],
    "parameters": []
}

In [27]:
n_estimators = [10, 50, 100, 200, 300, 500] # 600 models 
max_depth = [1, 3, 6, 10] # 100
min_samples_split = [ 5, 10, 15, 20, 50] # 5 -> 20
max_features = [ 2, 3, 5, 7] # 4 
c = 0
for n_est in n_estimators:
    for dth in max_depth:
        for min_sm in min_samples_split:
            for mx_fe in max_features:
                model = RandomForestRegressor(
                    n_estimators=n_est, 
                    max_depth=dth,
                    min_samples_split= min_sm,
                    max_features=mx_fe
                )
                model.fit(X_train_tr, y_train)
                y_hat_train = model.predict(X_train_tr)
                y_hat_test = model.predict(X_test_tr)
                train_error = rmse(y_train, y_hat_train)
                test_error = rmse(y_test, y_hat_test)
                train_accuracy = r2_score(y_train, y_hat_train)
                test_accuracy = r2_score(y_test, y_hat_test)
                
                result["models"].append(model)
                result["train_error"].append(train_error)
                result["test_error"].append(test_error)
                result["train_acc"].append(train_accuracy)
                result["test_acc"].append(test_accuracy)
                result["parameters"].append({
                    "n_estimators": n_est,
                    "max_depth": dth,
                    "min_samples_split": min_sm,
                    "max_features": mx_fe
                })
                c += 1
                
print("no of models trained: ", c)

no of models trained:  480


In [28]:
result = pd.DataFrame(result)

In [29]:
result.head()

Unnamed: 0,models,train_error,test_error,train_acc,test_acc,parameters
0,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.50473,3.582229,0.672654,0.633699,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
1,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.65324,3.737978,0.650715,0.601154,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
2,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.896747,4.663521,0.613201,0.379189,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
3,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.978569,4.481376,0.600167,0.426736,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."
4,"(DecisionTreeRegressor(max_depth=1, max_featur...",4.960783,3.859956,0.603019,0.574699,"{'n_estimators': 10, 'max_depth': 1, 'min_samp..."


In [30]:
result.shape[0]

480

In [31]:
result.sort_values(["test_acc", "train_acc"], ascending=False).head(5)

Unnamed: 0,models,train_error,test_error,train_acc,test_acc,parameters
49,"(DecisionTreeRegressor(max_depth=6, max_featur...",2.304126,2.500528,0.914359,0.821518,"{'n_estimators': 10, 'max_depth': 6, 'min_samp..."
24,"(DecisionTreeRegressor(max_depth=3, max_featur...",3.391225,2.740129,0.814483,0.785675,"{'n_estimators': 10, 'max_depth': 3, 'min_samp..."
76,"(DecisionTreeRegressor(max_depth=10, max_featu...",3.047916,2.776412,0.850143,0.779961,"{'n_estimators': 10, 'max_depth': 10, 'min_sam..."
77,"(DecisionTreeRegressor(max_depth=10, max_featu...",3.220083,2.782071,0.832735,0.779063,"{'n_estimators': 10, 'max_depth': 10, 'min_sam..."
232,"(DecisionTreeRegressor(max_depth=10, max_featu...",2.397403,2.789968,0.907285,0.777807,"{'n_estimators': 100, 'max_depth': 10, 'min_sa..."


In [32]:
row = result.loc[66, ]

In [33]:
model = row["models"]
print(model)

RandomForestRegressor(max_depth=10, max_features=5, min_samples_split=10,
                      n_estimators=10)


In [34]:
para = row["parameters"]
print(para) # best value for hyper parameter

{'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 10, 'max_features': 5}


In [35]:
X_train.shape

(378, 8)

In [36]:
X_test.shape

(20, 8)

    Hyperparameter Tunning using sklearn
    
            GridSearch
            
            RandomizedSearch
            
            
     Classification 

    estimators --> ML Model 
    
    parameter_grid --> hyperparameter values you want to try
    
    score -> (greastest is best) r2_score, neg_mean_squared_error

## Hyper-parameter Tunning

In [37]:
from sklearn.model_selection import GridSearchCV

params = [
    {
        "n_estimators": [10, 50, 100, 200, 300, 500], # 600 models 
        "max_depth": [1, 3, 6, 10], # 100
        "min_samples_split": [ 5, 10, 15, 20, 50], # 5 -> 20
        "max_features": [ 2, 3, 5, 7] # 4 
    }
]

In [38]:
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=params, scoring="r2")
# grid_search = GridSearchCV(RandomForestRegressor(), param_grid=params, scoring="neg_mean_squared_error")

In [39]:
grid_search.fit(X_train_tr, y_train)

In [40]:
best_model = grid_search.best_estimator_
print(best_model)

RandomForestRegressor(max_depth=10, max_features=7, min_samples_split=5,
                      n_estimators=500)


In [41]:
best_params = grid_search.best_params_
print(best_params)

{'max_depth': 10, 'max_features': 7, 'min_samples_split': 5, 'n_estimators': 500}


In [42]:
acc = grid_search.best_score_ 
# neg_sq_error = grid_search.best_score_
# error = np.sqrt(-neg_sq_error)
print(acc*100)
# print(error)

87.56323677080516
