# Step 1: Importing the dataset from source (database: Boston House Prices)

In [None]:
import pandas as pd
import numpy as np                                                               # selected for finals

# Load the raw Boston housing file. The separator is a raw-string regex:
# in a plain string literal "\s" is an invalid escape sequence, which newer
# Python versions warn about. skiprows=22 skips the lines that precede the
# numeric data (presumably the dataset description - confirm against the file).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record is spread over two consecutive rows of raw_df:
#   - even rows contribute all of their columns,
#   - odd rows contribute their first two columns as the remaining features,
#   - the third column of each odd row is the target value.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Names for the 13 feature columns, in the order they appear in `data`.
column_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX",
    "PTRATIO", "B", "LSTAT"
]

# Wrap the feature matrix in a DataFrame and append the target as MEDV.
boston_df = pd.DataFrame(data, columns=column_names)
boston_df['MEDV'] = target
print(boston_df.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


## Step 2: Data Pre-processing

In [None]:
# Data-quality overview, printed in this order:
#   1. number of missing values in each column
#   2. summary statistics (count, mean, std, min, quartiles, max)
#   3. the first five rows of the dataset
print(
    boston_df.isna().sum(),
    boston_df.describe(),
    boston_df.head(),
    sep="\n",
)

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64
             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO          

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the MEDV column; the features are every other column.
y = boston_df['MEDV']
X = boston_df.drop(columns='MEDV')

# Hold out 20% of the rows for testing (80:20 split); the fixed seed keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Step 3: Creating the Model (XGBoost)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Baseline regressor: squared-error objective, fixed seed, everything else
# left at the library defaults.
xgboost_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)
xgboost_model.fit(X_train, y_train)

# Score the baseline on the held-out test set using mean squared error.
y_pred = xgboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 6.909231565384943


# Step4: Hyper-parameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Search space: the candidate values RandomizedSearchCV samples from.
param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[3, 5, 8],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.6, 0.8, 1.0],
)

# Randomized search: 10 sampled parameter combinations, each scored with
# 5-fold cross-validation on negative MSE (higher is better).
random_search = RandomizedSearchCV(
    estimator=xgboost_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the winning parameter combination.
best_params = random_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Predict on the held-out test set with the estimator the search selected.
best_model = random_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Test-set error after tuning: MSE and its square root (RMSE).
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)
print(f"Tuned Root Mean Squared Error (RMSE): {rmse_tuned}")
print(f"Tuned Mean Squared Error (MSE): {mse_tuned}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.2s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.2s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   0.1s
[CV]

Test1 BEST
Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Tuned Root Mean Squared Error (RMSE): 2.190574488963846
Tuned Mean Squared Error (MSE): 4.798616591699215

TEST2:
Best Hyperparameters: {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Tuned Root Mean Squared Error (RMSE): 2.5263695955890624
Tuned Mean Squared Error (MSE): 6.382543333516843


Test3:
Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Tuned Root Mean Squared Error (RMSE): 2.4501927451197707
Tuned Mean Squared Error (MSE): 6.003444488237557
"""

# Step 5: A model created with specific hyperparameters chosen from the above analysis

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
                                                                                #Model with specific h-parameter
# Model with hand-picked hyperparameters.
# The step size is passed as `learning_rate`, the same name used in the
# tuning grid earlier in this notebook, instead of its alias `eta`; the
# value (0.1) is unchanged.
xgboost_model = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgboost_model.fit(X_train, y_train)
y_pred = xgboost_model.predict(X_test)

# Evaluate on the held-out test set: MSE and its square root (RMSE).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Squared Error (MSE): {mse}")


Root Mean Squared Error (RMSE): 2.5466378278954207
Mean Squared Error (MSE): 6.485364226467907


# Step 6: Creating Docker Container for the above program

# These steps are not included in the Colab process

In [None]:
# Base image: official Python 3.8.
FROM python:3.8

# Install the libraries the training script needs.
# NOTE: Dockerfile comments must sit on their own line. A trailing "# ..."
# after COPY or ENTRYPOINT is not a comment; it is passed to the instruction
# as extra arguments and breaks the build or the container start-up.
RUN pip install --no-cache-dir xgboost scikit-learn google-cloud-storage pandas

# Copy the training script into the container.
COPY main.py .

# Run the training script when the container starts.
ENTRYPOINT ["python", "./main.py"]





*   Download Docker Desktop and install it by following the official procedure
*   The link to the procedure is attached:
           https://docs.docker.com/get-started/
*   Use whichever editor you are comfortable with. I used Visual Studio
*   Create a Dockerfile and paste the Dockerfile instructions above into it








1.   Navigate to the directory containing the Dockerfile in the terminal
2.   Then run the following commands:
       docker build -t <image_name> .
       docker run <image_name_like_hpar_model>


*   Make sure you include the necessary libraries in the RUN command in the Dockerfile
*   I included xgboost, scikit-learn, google-cloud-storage, and pandas






# Next: setting up Google Cloud Platform for Vertex AI

Enable the Artifact Registry API. It requires billing to be enabled, so set that up first.

## I also experimented with a few of the options for running hyperparameter tuning in Vertex AI, but it did not go as planned. It showed the error "Training pipeline failed with error message: The following quota metrics exceed quota limits: aiplatform.googleapis.com/custom_model_training_cpus,aiplatform.googleapis.com/custom_model_training_nvidia_v100_gpus"

I experimented with virtual machines in order to work with Kubernetes, and I had a hard time learning this because of other interviews.

I learned a lot and hope to learn more great technology in ML. I am a fresher, and if possible, please give me a chance to learn more with your team.

I hope to hear good news back from you. Thank you.