<a href="https://colab.research.google.com/github/mohammadham/End-to-End-Machine-Learning-Project/blob/main/End_to_End_Machine_Learning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Exercise 1: In this code, we use the `fetch_california_housing` function from scikit-learn to load the California housing dataset. We then split the data into training and test sets using the `train_test_split` function, which randomly splits the data into two sets based on the `test_size` parameter (in this case, 20% of the data is used for testing).

In [5]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Load the California housing dataset
housing = fetch_california_housing()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

# Define the parameter grid to search.
# The "svr__" prefix routes each value to the SVR step of the pipeline below.
param_grid = [
    {'svr__kernel': ['linear'], 'svr__C': [0.1, 1, 10, 100, 1000]},
    {'svr__kernel': ['rbf'], 'svr__C': [0.1, 1, 10, 100, 1000], 'svr__gamma': [0.1, 0.01, 0.001, 0.0001]},
]

# Create an SVR model preceded by standardization.
# SVMs are sensitive to feature scale and the housing features span very
# different ranges; keeping the scaler inside the pipeline also means it is
# re-fitted on the training part of every cross-validation fold.
svr = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Use GridSearchCV to search for the best hyperparameters.
# n_jobs=-1 spreads the independent fits over all available cores, since the
# exhaustive search over the full training set is slow when run serially.
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error',
                           return_train_score=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding RMSE score
# (the scorer is negated MSE, hence the sign flip before the square root)
print("Best hyperparameters: ", grid_search.best_params_)
print("Best RMSE score: ", np.sqrt(-grid_search.best_score_))

# Evaluate the best model on the test set
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Final RMSE score: ", final_rmse)

KeyboardInterrupt: ignored

Exercise 2: We define a dictionary of parameter distributions to search over: the kernel is either 'linear' or 'rbf', and the C and gamma hyperparameters are each drawn from a continuous distribution.

We create an SVR model and use `RandomizedSearchCV` to search over the parameter distributions for the best hyperparameters, using 5-fold cross-validation and the negative mean squared error as the scoring metric. After fitting the `RandomizedSearchCV` object to the training data, we print out the best hyperparameters and corresponding RMSE score.

Finally, we evaluate the best model on the test set and print out the final RMSE score.


In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import expon, reciprocal
import numpy as np

# Load the California Housing dataset
housing = fetch_california_housing()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

# Define the parameter distributions to search.
# The "svr__" prefix routes each value to the SVR step of the pipeline below;
# gamma is sampled for every candidate but only matters for the 'rbf' kernel.
param_distribs = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': reciprocal(20, 200000),
    'svr__gamma': expon(scale=1.0),
}

# Create an SVR model preceded by standardization.
# SVMs are sensitive to feature scale, so the scaler is tuned together with
# the regressor and re-fitted on the training part of every CV fold.
# Pipeline and StandardScaler come from the notebook's first import cell.
svr = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Use RandomizedSearchCV to search for the best hyperparameters
# (n_jobs=-1 runs the independent fits on all available cores)
rnd_search = RandomizedSearchCV(svr, param_distributions=param_distribs, n_iter=50, cv=5,
                                scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
rnd_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding RMSE score
# (the scorer is negated MSE, hence the sign flip before the square root)
print("Best hyperparameters: ", rnd_search.best_params_)
print("Best RMSE score: ", np.sqrt(-rnd_search.best_score_))

# Evaluate the best model on the test set
final_model = rnd_search.best_estimator_
y_pred = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Final RMSE score: ", final_rmse)

Exercise 3: We define the numerical and categorical attributes, and then define the preparation pipeline using `ColumnTransformer` to apply the `num_pipeline` to the numerical attributes and pass through the categorical attributes.

We fit a `RandomForestRegressor` to the training data to compute feature importances, and then define a transformer `TopFeatureSelector` to select only the top k features based on the feature importances.

We define a new pipeline `preparation_and_feature_selection_pipeline` that includes the feature selection transformer, and fit and transform the training data using this pipeline. We then print out the top k feature indices and names, and double check that the selected features match the top k features.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California Housing dataset
housing = fetch_california_housing()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

# Define the numerical and categorical attributes (names are kept for reporting)
num_attribs = list(housing.feature_names)
cat_attribs = []

# X_train is a plain NumPy array, so ColumnTransformer has to address its
# columns by integer position; string names are only accepted for DataFrames.
num_columns = list(range(len(num_attribs)))

# Define the preparation pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_columns),
    ('cat', 'passthrough', cat_attribs),  # empty selection: nothing is passed through
])

# Fit a RandomForestRegressor to the training data to compute feature importances.
# The importances are indexed like the columns of X_train, which is also the
# column order produced by full_pipeline.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, y_train)
feature_importances = forest_reg.feature_importances_

# Define a transformer to select only the top k features
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns with the k largest importances.

    Parameters:
        feature_importances: one importance score per input column.
        k: number of columns to keep.

    After ``fit`` the chosen column positions are available as
    ``feature_indices_``.
    """

    def __init__(self, feature_importances, k):
        # Both arguments are stored exactly as given.
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Compute the column positions to keep; X and y are not read."""
        scores, n_keep = self.feature_importances, self.k
        self.feature_indices_ = indices_of_top_k(scores, n_keep)
        return self

    def transform(self, X):
        """Return X restricted to the columns chosen during ``fit``."""
        kept_columns = self.feature_indices_
        return X[:, kept_columns]

def indices_of_top_k(arr, k):
    """Return the positions of the k largest values of arr, in ascending order.

    Args:
        arr: sequence or array of scores.
        k: how many positions to return.

    Returns:
        A sorted NumPy integer array with k positions; empty when k is 0.

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # A slice of [-0:] would select every element, so "top 0" has to be
        # answered explicitly with an empty index array.
        return np.array([], dtype=np.intp)
    scores = np.array(arr)
    # argpartition places the k largest entries in the last k slots (unordered);
    # sorting the resulting positions gives a stable, ascending column order.
    return np.sort(np.argpartition(scores, -k)[-k:])

# Define the number of top features we want to keep
k = 5

# Define a new pipeline that includes the feature selection transformer
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

# Fit the pipeline to the training data and transform the data
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(X_train)

# Print the top k feature indices and names
top_k_feature_indices = indices_of_top_k(feature_importances, k)
print("Top k feature indices: ", top_k_feature_indices)
print("Top k feature names: ", np.array(num_attribs)[top_k_feature_indices])

# Double check that the selected features match the top k features.
# The pipeline output is scaled, so the reference rows must go through the
# fitted preparation step too; comparing against raw X_train could never match.
prepared_first_rows = preparation_and_feature_selection_pipeline.named_steps['preparation'].transform(X_train[:3])
print("Selected features for first 3 instances: ", housing_prepared_top_k_features[:3, :])
print("Top k features for first 3 instances: ", prepared_first_rows[:, top_k_feature_indices])

Exercise 4: We define the numerical and categorical attributes, and then define the preparation pipeline using `ColumnTransformer` to apply the `num_pipeline` to the numerical attributes and pass through the categorical attributes.

We fit a `RandomForestRegressor` to the training data to compute feature importances, and then define a transformer `TopFeatureSelector` to select only the top k features based on the feature importances.

We define the parameter distributions to search for the best hyperparameters for an SVR model, and use `RandomizedSearchCV` to search for the best hyperparameters.

We define the final pipeline `prepare_select_and_predict_pipeline` that includes data preparation, feature selection, and prediction using the best SVR model found by `RandomizedSearchCV`. We fit the pipeline to the full housing dataset and make predictions on a few sample instances.

Finally, we print out the predictions and labels for the test data.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California Housing dataset
housing = fetch_california_housing()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

# Define the numerical and categorical attributes (by name)
num_attribs = list(housing.feature_names)
cat_attribs = []

# The data is a NumPy array rather than a DataFrame, so the ColumnTransformer
# below selects columns by integer position instead of by name.
num_columns = list(range(len(num_attribs)))

# Define the preparation pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_columns),
    ('cat', 'passthrough', cat_attribs),  # empty selection: nothing is passed through
])

# Fit a RandomForestRegressor to the training data to compute feature importances
# (one score per column of X_train, in column order)
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, y_train)
feature_importances = forest_reg.feature_importances_

# Define a transformer to select only the top k features
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Select the k input columns that have the highest importance scores.

    ``feature_importances`` holds one score per column and ``k`` is the number
    of columns to retain. ``fit`` records the retained column positions in
    ``feature_indices_``; ``transform`` slices those columns out of its input.
    """

    def __init__(self, feature_importances, k):
        # Keep the constructor arguments untouched on the instance.
        self.feature_importances, self.k = feature_importances, k

    def fit(self, X, y=None):
        """Determine which column positions to retain (X and y are unused)."""
        top_positions = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_positions
        return self

    def transform(self, X):
        """Slice the retained columns out of the 2-D input X."""
        columns = self.feature_indices_
        return X[:, columns]

def indices_of_top_k(arr, k):
    """Return the positions of the k largest values of arr, in ascending order.

    Args:
        arr: sequence or array of scores.
        k: how many positions to return.

    Returns:
        A sorted NumPy integer array with k positions; empty when k is 0.

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # arr[-0:] is the whole array, so without this guard "top 0" would
        # return every index instead of none.
        return np.array([], dtype=np.intp)
    scores = np.array(arr)
    # The last k slots after argpartition hold the k largest entries in
    # arbitrary order; sort the positions so the result is ascending.
    top_unsorted = np.argpartition(scores, -k)[-k:]
    return np.sort(top_unsorted)

# Define the parameter distributions to search.
# The "svm_reg__" prefix routes each value to the SVR step of the pipeline below.
param_distribs = {
    'svm_reg__kernel': ['linear', 'rbf'],
    'svm_reg__C': reciprocal(20, 200000),
    'svm_reg__gamma': expon(scale=1.0),
}

# Define the final pipeline that includes data preparation, feature selection, and prediction
svm_reg = SVR()
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', svm_reg),
])

# Use RandomizedSearchCV to search for the best hyperparameters for an SVR model.
# The search runs over the whole pipeline so the SVR is tuned on the same
# scaled, top-k inputs it receives at prediction time, not on the raw features.
rnd_search = RandomizedSearchCV(prepare_select_and_predict_pipeline, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)

# Apply the best hyperparameters, fit the pipeline to the full housing dataset,
# and make predictions for the first few instances
prepare_select_and_predict_pipeline.set_params(**rnd_search.best_params_)
prepare_select_and_predict_pipeline.fit(housing.data, housing.target)
some_data = housing.data[:4]
some_labels = housing.target[:4]
print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

Exercise 5: We define the numerical and categorical attributes, and then define the preparation pipeline using `ColumnTransformer` to apply the `num_pipeline` to the numerical attributes and `OneHotEncoder` to the categorical attributes.

We fit a `RandomForestRegressor` to the training data to compute feature importances, and then define a transformer `TopFeatureSelector` to select only the top k features based on the feature importances.

We define the parameter grid to search for the best hyperparameters for the final pipeline, which includes data preparation, feature selection, and prediction using an SVR model.

We use `GridSearchCV` to search for the best hyperparameters, and print out the best hyperparameters found by `GridSearchCV`.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from scipy.stats import reciprocal, expon
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California Housing dataset
housing = fetch_california_housing()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

# Define the numerical and categorical attributes.
# The scikit-learn version of this dataset only contains the numeric columns
# listed in housing.feature_names; it has no 'ocean_proximity' column, so the
# categorical attribute list is empty.
num_attribs = list(housing.feature_names)
cat_attribs = []

# The data is a NumPy array rather than a DataFrame, so the ColumnTransformer
# below selects columns by integer position instead of by name.
num_columns = list(range(len(num_attribs)))

# Define the preparation pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_columns),
    # Kept for datasets that do have categorical columns; with an empty
    # selection this transformer contributes no output columns.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Fit a RandomForestRegressor to the training data to compute feature importances
# (one score per column of X_train, in column order)
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, y_train)
feature_importances = forest_reg.feature_importances_

# Define a transformer to select only the top k features
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Column selector driven by precomputed feature importances.

    Parameters:
        feature_importances: importance score for each input column.
        k: how many of the highest-scoring columns to keep.

    Attributes set by ``fit``:
        feature_indices_: positions of the columns that are kept.
    """

    def __init__(self, feature_importances, k):
        # Store the arguments as-is, with no copying or validation.
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        """Record the positions of the k highest-scoring columns.

        Neither X nor y is inspected; only the stored importances and k are used.
        """
        self.feature_indices_ = indices_of_top_k(
            self.feature_importances,
            self.k,
        )
        return self

    def transform(self, X):
        """Return the recorded columns of X."""
        keep = self.feature_indices_
        return X[:, keep]

def indices_of_top_k(arr, k):
    """Return the positions of the k largest values of arr, in ascending order.

    Args:
        arr: sequence or array of scores.
        k: how many positions to return.

    Returns:
        A sorted NumPy integer array with k positions; empty when k is 0.

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # Slicing with [-0:] keeps everything, so an explicit empty result is
        # needed for "top 0".
        return np.array([], dtype=np.intp)
    scores = np.array(arr)
    # argpartition moves the k largest entries into the last k slots without
    # fully sorting; the positions are then sorted into ascending order.
    return np.sort(np.argpartition(scores, -k)[-k:])

# Define the parameter grid to search: every imputation strategy combined with
# every possible number of retained features
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

# Define the final pipeline that includes data preparation, feature selection, and prediction using an SVR model.
# The k given here is only a starting value; the grid search overrides it for
# every candidate, so it is set to "keep all features" instead of depending on
# a variable defined in an earlier cell.
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, len(feature_importances))),
    ('svm_reg', SVR()),
])

# Use GridSearchCV to search for the best hyperparameters.
# The search only sees the training split, so the held-out test set stays
# untouched and matches the data the feature importances were computed on.
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(X_train, y_train)

# Print the best hyperparameters found by GridSearchCV
print("Best hyperparameters: ", grid_search_prep.best_params_)