# Model Training
--- 

## Zillow CA Rent Model
---

### Training Data
---

In [None]:
# Directory that holds the Zillow training CSV files. The trailing slash is
# required because file names are appended to it directly.
ZILLOW_TRAINING_DATA_ROOT = '../data/Zillow/training/'

# Path to the first CA rental training set.
zillow_ca_rental_training_data_1 = ZILLOW_TRAINING_DATA_ROOT + 'ca_rental_training_set1.csv'

## Model Training
---

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder


def train_linear_regression_model(data, target_column):
    """
    Trains a linear regression model on a DataFrame with mixed feature types.

    Non-numeric feature columns are one-hot encoded; numeric and boolean
    columns are passed to the regressor unchanged. The encoder and the
    regressor are bundled into one scikit-learn Pipeline, so the returned
    model accepts raw (unencoded) feature frames in ``predict``.

    Parameters:
    data (DataFrame): Training data containing both the features and the target.
    target_column (str): Name of the column in ``data`` to predict.

    Returns:
    Pipeline: A fitted pipeline (one-hot encoding followed by LinearRegression).
    """
    # Imported locally so the function carries its own extra dependencies.
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline

    # Split the data into features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Only non-numeric columns are categorical. One-hot encoding numeric
    # columns would turn every distinct value into its own indicator column.
    # NOTE(review): numeric columns are assumed to contain no NaN - confirm.
    categorical_columns = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    # handle_unknown='ignore' keeps predict() from failing on categories that
    # were not present in the training split.
    preprocessor = ColumnTransformer(
        transformers=[
            ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ],
        remainder='passthrough',
    )

    # Bundle preprocessing with the regressor so the fitted encoder travels
    # with the model instead of being discarded.
    model = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', LinearRegression()),
    ])

    # Hold out 20% of the rows; the encoder is fitted on the training split
    # only, so nothing from the held-out rows leaks into the model.
    X_train, _X_test, y_train, _y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    return model

In [None]:
from sklearn.model_selection import GridSearchCV

def find_optimal_parameters(X, y, model):
    """
    Finds the optimal parameters for a given linear regression model using GridSearchCV.

    Only candidate parameters that the supplied model actually exposes (per
    ``model.get_params()``) are searched. This keeps the search working on
    scikit-learn versions where ``LinearRegression`` no longer accepts
    ``normalize``, while still searching it on versions that do.

    Parameters:
    X (DataFrame): Features.
    y (Series): Target.
    model (LinearRegression): The linear regression model instance.

    Returns:
    dict: A dictionary containing the optimal parameters.
    """
    # Candidate parameters to search
    candidate_grid = {
        'fit_intercept': [True, False],
        'normalize': [True, False],
        'copy_X': [True, False]
    }

    # Drop candidates the estimator does not support; GridSearchCV raises an
    # invalid-parameter error for any key the estimator does not recognise.
    supported_params = model.get_params()
    param_grid = {
        name: values
        for name, values in candidate_grid.items()
        if name in supported_params
    }

    # Initialize the GridSearchCV with the model and parameters
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

    # Fit the GridSearchCV to the data
    grid_search.fit(X, y)

    # Return the optimal parameters
    return grid_search.best_params_

In [None]:
def print_model_metrics(model, x_test, y_test):
    """
    Prints regression metrics for a fitted model on the given evaluation data.

    The target is scored as-is: it is a continuous value, so it must not be
    one-hot encoded before being compared with the predictions.

    Parameters:
    model: A fitted estimator exposing ``predict``.
    x_test (DataFrame): Features, in the form the model's ``predict`` expects.
    y_test (Series): True target values aligned row-by-row with ``x_test``.

    Returns:
    None. The metrics are printed to standard output.
    """
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score

    # Make predictions on the test set
    y_pred = model.predict(x_test)

    # Calculate the metrics against the true target values
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)

    # Print the metrics
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2}")
    print(f"Explained Variance Score: {evs}")

# Example usage
# X = training_set1.drop(columns=['rent'])
# y = training_set1['rent']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import VotingRegressor

def create_ensembled_model(models, X_train, y_train):
    """
    Builds a VotingRegressor from the supplied estimators and fits it.

    Parameters:
    models (list): ``(name, estimator)`` tuples to combine.
    X_train (DataFrame): Training features.
    y_train (Series): Training target.

    Returns:
    VotingRegressor: The fitted ensemble.
    """
    voting_regressor = VotingRegressor(estimators=models)
    # fit() returns the estimator itself, so the fitted ensemble is returned directly.
    return voting_regressor.fit(X_train, y_train)

# Example usage:
# models = [('model1', model1), ('model2', model2)]
# ensembled_model = create_ensembled_model(models, X_train, y_train)

#print_model_metrics(ensembled_model, X_test, y_test)

In [None]:
import pandas as pd

# Read the CA rental training data CSV into a pandas DataFrame.
zillow_ca_rental_training_df_1 = pd.read_csv(
    filepath_or_buffer=zillow_ca_rental_training_data_1,
)

#### Model 1
----

In [None]:
# Fit the first rent model; 'rent' is the target column to predict.
rent_model_1 = train_linear_regression_model(
    data=zillow_ca_rental_training_df_1,
    target_column='rent',
)

In [None]:

X = zillow_ca_rental_training_df_1.drop(columns=['rent'])
y = zillow_ca_rental_training_df_1['rent']

# Recreate the 80/20 split that train_linear_regression_model performs
# (same test_size and random_state, same number of rows), so the metrics are
# computed only on rows the model was not trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print_model_metrics(model=rent_model_1, x_test=X_test, y_test=y_test)