![tower_bridge](tower_bridge.jpeg)

As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of `sklearn` and `MLflow`.

You will be working with data stored in `london_weather.csv`, which contains the following columns:
- **date** - recorded date of measurement - (**int**)
- **cloud_cover** - cloud cover measurement in oktas - (**float**)
- **sunshine** - sunshine measurement in hours (hrs) - (**float**)
- **global_radiation** - irradiance measurement in Watt per square meter (W/m2) - (**float**)
- **max_temp** - maximum temperature recorded in degrees Celsius (°C) - (**float**)
- **mean_temp** - mean temperature in degrees Celsius (°C) - (**float**)
- **min_temp** - minimum temperature recorded in degrees Celsius (°C) - (**float**)
- **precipitation** - precipitation measurement in millimeters (mm) - (**float**)
- **pressure** - pressure measurement in Pascals (Pa) - (**float**)
- **snow_depth** - snow depth measurement in centimeters (cm) - (**float**)

In [19]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Load data and perform exploratory analysis
weather = pd.read_csv("london_weather.csv")
print(weather.head())

def preprocess_df(weather, feature_selection, target_var):
    """
    Split dataframe into X and y, and train and test consecutively.
    Then impute and scale both train and test features.
    Returns the train and test sets.
    """
    X = weather[feature_selection].values
    y = weather[target_var].values

    # Combine X and y for easier preprocessing
    combined_data = np.column_stack((X, y))

    # Drop rows with missing values
    combined_data = combined_data[~np.isnan(combined_data).any(axis=1)]

    # Split the data into features (X) and target variable (y)
    X = combined_data[:, :-1]
    y = combined_data[:, -1]

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    
    return X_train, X_test, y_train, y_test

feature_selection = ['cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'min_temp', 'precipitation', 'pressure', 'snow_depth']
target_var = 'mean_temp'

X_train, X_test, y_train, y_test = preprocess_df(weather, feature_selection, target_var)

def predict_and_evaluate(model, x_test, y_test):
    """
    Predict values from test set, calculate and return the root mean squared error.
    """
    y_pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))    
    return rmse

experiment_name = "Weather Prediction"
# Remove the line that creates the experiment
# EXPERIMENT_ID = mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

# Adjust the parameters
max_depth_parameters = [1, 2]

for idx, depth in enumerate([1, 2, 5, 10, 20]):
    parameters = {
        'max_depth': depth,
        'random_state': 1
    }    
    RUN_NAME = f"run_{idx}"
    with mlflow.start_run(run_name=RUN_NAME) as run:
        # Create and train the model
        model = DecisionTreeRegressor(**parameters)
        model.fit(X_train, y_train)

        # Make predictions and evaluate the model
        rmse = predict_and_evaluate(model, X_test, y_test)

        # Log the model and metrics
        mlflow.sklearn.log_model(model, "model")
        mlflow.log_params(parameters)
        mlflow.log_metric("rmse", rmse)



experiment_results = mlflow.search_runs(experiment_names=[experiment_name])
experiment_results

       date  cloud_cover  sunshine  ...  precipitation  pressure  snow_depth
0  19790101          2.0       7.0  ...            0.4  101900.0         9.0
1  19790102          6.0       1.7  ...            0.0  102530.0         8.0
2  19790103          5.0       0.0  ...            0.0  102050.0         4.0
3  19790104          8.0       0.0  ...            0.0  100840.0         2.0
4  19790105          6.0       2.0  ...            0.0  102250.0         1.0

[5 rows x 10 columns]


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.rmse,params.random_state,params.max_depth,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.source.name,tags.mlflow.user
0,56f3d36215f744a6afa5a2af8c94c099,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:35:12.364000+00:00,2023-05-31 13:35:14.513000+00:00,1.258087,1.0,20.0,run_4,LOCAL,"[{""run_id"": ""56f3d36215f744a6afa5a2af8c94c099""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
1,20b64fc8cfb84d979cbee9eae500517c,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:35:10.220000+00:00,2023-05-31 13:35:12.345000+00:00,1.037427,1.0,10.0,run_3,LOCAL,"[{""run_id"": ""20b64fc8cfb84d979cbee9eae500517c""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
2,dce68626f32f4fd29456af97571d550f,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:35:07.992000+00:00,2023-05-31 13:35:10.198000+00:00,1.27314,1.0,5.0,run_2,LOCAL,"[{""run_id"": ""dce68626f32f4fd29456af97571d550f""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
3,2672870851b04a6cb58e88972a823cff,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:35:05.952000+00:00,2023-05-31 13:35:07.974000+00:00,2.323201,1.0,2.0,run_1,LOCAL,"[{""run_id"": ""2672870851b04a6cb58e88972a823cff""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
4,c7c5b5455ee8475e8248306264fd2968,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:35:03.846000+00:00,2023-05-31 13:35:05.934000+00:00,3.416296,1.0,1.0,run_0,LOCAL,"[{""run_id"": ""c7c5b5455ee8475e8248306264fd2968""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
5,a0e05c465d8746209d3021d83217ab06,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:35:00.299000+00:00,2023-05-31 13:35:02.407000+00:00,1.258087,1.0,20.0,run_4,LOCAL,"[{""run_id"": ""a0e05c465d8746209d3021d83217ab06""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
6,884011d9dd1741bc99f2e50c804a9ec2,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:34:58.209000+00:00,2023-05-31 13:35:00.280000+00:00,1.037427,1.0,10.0,run_3,LOCAL,"[{""run_id"": ""884011d9dd1741bc99f2e50c804a9ec2""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
7,cdc3daa370714856a195bf1d87672ff3,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:34:56.103000+00:00,2023-05-31 13:34:58.188000+00:00,1.27314,1.0,5.0,run_2,LOCAL,"[{""run_id"": ""cdc3daa370714856a195bf1d87672ff3""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
8,452840e512744dc5b9c081f9f8059c6e,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:34:54.012000+00:00,2023-05-31 13:34:56.083000+00:00,2.323201,1.0,2.0,run_1,LOCAL,"[{""run_id"": ""452840e512744dc5b9c081f9f8059c6e""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
9,9ed428c073b8439785735b02e8c6aac5,824431996169724647,FINISHED,file:///work/files/workspace/mlruns/8244319961...,2023-05-31 13:34:51.932000+00:00,2023-05-31 13:34:53.993000+00:00,3.416296,1.0,1.0,run_0,LOCAL,"[{""run_id"": ""9ed428c073b8439785735b02e8c6aac5""...",/usr/lib/python3/dist-packages/python_kernel/k...,repl
