In [1]:
import io
import json
import logging
import numpy as np
import pandas as pd
import pickle
import time

from sklearn.datasets import fetch_california_housing
from sqlalchemy import create_engine

from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from typing import Any, Dict, NoReturn, Literal

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository

from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

In [35]:
# Fixed seed so that Restart & Run All reproduces the same train/val/test rows.
SEED = 42

# California housing data as pandas objects (features frame + target series).
housing = fetch_california_housing(as_frame=True)

# First split: training rows vs. a hold-out part (train_test_split's default
# test_size applies here).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    housing["data"], housing["target"], random_state=SEED
)

# Second split: divide the hold-out part evenly into validation and test sets.
# Dedicated hold-out names keep the intermediate split inspectable instead of
# overwriting X_test / y_test in place.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SEED
)

In [40]:
# Inspect the validation features: row count, column names, dtypes and non-null counts.
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2580 entries, 11220 to 1914
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      2580 non-null   float64
 1   HouseAge    2580 non-null   float64
 2   AveRooms    2580 non-null   float64
 3   AveBedrms   2580 non-null   float64
 4   Population  2580 non-null   float64
 5   AveOccup    2580 non-null   float64
 6   Latitude    2580 non-null   float64
 7   Longitude   2580 non-null   float64
dtypes: float64(8)
memory usage: 181.4 KB


In [32]:
# Rebuild a labelled frame from the validation features held in `data`.
# NOTE: `data` and `FEATURES` are assigned in cells that appear further down in
# this file, so those cells must be executed before this one.
pd.DataFrame(data=data["X_val"], columns=FEATURES)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
20154,6.6257,39.0,5.541779,1.026954,939.0,2.530997,34.46,-119.19
12009,4.8167,24.0,5.514735,0.984283,1653.0,3.247544,33.92,-117.55
11258,2.8042,26.0,4.017982,1.079920,2457.0,2.454545,33.81,-117.97
8408,1.7344,35.0,4.449153,1.067797,918.0,3.889831,33.93,-118.35
3191,3.1071,24.0,5.132075,1.056604,442.0,4.169811,36.28,-119.81
...,...,...,...,...,...,...,...,...
679,4.3864,35.0,5.415638,0.919753,1349.0,2.775720,37.68,-122.15
13004,3.8906,15.0,4.832386,0.982955,723.0,2.053977,38.67,-121.27
8441,4.3333,42.0,4.607565,0.997636,1184.0,2.799054,33.91,-118.36
8707,5.8972,35.0,5.663551,1.099688,853.0,2.657321,33.83,-118.34


In [33]:
# Display the validation targets currently bound to the notebook-level `y_val`.
y_val

19686    0.641
9486     1.342
19313    2.683
1919     1.286
15084    1.344
         ...  
5356     4.125
9955     2.167
847      1.514
11235    1.774
17311    2.306
Name: MedHouseVal, Length: 2580, dtype: float64

In [34]:
# Show the validation targets stored under data["y_val"] as a Series.
# NOTE: `data` is populated by a cell that appears further down in this file.
pd.Series(data=data["y_val"])

20154    4.27600
12009    1.63300
11258    2.06300
8408     1.46900
3191     0.56100
          ...   
679      2.05200
13004    1.28700
8441     2.25600
8707     3.50900
10751    5.00001
Name: MedHouseVal, Length: 2580, dtype: float64

In [29]:
# Airflow S3 hook bound to the "s3_connection" connection id.
s3_hook = S3Hook(aws_conn_id="s3_connection")

# Bucket name passed to the hook when fetching objects.
BUCKET = "test-bucket-nicolas-1"

# Column labels applied when feature data is wrapped in a DataFrame.
FEATURES = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "Latitude",
    "Longitude",
]

In [30]:
# Loaded objects keyed by split name ("X_train", "y_val", ...).
data = {}

# Model identifier; it is part of the S3 key the datasets are read from.
m_name = "rf"

for name in ["X_train", "X_test", "X_val", "y_train", "y_test", "y_val"]:
    # Stream the object into memory instead of calling S3Hook.download_file,
    # which writes a temporary file to local disk on every run that this cell
    # never removed.
    buffer = io.BytesIO()
    s3_hook.get_key(
        key=f'NicolasPal/project1/{m_name}/datasets/{name}.pkl',
        bucket_name=BUCKET,
    ).download_fileobj(buffer)
    buffer.seek(0)  # rewind so the reader starts at the first byte

    # NOTE(review): unpickling can execute arbitrary code; only load objects
    # from a bucket whose contents are trusted.
    data[name] = pd.read_pickle(buffer)

[[34m2024-10-08T21:35:09.579+0500[0m] {[34ms3.py:[0m1387} INFO[0m - Downloading source S3 file from Bucket test-bucket-nicolas-1 with path NicolasPal/project1/rf/datasets/X_train.pkl[0m
[[34m2024-10-08T21:35:09.582+0500[0m] {[34mbase.py:[0m84} INFO[0m - Retrieving connection 's3_connection'[0m
[[34m2024-10-08T21:35:09.583+0500[0m] {[34mconnection_wrapper.py:[0m384} INFO[0m - AWS Connection (conn_id='s3_connection', conn_type='aws') credentials retrieved from login and password.[0m
[[34m2024-10-08T21:35:11.296+0500[0m] {[34ms3.py:[0m1387} INFO[0m - Downloading source S3 file from Bucket test-bucket-nicolas-1 with path NicolasPal/project1/rf/datasets/X_test.pkl[0m
[[34m2024-10-08T21:35:11.790+0500[0m] {[34ms3.py:[0m1387} INFO[0m - Downloading source S3 file from Bucket test-bucket-nicolas-1 with path NicolasPal/project1/rf/datasets/X_val.pkl[0m
[[34m2024-10-08T21:35:12.299+0500[0m] {[34ms3.py:[0m1387} INFO[0m - Downloading source S3 file from Bucket tes

In [23]:
# Summarise the training features from `data` after attaching the FEATURES column labels.
pd.DataFrame(data=data["X_train"], columns=FEATURES).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15480 entries, 0 to 15479
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      15480 non-null  float64
 1   HouseAge    15480 non-null  float64
 2   AveRooms    15480 non-null  float64
 3   AveBedrms   15480 non-null  float64
 4   Population  15480 non-null  float64
 5   AveOccup    15480 non-null  float64
 6   Latitude    15480 non-null  float64
 7   Longitude   15480 non-null  float64
dtypes: float64(8)
memory usage: 967.6 KB
