In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Read the raw CSV from the data_ingestion artifacts folder and preview the
# first rows to sanity-check the columns.
dataset = pd.read_csv(
    Path("../artifacts/data_ingestion/raw_data.csv"),
    sep=',',
)
dataset.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [4]:
# train-test split
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
# NOTE(review): the preview of `dataset` shows an 'Unnamed: 0' column (looks
# like a saved row index). With this slice it ends up in X as a feature --
# confirm whether it should be dropped before fitting anything on X.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Standardise the dataset
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# on its own re-fits the scaler on already-scaled data -- restart and run all
# before persisting `scaler`.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
import pickle
# Persist the fitted scaler. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() never closed it).
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [6]:
# ENTITY
from dataclasses import dataclass

@dataclass(frozen=True)
class DataProcessConfig:
    """Immutable settings for the data-processing stage."""
    # Directory for this stage; created by ConfigurationManager.get_data_process_config.
    root_dir: Path
    # CSV file that DataProcess.read_data loads.
    data_filepath: Path
    # Destination the scaler is pickled to by DataProcess.save_scaler_pickle.
    scaled_pickle_file: Path

In [7]:
# CONFIG

from pathlib import Path
from housing.utils import read_yaml, create_directories
from housing.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from housing.entity import DataIngestionConfig

class ConfigurationManager:
    """Build per-stage config objects from the project's YAML config and params.

    On construction both YAML files are read and the directory named by the
    config's ``artifacts_root`` entry is created.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
    ):
        """Read the config and params files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion stage directory and return its config.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the configuration file.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])
        return DataIngestionConfig(
            root_dir=config.root_dir,
            raw_data_URL=config.raw_data_URL,
            tar_filepath=config.tar_filepath,
            untar_filepath=config.untar_filepath,
            prepared_datapath=config.prepared_datapath,
        )

    # The original, misspelled method name is kept as an alias so existing
    # callers keep working.
    get_data_ingerstion_config = get_data_ingestion_config

    def get_data_process_config(self) -> "DataProcessConfig":
        """Create the processing stage directory and return its config.

        Returns:
            A ``DataProcessConfig`` populated from the ``data_process``
            section of the configuration file.
        """
        config = self.config.data_process

        create_directories([config.root_dir])
        data_process_config = DataProcessConfig(
            root_dir=config.root_dir,
            data_filepath=config.data_filepath,
            scaled_pickle_file=config.scaled_pickle_file,
        )
        # Previously the object was built but never returned, so callers
        # received None.
        return data_process_config

In [8]:
# COMPONENT

class DataProcess:
    """Load a CSV dataset, split it, fit a scaler on it and persist the scaler.

    The methods are meant to be called in order:
    ``read_data`` -> ``split_data`` -> ``transform_data`` -> ``save_scaler_pickle``.
    Each step reads attributes set by the previous one.
    """

    def __init__(self, config):
        """Store the stage configuration.

        Args:
            config: Object exposing ``data_filepath`` and
                ``scaled_pickle_file`` attributes (e.g. a ``DataProcessConfig``).
        """
        self.config = config

    def read_data(self):
        """Read the CSV at ``config.data_filepath`` into ``self.df``."""
        data_filepath = self.config.data_filepath
        self.df = pd.read_csv(data_filepath, sep=',')

    def split_data(self, test_size=0.3, random_state=42):
        """Separate features from target and make a train/test split.

        All columns but the last become ``self.X``; the last column becomes
        ``self.y``. The split results are stored as ``self.X_train``,
        ``self.X_test``, ``self.y_train`` and ``self.y_test``.

        Args:
            test_size: Forwarded to ``train_test_split``; defaults to the
                previously hard-coded 0.3.
            random_state: Forwarded to ``train_test_split``; defaults to the
                previously hard-coded 42.
        """
        # NOTE(review): if the CSV carries a saved row-index column such as
        # 'Unnamed: 0', this slice keeps it as a feature -- confirm against
        # the file referenced by config.data_filepath.
        self.X = self.df.iloc[:, :-1]
        self.y = self.df.iloc[:, -1]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )

    def transform_data(self):
        """Fit a ``StandardScaler`` on the training features and scale both splits.

        The fitted scaler is stored in ``self.scaler``. The scaled outputs are
        stored in ``self.X_train_scaled`` and ``self.X_test_scaled``;
        ``self.X_train`` / ``self.X_test`` are left untouched so the method can
        be re-run without re-fitting on already-scaled data.
        """
        self.scaler = StandardScaler()
        # The transformed outputs used to be discarded; keep them.
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

    def save_scaler_pickle(self):
        """Pickle ``self.scaler`` to ``config.scaled_pickle_file``."""
        # The context manager closes the handle even if pickling fails.
        with open(self.config.scaled_pickle_file, 'wb') as pickle_file:
            pickle.dump(self.scaler, pickle_file)