In [10]:
import os
from pathlib import Path
import tarfile
import urllib.request as request

In [11]:
# Remote location of the dataset archive and the local file name it is saved under.
URL = "https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
tarfile_name = "housing.tgz"

# Fetch the archive; the (filename, headers) result is shown as the cell output.
request.urlretrieve(url=URL, filename=tarfile_name)

URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

In [None]:
# Unpack the downloaded archive into the current working directory.
with tarfile.open(tarfile_name) as tar_f:
    # The archive comes from the network, so treat it as untrusted input:
    # the "data" filter rejects absolute paths, path traversal and special
    # files. Older Python patch releases lack the filter argument, hence the
    # feature check before using it.
    if hasattr(tarfile, "data_filter"):
        tar_f.extractall(filter="data")
    else:
        tar_f.extractall()

In [3]:
import pandas as pd

# Read the .domain file; the text before the first ':' on each line is used
# as a column name for the header-less .data file.
with open("CaliforniaHousing/cal_housing.domain", 'r') as f:
    content = f.readlines()

col_name = [line.partition(':')[0] for line in content]

df = pd.read_csv(
    "CaliforniaHousing/cal_housing.data",
    sep=",",
    names=col_name,
)
df.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [7]:
# CONFIG PREVIEW: load the YAML config and display it to inspect its structure

from housing.utils import read_yaml, create_directories

# The config file is looked up one directory above the current working directory.
config_path = Path("..") / "configs" / "config.yaml"
config = read_yaml(config_path)
config

ConfigBox({'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'raw_data_URL': 'https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz', 'tar_filepath': 'artifacts/data_ingestion/housing.tgz', 'untar_filepath': 'artifacts/data_ingestion/CaliforniaHousing', 'prepared_datapath': 'artifacts/data_ingestion/raw_data.csv'}})

In [8]:
# CONSTANTS

# Default YAML locations; both are relative paths, so they resolve against
# the current working directory at the time they are opened.
CONFIG_FILE_PATH = Path("configs") / "config.yaml"
PARAMS_FILE_PATH = Path("params.yaml")

In [9]:
# ENTITY 

from dataclasses import dataclass

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings bundle for the data-ingestion step.

    ``frozen=True`` makes instances read-only after construction. Dataclass
    field annotations are not enforced at runtime, so a field holds whatever
    value the caller passes in.
    """
    # Directory for data-ingestion artifacts (created by ConfigurationManager).
    root_dir: Path
    # URL the dataset archive is downloaded from.
    raw_data_URL: str
    # Local path the downloaded archive is written to.
    tar_filepath: Path
    # Directory the archive is extracted into.
    untar_filepath: Path
    # Destination of the CSV written by DataIngestion.save_data.
    prepared_datapath: Path

In [13]:
## CONFIG 

class ConfigurationManager:
    """Load the project's YAML files and build config entities from them."""

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        # Passes the configured artifacts root to create_directories
        # (presumably creates it if missing -- helper is defined elsewhere).
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the ``data_ingestion`` section.

        Filesystem entries are wrapped in ``Path`` so the values match the
        types declared on DataIngestionConfig; the URL stays as read.

        Returns:
            A DataIngestionConfig populated from the YAML section.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            raw_data_URL=section.raw_data_URL,
            tar_filepath=Path(section.tar_filepath),
            untar_filepath=Path(section.untar_filepath),
            prepared_datapath=Path(section.prepared_datapath),
        )

    def get_data_ingerstion_config(self) -> DataIngestionConfig:
        """Misspelled original name, kept so existing callers keep working."""
        return self.get_data_ingestion_config()

In [14]:
# COMPONENT 

class DataIngestion:
    """Download, extract and convert the housing archive to a single CSV.

    The intended call order is download() -> extract_file() -> prepare()
    -> save_data(). ``config`` must expose ``raw_data_URL``, ``tar_filepath``,
    ``untar_filepath`` and ``prepared_datapath``.
    """

    def __init__(self, config):
        """Store the ingestion settings used by every step."""
        self.config = config

    def download(self):
        """Fetch ``config.raw_data_URL`` and write it to ``config.tar_filepath``."""
        request.urlretrieve(self.config.raw_data_URL, self.config.tar_filepath)

    def extract_file(self):
        """Unpack ``config.tar_filepath`` into ``config.untar_filepath``."""
        # Use the configured archive path rather than a notebook-level global,
        # so the method works on a fresh kernel and honours the config.
        with tarfile.open(self.config.tar_filepath) as tar_f:
            # The archive is downloaded content: apply the "data" extraction
            # filter where this Python version supports it.
            if hasattr(tarfile, "data_filter"):
                tar_f.extractall(self.config.untar_filepath, filter="data")
            else:
                tar_f.extractall(self.config.untar_filepath)

    def prepare(self):
        """Load the extracted data into ``self.df`` with named columns.

        Column names are the text before the first ':' on each non-blank line
        of ``cal_housing.domain``; they are stored in ``self.col_name``. Both
        files are read from the ``CaliforniaHousing`` directory inside
        ``config.untar_filepath``.
        """
        dataset_dir = f"{self.config.untar_filepath}/CaliforniaHousing"

        with open(f"{dataset_dir}/cal_housing.domain", 'r') as f:
            content = f.readlines()

        # Skip blank lines so a trailing newline cannot become a column name.
        self.col_name = [
            line.split(':')[0].strip() for line in content if line.strip()
        ]

        self.df = pd.read_csv(
            f"{dataset_dir}/cal_housing.data", sep=",", names=self.col_name
        )

    def save_data(self):
        """Write ``self.df`` to ``config.prepared_datapath`` without the index.

        ``self.df`` is only set by prepare(), so that must be called first.
        """
        self.df.to_csv(self.config.prepared_datapath, index=False)