In [1]:
import os
import sqlite3
from dataclasses import dataclass
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Show the working directory before the change.
%pwd
# Step up one directory level — presumably to the project root so that the
# `src.*` imports in the next cell resolve; confirm against the repo layout.
# NOTE(review): this is not idempotent. Re-running the cell without a kernel
# restart moves up one more level each time.
os.chdir('..')
# Show the working directory after the change.
%pwd

'c:\\Users\\karthikeya\\New_Delhi_Reviews'

In [3]:
from src.db_paths import table_name, db_path, query
from src.logger import logger

In [4]:
@dataclass
class DataBaseConfig:
    """Settings used to read from the database.

    Each field defaults to the value of the same-purpose name imported from
    ``src.db_paths`` (``db_path``, ``table_name`` and ``query``); any of them
    can be overridden by passing a value to the constructor.
    """

    # Default: ``db_path`` from ``src.db_paths``.
    database_path: str = db_path
    # Default: ``table_name`` from ``src.db_paths``.
    table: str = table_name
    # Default: ``query`` from ``src.db_paths``.
    query: str = query

In [5]:
class DataBaseHandler:
    """Read the configured SQL query result into a pandas DataFrame.

    The database path, table name and SQL text come from the
    ``DataBaseConfig`` instance created in ``__init__``.
    """

    def __init__(self):
        # Attribute name is kept as-is so existing readers of
        # ``DataBaseConn`` keep working.
        self.DataBaseConn = DataBaseConfig()

    def DataFrameExraction(self):
        """Run the configured query and return the result as a DataFrame.

        A SQLite connection is opened only for the duration of the read and
        is always closed afterwards, whether the read succeeds or fails.

        Returns:
            pandas.DataFrame: rows produced by ``self.DataBaseConn.query``.

        Raises:
            Exception: whatever ``sqlite3.connect`` or
                ``pandas.read_sql_query`` raised. The error is logged and
                re-raised unchanged so its message and traceback survive.
        """
        config = self.DataBaseConn

        try:
            logger.info("Establising Connection With SQL Database")
            conn = sqlite3.connect(config.database_path)
            logger.info("Successfully connected to the SQLite database.")
        except Exception:
            # ``except Exception`` (not a bare ``except``) so Ctrl-C and
            # interpreter shutdown are not intercepted.
            logger.exception(
                f"Could not connect to the SQLite database at {config.database_path}"
            )
            raise

        try:
            logger.info(f"Reading {config.table} table ")
            df = pd.read_sql_query(sql=config.query, con=conn)
            logger.info(f"Successfully read the {config.table} as pandas dataframe")
            return df
        except Exception:
            logger.exception(f"Could not read the {config.table} table")
            raise
        finally:
            # Release the database file handle on every path.
            conn.close()

In [6]:
# Build the handler. Despite its name, `connection` is a DataBaseHandler
# instance, not a raw sqlite3 connection.
connection = DataBaseHandler()
# Run the configured query and keep the result as `df`.
df = connection.DataFrameExraction()

[2024-12-13 22:12:28,530, 882292963, INFO, Establising Connection With SQL Database ]
[2024-12-13 22:12:28,532, 882292963, INFO, Successfully connected to the SQLite database. ]
[2024-12-13 22:12:28,534, 882292963, INFO, Reading New_Delhi_Reviews table  ]
[2024-12-13 22:12:28,883, 882292963, INFO, Successfully read the New_Delhi_Reviews as pandas dataframe ]


In [11]:
@dataclass
class DataIngestionConfig:
    """Destination paths for the CSV files written during data ingestion.

    The ``str`` annotations are required: ``@dataclass`` only turns annotated
    class attributes into fields. With them, each path can be overridden
    through the constructor, e.g. ``DataIngestionConfig(raw_data_path=...)``.
    """

    # Full copy of the ingested data.
    raw_data_path: str = os.path.join('artifacts', 'raw_data.csv')
    # Training partition of the split.
    train_data_path: str = os.path.join('artifacts', 'train_data.csv')
    # Test partition of the split.
    test_data_path: str = os.path.join('artifacts', 'test_data.csv')


class DataIngestion:
    """Fetch the raw data and persist raw, train and test CSV files.

    Output locations come from the ``DataIngestionConfig`` instance created
    in ``__init__``.
    """

    def __init__(self):
        self.data_config = DataIngestionConfig()

    def initate_data_ingestion(self, test_size=0.3, random_state=42):
        """Read the data, write it to CSV, then split it into train/test CSVs.

        Args:
            test_size: Passed to ``train_test_split``. The default ``0.3``
                matches the previously hard-coded value.
            random_state: Passed to ``train_test_split``. The default ``42``
                matches the previously hard-coded value.

        Returns:
            tuple: ``(train_data_path, test_data_path)`` as configured.

        Raises:
            Exception: whatever the database read, the split, or a CSV write
                raised. The error is logged and re-raised unchanged so the
                real cause stays visible.
        """
        logger.info("Initiating data ingestion")
        config = self.data_config

        try:
            db_handler = DataBaseHandler()
            raw_data = db_handler.DataFrameExraction()

            # Create the parent directory of every configured output path
            # rather than a hard-coded "artifacts" folder, so a customised
            # config still works.
            for out_path in (
                config.raw_data_path,
                config.train_data_path,
                config.test_data_path,
            ):
                parent_dir = os.path.dirname(out_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)

            raw_data.to_csv(config.raw_data_path, header=True, index=False)
            logger.info(f"succesfully ingested the raw data as a csv file into {config.raw_data_path}")

            logger.info("Initiating train test split")
            train_data, test_data = train_test_split(
                raw_data, test_size=test_size, random_state=random_state
            )
            train_data.to_csv(config.train_data_path, header=True, index=False)
            test_data.to_csv(config.test_data_path, header=True, index=False)
            logger.info(f"train and test data split successful and stored respectively as csv files at {config.train_data_path}, {config.test_data_path}")

            return config.train_data_path, config.test_data_path
        except Exception:
            # ``except Exception`` (not a bare ``except``) so Ctrl-C is not
            # intercepted; re-raise the original error instead of an empty one.
            logger.exception("Data ingestion failed")
            raise

In [12]:
# Run the ingestion pipeline defined above: it writes the raw, train and test
# CSV files. The call is the last expression, so the cell displays the
# returned (train_path, test_path) tuple.
data_ingest = DataIngestion()
data_ingest.initate_data_ingestion()

[2024-12-13 22:13:18,097, 3806098344, INFO, Initiating data ingestion ]
[2024-12-13 22:13:18,099, 882292963, INFO, Establising Connection With SQL Database ]
[2024-12-13 22:13:18,101, 882292963, INFO, Successfully connected to the SQLite database. ]
[2024-12-13 22:13:18,102, 882292963, INFO, Reading New_Delhi_Reviews table  ]
[2024-12-13 22:13:18,439, 882292963, INFO, Successfully read the New_Delhi_Reviews as pandas dataframe ]
[2024-12-13 22:13:20,028, 3806098344, INFO, succesfully ingested the raw data as a csv file into artifacts\raw_data.csv ]
[2024-12-13 22:13:20,029, 3806098344, INFO, Initiating train test split ]
[2024-12-13 22:13:21,763, 3806098344, INFO, train and test data split successful and stored respectively as csv files at artifacts\train_data.csv, artifacts\test_data.csv ]


('artifacts\\train_data.csv', 'artifacts\\test_data.csv')