In [1]:
%pwd

'/Users/melihaltin/Documents/Development/data-science/Lung-Cancer/Lung-Cancer/research'

In [2]:
import os 

In [3]:
# Move one directory up from the current working directory.
# NOTE: the path is relative, so every re-run of this cell climbs one more
# level; restart the kernel before re-running the notebook.
os.chdir('../')

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""

    # Output directory of the stage; train.csv and test.csv are written here.
    root_dir: Path
    # CSV file that is read as the input of the stage.
    data_file: Path
    # Filled from the config's `transformed_data_file` entry; no code visible
    # in this notebook reads it -- TODO confirm its intended use.
    transformed_data_file: Path

In [5]:
from Lung_Cancer.constants import *
from Lung_Cancer.utils.common import read_yaml , create_directories


class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> "DataTransformationConfig":
        """Build the config object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig: Paths taken from the
            ``data_transformation`` section of the main config.
        """
        # Keep the section in a local variable. Rebinding ``self.config`` to
        # the sub-section would discard the full config and break any later
        # call on this manager.
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_file=Path(section.data_file),
            transformed_data_file=Path(section.transformed_data_file),
        )
    
        

In [6]:
from Lung_Cancer.logging import logger
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
class DataTransformation:
    """Turn the ingested survey CSV into balanced, scaled train/test CSV files.

    Steps: load the CSV, drop duplicate rows, label-encode ``LUNG_CANCER`` and
    ``GENDER``, oversample with SMOTE, split into train/test, standardize
    ``AGE`` and write ``train.csv`` / ``test.csv`` into ``config.root_dir``.
    """

    def __init__(self, config, random_state=42):
        """Store the stage configuration.

        Args:
            config: Object exposing ``data_file`` (input CSV) and ``root_dir``
                (output directory, must support the ``/`` path operator).
            random_state: Seed used for both SMOTE and the train/test split so
                that repeated runs produce the same files.
        """
        self.config = config
        self.random_state = random_state
        self.data = None

    def load_data(self):
        """Read ``config.data_file`` into ``self.data`` and return the frame."""
        logger.info(f"Loading data from {self.config.data_file}")
        self.data = pd.read_csv(self.config.data_file)
        logger.info("Data loaded successfully")
        return self.data

    def transform_data(self):
        """Produce the train and test frames from the loaded data.

        The loaded frame in ``self.data`` is left untouched; all work happens
        on a de-duplicated copy, so the method can be called repeatedly.

        Returns:
            tuple: ``(train, test)`` DataFrames, each holding the feature
            columns followed by the ``LUNG_CANCER`` target.
        """
        # Load on demand instead of failing with an AttributeError on None.
        if self.data is None:
            self.load_data()

        logger.info("Transforming data")

        # Copy so that the column assignments below neither mutate the raw
        # frame nor trigger SettingWithCopyWarning.
        data = self.data.drop_duplicates().copy()

        # A fresh encoder per column; each column has its own label set.
        for column in ('LUNG_CANCER', 'GENDER'):
            data[column] = LabelEncoder().fit_transform(data[column])

        X = data.drop(['LUNG_CANCER'], axis=1)
        y = data['LUNG_CANCER']

        # NOTE(review): oversampling happens before the split, so synthetic
        # rows in the training set can be interpolated from rows that end up
        # in the test set. Kept as-is to preserve the produced files' shape;
        # consider resampling the training split only.
        sampler = SMOTE(random_state=self.random_state)
        X_smote, y_smote = sampler.fit_resample(X, y)
        X_train, X_test, y_train, y_test = train_test_split(
            X_smote, y_smote, random_state=self.random_state, stratify=y_smote
        )

        # The split results may be views of the resampled frame; copy before
        # overwriting the AGE column.
        X_train = X_train.copy()
        X_test = X_test.copy()

        # Fit the scaler on the training split only, then reuse it for test.
        scaler = StandardScaler()
        X_train['AGE'] = scaler.fit_transform(X_train[['AGE']])
        X_test['AGE'] = scaler.transform(X_test[['AGE']])

        train = pd.concat([X_train, y_train], axis=1)
        test = pd.concat([X_test, y_test], axis=1)

        logger.info("Data transformed successfully")
        return train, test

    def save_data(self, train=None, test=None):
        """Write the train and test frames as CSV files into ``config.root_dir``.

        Args:
            train: Training frame to write. When either frame is omitted, both
                are computed with ``transform_data()``.
            test: Test frame to write.
        """
        logger.info(f"Saving data to {self.config.root_dir}")
        if train is None or test is None:
            train, test = self.transform_data()
        train.to_csv(self.config.root_dir / "train.csv", index=False)
        test.to_csv(self.config.root_dir / "test.csv", index=False)
        logger.info("Data saved successfully")

    def run(self):
        """Execute the stage end to end: load, transform once, save."""
        self.load_data()
        # Pass the frames on so that save_data() does not transform again.
        train, test = self.transform_data()
        self.save_data(train, test)

In [8]:
# Run the data-transformation stage: build its config, then load, transform
# and save the data.
try:
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
    data_transformation.run()
except Exception as e:
    # Log with the traceback and re-raise so that a failed stage does not
    # look like a successful cell run.
    logger.exception(e)
    raise

[2024-04-22 14:21:11,267]- Lung_Cancer.logging - INFO - yaml file: config/config.yaml loaded successfully
[2024-04-22 14:21:11,269]- Lung_Cancer.logging - INFO - yaml file: params.yaml loaded successfully
[2024-04-22 14:21:11,270]- Lung_Cancer.logging - INFO - yaml file: schema.yaml loaded successfully
[2024-04-22 14:21:11,271]- Lung_Cancer.logging - INFO - created directory at: artifacts
[2024-04-22 14:21:11,272]- Lung_Cancer.logging - INFO - created directory at: artifacts/data_transformation
[2024-04-22 14:21:11,272]- Lung_Cancer.logging - INFO - Loading data from artifacts/data_ingestion/survey lung cancer.csv
[2024-04-22 14:21:11,275]- Lung_Cancer.logging - INFO - Data loaded successfully
[2024-04-22 14:21:11,275]- Lung_Cancer.logging - INFO - Transforming data
[2024-04-22 14:21:11,283]- Lung_Cancer.logging - INFO - Data transformed successfully
[2024-04-22 14:21:11,284]- Lung_Cancer.logging - INFO - Saving data to artifacts/data_transformation
[2024-04-22 14:21:11,286]- Lung_Canc