In [5]:
import os


In [6]:
# Step up one directory so the relative paths used below resolve from the parent folder.
# NOTE: not idempotent — re-running this cell moves up one more level each time.
os.chdir("../")

In [7]:
# Display the current working directory to confirm where relative paths resolve from.
%pwd

'e:\\parth\\DonorsChoose-Application-Screening-app'

In [10]:
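# Reference copy of the `data_transformation` section expected in the YAML config
# (these keys are read by ConfigurationManager.get_data_transform_config):
# data_transformation:
#   root_dir: artifacts/data_transform
#   local_data_cleanfile: artifacts/data_cleaning/clean_data.csv
#   save_transformed_datafile: artifacts/data_transform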

In [24]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Working directory of the stage.
        local_data_cleanfile: Location of the cleaned CSV used as input.
        save_transformed_datafile: Configured destination for the transformed
            data.
    """

    # Field order is part of the constructor signature; keep it stable.
    root_dir: Path
    local_data_cleanfile: Path
    save_transformed_datafile: Path

In [25]:
from donorschoose.constants import *
import os
from donorschoose.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project YAML configuration and hands out per-stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the YAML config and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the YAML configuration file.
        """
        self.config = read_yaml(config_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transform_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the loaded config.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            local_data_cleanfile=section.local_data_cleanfile,
            save_transformed_datafile=section.save_transformed_datafile,
        )

In [85]:
import os
from donorschoose import logger
from pathlib import Path
from donorschoose.utils.common import read_csv
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy.sparse import save_npz
from scipy.sparse import hstack

class DataTransformation:
    """Turns the cleaned dataset into train/val/test feature matrices.

    Typical call order: ``read_files`` -> ``split_data`` -> ``tfIdef_text`` ->
    any of ``onehotencoding_feature`` / ``feature_hash`` / ``normalize_column``
    -> ``save_transformed_data``.

    Every encoder is fitted on the training split only and then applied to the
    validation and test splits. Encoded blocks are accumulated column-wise in
    ``X_train_transformed`` / ``X_val_transformed`` / ``X_test_transformed``.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.clean_df = None
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.y_test = None
        self.X_train_transformed = None
        self.X_val_transformed = None
        self.X_test_transformed = None

    # ------------------------------------------------------------------ helpers

    def _require_split(self):
        """Fail with a clear message when the data has not been split yet."""
        if self.X_train is None or self.X_val is None or self.X_test is None:
            raise RuntimeError("split_data() must be called before transforming features")

    def _append_features(self, train_block, val_block, test_block):
        """Column-stack new feature blocks onto the accumulated matrices.

        If nothing has been accumulated yet, the blocks become the initial
        matrices, so the encoding methods can be called in any order.
        """
        def _stack(current, block):
            return hstack([block] if current is None else [current, block])

        self.X_train_transformed = _stack(self.X_train_transformed, train_block)
        self.X_val_transformed = _stack(self.X_val_transformed, val_block)
        self.X_test_transformed = _stack(self.X_test_transformed, test_block)

    # ----------------------------------------------------------------- pipeline

    def read_files(self):
        '''
        Load the cleaned CSV named by ``config.local_data_cleanfile`` into
        ``self.clean_df``, creating ``config.root_dir`` first if it is missing.
        '''
        os.makedirs(self.config.root_dir, exist_ok=True)
        clean_data_path = Path(self.config.local_data_cleanfile)
        self.clean_df = read_csv(clean_data_path)

    def split_data(self, columnName: str, random_state: int = 42):
        """Split ``clean_df`` into stratified train / validation / test sets.

        70% of the rows go to train; the remaining 30% is split again 70/30
        into validation and test.

        Args:
            columnName: Name of the target column; used for stratification and
                removed from the feature frames.
            random_state: Seed for both splits so reruns give the same
                partition.

        Raises:
            RuntimeError: If ``read_files`` has not been called yet.
        """
        if self.clean_df is None:
            raise RuntimeError("read_files() must be called before split_data()")

        X = self.clean_df.drop(columnName, axis=1)
        Y = self.clean_df[columnName]

        self.X_train, X_holdout, self.y_train, y_holdout = train_test_split(
            X, Y, test_size=0.30, stratify=Y, random_state=random_state
        )
        self.X_val, self.X_test, self.y_val, self.y_test = train_test_split(
            X_holdout, y_holdout, test_size=0.30, stratify=y_holdout, random_state=random_state
        )
        logger.info("Data Split into Train val test")
        logger.info(f"Train data shape: {self.X_train.shape} , {self.y_train.shape}")
        logger.info(f"Val data shape: {self.X_val.shape} , {self.y_val.shape}")
        logger.info(f"test data shape: {self.X_test.shape} , {self.y_test.shape}")

    def tfIdef_text(self, columnName: str, min_df: int = 20, max_features: int = 5000):
        """TF-IDF encode a text column and (re)start the feature matrices with it.

        Unlike the other encoders this method replaces any previously
        accumulated features instead of appending to them.

        Args:
            columnName: Text column to vectorize.
            min_df: Minimum document frequency passed to ``TfidfVectorizer``.
            max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        """
        self._require_split()
        tfidf_vectorizer = TfidfVectorizer(min_df=min_df, max_features=max_features)
        self.X_train_transformed = tfidf_vectorizer.fit_transform(self.X_train[columnName])
        self.X_val_transformed = tfidf_vectorizer.transform(self.X_val[columnName])
        self.X_test_transformed = tfidf_vectorizer.transform(self.X_test[columnName])
        logger.info(f"{columnName} transformed to tfidf")

    def onehotencoding_feature(self, column_list: list):
        """One-hot encode the given categorical columns and append the result.

        Categories unseen during fitting are encoded as all zeros
        (``handle_unknown='ignore'``).

        Args:
            column_list: Names of the categorical columns to encode.
        """
        self._require_split()
        columns = list(column_list)

        # Encode exactly the columns the caller asked for.
        encoder = ColumnTransformer(
            transformers=[
                ('onehot', OneHotEncoder(handle_unknown='ignore'), columns)
            ],
            remainder='passthrough'
        )

        X_train_transformed_onehot = encoder.fit_transform(self.X_train[columns])
        X_val_transformed_onehot = encoder.transform(self.X_val[columns])
        X_test_transformed_onehot = encoder.transform(self.X_test[columns])

        self._append_features(
            X_train_transformed_onehot, X_val_transformed_onehot, X_test_transformed_onehot
        )

        logger.info(f"features {column_list} transformed to Onehot encode features")
        logger.info(f"one hot encode features shape:{self.X_train_transformed.shape , self.X_val_transformed.shape, self.X_test_transformed.shape} ")

    def feature_hash(self, columnName: str, n_feature: int):
        """Hash a categorical column into ``n_feature`` buckets and append it.

        Each cell value is treated as a single token. The split frames are left
        untouched, so the method can safely be re-run.

        Args:
            columnName: Column whose values are hashed.
            n_feature: Number of output columns produced by ``FeatureHasher``.
        """
        self._require_split()

        def _as_tokens(frame):
            # FeatureHasher(input_type="string") expects one iterable of
            # strings per sample; wrap each value in a one-element list.
            return [[value] for value in frame[columnName]]

        hasher = FeatureHasher(n_features=n_feature, input_type="string", alternate_sign=False)
        X_train_hashed = hasher.fit_transform(_as_tokens(self.X_train))
        X_val_hashed = hasher.transform(_as_tokens(self.X_val))
        X_test_hashed = hasher.transform(_as_tokens(self.X_test))

        self._append_features(X_train_hashed, X_val_hashed, X_test_hashed)

        logger.info(f"feature hash for {columnName} and number of features {n_feature}  ")
        logger.info(f"feature hash shape:{self.X_train_transformed.shape , self.X_val_transformed.shape, self.X_test_transformed.shape} ")

    def normalize_column(self, columnName: str):
        """Standardize a numeric column (fit on train) and append it.

        Args:
            columnName: Numeric column to scale to zero mean / unit variance.
        """
        self._require_split()

        def _as_column(frame):
            # StandardScaler needs a 2-D array of shape (n_samples, 1).
            return frame[columnName].values.reshape(-1, 1)

        scaler = StandardScaler()
        scaler.fit(_as_column(self.X_train))
        X_train_normalized_feature = scaler.transform(_as_column(self.X_train))
        X_val_normalized_feature = scaler.transform(_as_column(self.X_val))
        X_test_normalized_feature = scaler.transform(_as_column(self.X_test))

        self._append_features(
            X_train_normalized_feature, X_val_normalized_feature, X_test_normalized_feature
        )

        logger.info(f"Normalized feature for {columnName} ")
        logger.info(f"Normalized feature shape:{self.X_train_transformed.shape , self.X_val_transformed.shape, self.X_test_transformed.shape} ")

    def save_transformed_data(self):
        """Scale the accumulated features and write train/val/test ``.npz`` files.

        Features are scaled to unit variance without centering (which keeps
        sparse input sparse), with the scaler fitted on the training split.
        The target is appended as the LAST column *after* scaling, so the
        labels are stored unmodified. Files ``train.npz``, ``val.npz`` and
        ``test.npz`` are written to ``config.root_dir``.

        The in-memory feature matrices are not modified, so calling this method
        more than once produces the same files.

        Raises:
            RuntimeError: If no features have been built yet.
        """
        if self.X_train_transformed is None:
            raise RuntimeError("no transformed features to save; run the encoding steps first")

        scaler = StandardScaler(with_mean=False)
        train_features = scaler.fit_transform(self.X_train_transformed)
        val_features = scaler.transform(self.X_val_transformed)
        test_features = scaler.transform(self.X_test_transformed)

        # Append the labels only after scaling so they are never rescaled.
        train_matrix = hstack([train_features, self.y_train.values.reshape(-1, 1)])
        val_matrix = hstack([val_features, self.y_val.values.reshape(-1, 1)])
        test_matrix = hstack([test_features, self.y_test.values.reshape(-1, 1)])

        # Path(...) / name works whether root_dir is configured as str or Path.
        # NOTE(review): config.save_transformed_datafile is not used here; the
        # original code wrote to root_dir, which is kept — confirm the intent.
        local_dir_path = Path(self.config.root_dir)
        save_npz(local_dir_path / "train.npz", train_matrix)
        save_npz(local_dir_path / "val.npz", val_matrix)
        save_npz(local_dir_path / "test.npz", test_matrix)


            

In [86]:
# Run the transformation stage end to end: load -> split -> encode -> save.
data_Config = ConfigurationManager()
data_transform = DataTransformation(data_Config.get_data_transform_config())
data_transform.read_files()
data_transform.split_data("project_is_approved")

# Text features come first: TF-IDF starts the feature matrices.
data_transform.tfIdef_text("essay")

# Low-cardinality categoricals -> one-hot.
onehot_columns_list = ['teacher_prefix', 'school_state', 'project_grade_category']
data_transform.onehotencoding_feature(onehot_columns_list)

# Category columns -> hashed into a fixed number of buckets.
for hashed_column, bucket_count in (("clean_categories", 500), ("clean_subcategories", 100)):
    data_transform.feature_hash(hashed_column, bucket_count)

# Numeric columns -> standardized.
for numeric_column in ("price", "quantity"):
    data_transform.normalize_column(numeric_column)

data_transform.save_transformed_data()


[2024-02-11 00:04:25,186: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-11 00:04:25,189: INFO: common: created directory at: artifacts]
[2024-02-11 00:04:25,193: INFO: common: created directory at: artifacts/data_transform]
[2024-02-11 00:04:25,240: INFO: common: csv file: artifacts\data_cleaning\clean_data.csv, df Shape:(1000, 12) loaded successfully]
[2024-02-11 00:04:25,250: INFO: 1295837172: Data Split into Train val test]
[2024-02-11 00:04:25,260: INFO: 1295837172: Train data shape: (700, 11) , (700,)]
[2024-02-11 00:04:25,262: INFO: 1295837172: Val data shape: (210, 11) , (210,)]
[2024-02-11 00:04:25,264: INFO: 1295837172: test data shape: (90, 11) , (90,)]
[2024-02-11 00:04:25,679: INFO: 1295837172: essay transformed to tfidf]
[2024-02-11 00:04:25,720: INFO: 1295837172: features ['teacher_prefix', 'school_state', 'project_grade_category'] transformed to Onehot encode features]
[2024-02-11 00:04:25,723: INFO: 1295837172: one hot encode features shape:(