In [1]:
import os

In [2]:
import sys
from pathlib import Path

# Make the project's `src/` directory importable so `hackbeta` can be imported below.
# The directory is discovered by walking up from the current working directory until a
# folder containing `src/hackbeta` is found, so the notebook also works on other
# machines and checkouts. The original hard-coded location is kept as a fallback.
SRC_DIR = next(
    (
        base / 'src'
        for base in (Path.cwd(), *Path.cwd().parents)
        if (base / 'src' / 'hackbeta').is_dir()
    ),
    Path('/Users/admin/HB_IDS/HackBeta/src'),
)

# Re-running the cell must not keep appending the same entry.
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

In [3]:
# Move up to the project root so relative paths such as config/config.yaml resolve.
# The guard keeps the cell idempotent: re-running it once the working directory
# already contains config/config.yaml must not climb one more level.
if not os.path.exists('config/config.yaml'):
    os.chdir('../')

In [4]:
# Show the working directory to confirm we are at the project root.
# Plain Python instead of the bare `pwd` automagic, which IPython only resolves in
# single-line cells and which fails when the notebook is exported as a script.
os.getcwd()

'/Users/admin/HB_IDS/HackBeta'

In [5]:
#entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreparationConfig:
    """Immutable (frozen) settings for the data-preparation stage."""

    # Directory in which the stage writes its output files.
    root_dir: Path
    # CSV file that the stage reads with pandas.
    local_data_file: Path

In [6]:
from hackbeta.constants import *
from hackbeta.utils.common import read_yaml,create_directories
import pandas as pd


In [7]:


class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects.

    Attributes:
        config: Parsed content of the main configuration YAML.
        params: Parsed content of the parameters YAML.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Build the data-preparation settings from the `data_preparation` section.

        The stage's root directory is created as a side effect.

        Returns:
            A DataPreparationConfig whose paths come from the YAML configuration.
        """
        section = self.config['data_preparation']
        stage_root = section['root_dir']

        # The data-preparation output directory must exist before anything is written.
        create_directories([stage_root])

        return DataPreparationConfig(
            root_dir=Path(stage_root),
            local_data_file=Path(section['local_data_file']),  # CSV consumed by the stage
        )

In [9]:
class DataPreparation:
    """Loads the configured CSV, replaces embedded newlines in its text, and saves the result."""

    def __init__(self, config: "DataPreparationConfig"):
        """Store the stage configuration (provides `local_data_file` and `root_dir`)."""
        self.config = config

    def load_data(self):
        """Run the whole stage: read, clean, preview, and save the dataset.

        Reads the CSV at `config.local_data_file`, cleans it with `clean_data`,
        prints the first five rows, and writes the cleaned frame with
        `save_cleaned_data`.

        Returns:
            The cleaned DataFrame.
        """
        df = pd.read_csv(self.config.local_data_file)
        df = self.clean_data(df)
        print(df.head(5))
        self.save_cleaned_data(df)
        return df

    def clean_data(self, df):
        """Return a copy of `df` with every newline inside string values replaced by a space.

        The input frame is left untouched. Only string values are affected:
        numbers, missing values, and other non-string objects (including those
        that share an object column with strings) are passed through unchanged.

        Args:
            df: DataFrame to clean.

        Returns:
            A new DataFrame with the replacements applied.
        """
        # DataFrame.replace with regex=True substitutes inside string values only and
        # returns a new frame, so neither the caller's data nor non-string cells change.
        return df.replace('\n', ' ', regex=True)

    def save_cleaned_data(self, df):
        """Write `df` as `cleaned_data.csv` (without the index) into `config.root_dir`."""
        cleaned_file_path = self.config.root_dir / "cleaned_data.csv"
        df.to_csv(cleaned_file_path, index=False)
        print(f"Cleaned data saved to {cleaned_file_path}")

In [12]:

# Run the data-preparation stage end to end: load the YAML configuration, build the
# stage config, then load/clean the dataset. Errors propagate unchanged; the former
# `try/except ... raise e` wrapper only re-raised and added nothing.
config = ConfigurationManager()
data_preparation_config = config.get_data_preparation_config()
data_preparation = DataPreparation(config=data_preparation_config)
df = data_preparation.load_data()


[2024-04-07 05:56:57,642: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-04-07 05:56:57,664: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-07 05:56:57,679: INFO: common: created directory at: artifacts]
[2024-04-07 05:56:57,706: INFO: common: created directory at: artifacts/data_preparation]


[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


   Unnamed: 0                                                URL  \
0           0  http://www.leathermag.com/news/newsstahl-expan...   
1           1  http://www.leathermag.com/news/newsbirkenstock...   
2           2  http://www.leathermag.com/news/newsmacys-inc-r...   
3           3  http://www.leathermag.com/news/newseu-cracks-d...   
4           4  http://www.leathermag.com/news/newsburberry-ad...   

                                               Title  \
0      Stahl expands ZDHC level 3certified portfolio   
1  Birkenstock Posts 1.492 Billion Revenues in Fi...   
2  Macys Inc. Rejects Unsolicited Proposal from A...   
3  EU Cracks Down on Greenwashing and Misleading ...   
4  Burberry Adjusts Financial Outlook Amidst Luxu...   

                                        TitleAndDate  \
0   Stahl expands ZDHC level 3-certified portfoli...   
1   Birkenstock Posts €1.492 Billion Revenues in ...   
2   Macy’s, Inc. Rejects Unsolicited Proposal fro...   
3   EU Cracks Down on Greenwas

In [11]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

class DataPreparation:
    """Loads the configured CSV, cleans it, and writes a normalized text dump of its content."""

    def __init__(self, config: "DataPreparationConfig"):
        """Store the configuration and prepare the NLTK resources used by `normalize`.

        Args:
            config: Stage configuration providing `local_data_file` and `root_dir`.
        """
        self.config = config
        # NOTE(review): newer NLTK releases may additionally need the 'punkt_tab'
        # resource for word_tokenize — confirm against the installed version.
        for resource in ('punkt', 'wordnet', 'stopwords'):
            nltk.download(resource)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def load_data(self):
        """Run the whole stage: read, clean, preview, and export the dataset.

        Reads the CSV at `config.local_data_file`, cleans it with `clean_data`,
        prints the first five rows, and writes the normalized text with
        `normalize_and_save`.

        Returns:
            The cleaned DataFrame.
        """
        df = pd.read_csv(self.config.local_data_file)
        df = self.clean_data(df)
        print(df.head(5))
        self.normalize_and_save(df)
        return df

    def clean_data(self, df):
        """Return a copy of `df` with every newline inside string values replaced by a space.

        The input frame is left untouched, and non-string values (numbers,
        missing values, other objects) are passed through unchanged.
        """
        # DataFrame.replace with regex=True substitutes inside string values only and
        # returns a new frame, so neither the caller's data nor non-string cells change.
        return df.replace('\n', ' ', regex=True)

    def normalize_and_save(self, df):
        """Join all cell values into one text, normalize it, and save it to a file.

        Cells are joined with single spaces, both within a row and between rows,
        so the last word of one column is never fused with the first word of the
        next. Missing values contribute an empty string rather than the literal
        token "nan". The result of `normalize` is written as UTF-8 to
        `normalized_text.txt` inside `config.root_dir`.
        """
        # Blank out missing cells before stringifying so they do not become "nan"/"None".
        filled = df.astype(object).where(df.notna(), '')
        row_texts = (
            ' '.join(map(str, row))
            for row in filled.itertuples(index=False, name=None)
        )
        aggregated_text = ' '.join(row_texts)

        normalized_text = self.normalize(aggregated_text)
        normalized_file_path = self.config.root_dir / "normalized_text.txt"
        with open(normalized_file_path, 'w', encoding='utf-8') as f:
            f.write(normalized_text)
        print(f"Normalized text saved to {normalized_file_path}")

    def normalize(self, text):
        """Tokenize `text` and return the normalized tokens joined by single spaces.

        Steps, in order: keep only purely alphabetic tokens (this drops numbers,
        punctuation, and tokens containing digits or hyphens), lowercase them,
        remove English stop words, and lemmatize each remaining word as a verb.
        """
        lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
        kept = (word for word in lowered if word not in self.stop_words)
        return ' '.join(self.lemmatizer.lemmatize(word, pos='v') for word in kept)
