In [1]:
import os

In [2]:
%pwd

'e:\\parth\\DonorsChoose-Application-Screening-app\\research'

In [3]:
Project_path = "E:\parth\DonorsChoose-Application-Screening-app"
os.chdir(Project_path)

In [11]:
%pwd

'E:\\parth\\DonorsChoose-Application-Screening-app'

In [4]:

"""config/config.yaml"""

# artifacts_root: artifacts

# data_ingestion:
#   root_dir: artifacts/data_ingestion
#   source_URL: https://drive.google.com/file/d/1-4mgFvbEEvCuIar7eas29F5k8hfbRFn3/view?usp=sharing
#   local_data_file: artifacts/data_ingestion/dataset.zip
#   unzip_dir: artifacts/data_ingestion

# data_cleaning:
#   root_dir: artifacts/data_cleaning
#   local_data_trainfile: artifacts/data_ingestion/dataset/train1000.csv
#   local_data_resourcefile: artifacts/data_ingestion/dataset/resource1000.csv
#   local_data_stopwordsfile: artifacts/data_ingestion/stopwords-en.txt


'config/config.yaml'

In [95]:
"""src/entity/config_entity -  added entity for data clean """

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataCleanConfig:
    root_dir: Path
    local_data_trainfile: Path
    local_data_resourcefile: Path
    local_data_stopwordsfile: Path
    save_clean_datafile: Path

In [96]:
"""data config manager it will create config object for data clean """

from donorschoose.constants import *
from donorschoose.utils.common import read_yaml, create_directories 
# from donorschoose.entity.config_entity import DataCleanConfig
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        # self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_data_clean_config(self) -> DataCleanConfig:
        config = self.config.data_cleaning
        print(config)
        create_directories([config.root_dir])

        data_clean_config = DataCleanConfig(
            root_dir=config.root_dir,
            local_data_trainfile=config.local_data_trainfile,
            local_data_resourcefile=config.local_data_resourcefile,
            local_data_stopwordsfile =config.local_data_stopwordsfile,
            save_clean_datafile = config.save_clean_datafile
        )
        return data_clean_config

In [88]:
# import pandas as pd
# from pathlib import Path

# csv_path = Path(Project_path + r"\artifacts\data_ingestion\dataset\train1000.csv")
# df = read_csv(csv_path)
# print(df.shape)

In [89]:
import os
from donorschoose import logger
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from donorschoose.entity.config_entity import DataCleanConfig
# from donorschoose.config.configuration import ConfigurationManager
from donorschoose.utils.common import read_csv
from donorschoose.utils.common import read_stopwords_txt
from sklearn.preprocessing import StandardScaler

In [91]:

def read_txt_file(path_to_txt: Path) -> list:
    """reads txt file and returns 

    Args:
        path_to_txt (str): path like input

    Raises:
        e: empty file

    Returns:
       list
    """
    try:
        with open(path_to_txt, "r") as f:
            stopwords_string = f.read()

        stopwords = [word.strip() for word in stopwords_string.split('\n') if word.strip()]

        logger.info(f"stopword file: {path_to_txt}, Number of words :{len(stopwords)} loaded successfully")
        return  stopwords
    except Exception as e:
        raise e

def save_csv(data_frame,file_path ):
    """get size in KB

    Args:
        path (Path): path of the file

    filtered_resource_data.to_csv('fi ltered_resource_data1000.csv', index=False) 
    """
    file_path = Path(file_path)
    data_frame.to_csv(file_path, index=False) 
    logger.info(f"save dataframe in {file_path}")

In [78]:
class DataCleaning:
    def __init__(self, config: DataCleanConfig):
        self.config = config
        self.train_df = None
        self.resource_df = None
        self.stopwords= None

    def read_files(self):
        '''
        Fetches data from the specified URLs and returns DataFrames.
        '''
        try: 
            root_dir = self.config.root_dir
            os.makedirs(root_dir, exist_ok=True)

            train_path = Path(self.config.local_data_trainfile)
            resource_path = Path(self.config.local_data_resourcefile)
            stopword_txt_path = Path(self.config.local_data_stopwordsfile)
            self.train_df = read_csv(train_path)
            self.resource_df = read_csv(resource_path)
            self.stopwords = read_txt_file(stopword_txt_path)
            
        except Exception as e:
            raise e
    
    def clean_text_column(self, column:str ):
        """
        this function will remove Spaces and special char from text column
        
        if we have any Nan value then it will fill nan value with 
        whatever have max count category
        """

        self.train_df[column] =self.train_df[column].str.replace(' ','_')
        self.train_df[column] =self.train_df[column].str.replace(',','_')
        self.train_df[column] =self.train_df[column].str.replace('.','')
        self.train_df[column] =self.train_df[column].str.replace('-','_')
        self.train_df[column] =self.train_df[column].str.replace('&','_')
        self.train_df[column] =self.train_df[column].str.replace(' The ','')
        self.train_df[column] =self.train_df[column].str.lower()
        self.train_df[column] =self.train_df[column].str.replace('___','_')
        self.train_df[column] =self.train_df[column].str.replace('__','_')
        
        max_count_id = self.train_df[column].value_counts().idxmax()
        self.train_df[column]=self.train_df[column].fillna(max_count_id)
        
        logger.info(f"preprocessed column :{column} , unique categories Count : {len(self.train_df[column].value_counts())}")


    def add_two_column(self , column1: str , column2: str):
        try:
            if column1 not in self.train_df.columns:
                self.train_df[column1] = self.train_df[column2].astype(str)
            else:
                self.train_df[column1] += self.train_df[column2].astype(str)

        except Exception as e:
            raise e
        
    def decontracted(self ,phrase: str) -> str:
        
            # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        
        return phrase
        

    def preprocess_text(self ,column:str)-> list:
        text_list = self.train_df[column].values
        preprocessed_text = []
        try:
            for sentance in tqdm(text_list):
                sent = self.decontracted(sentance)
                sent = sent.replace('\\r', ' ')
                sent = sent.replace('\\n', ' ')
                sent = sent.replace('\\"', ' ')
                sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
                sent = ' '.join(e for e in sent.split() if e.lower() not in self.stopwords)
                preprocessed_text.append(sent.lower().strip())
            logger.info(f"preprocessed column :{column}")
            self.train_df[column]=preprocessed_text
        
        except Exception as e:
            raise e

    def merge_csv(self):
        """
        joining two dataframes in python
        """
        try:
            self.resource_df = self.resource_df.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
            self.train_df =pd.merge(self.train_df, self.resource_df, on='id', how='left')
            logger.info(f"aggrigated resourde df and merged both df")
        except Exception as e:
            raise e
    
    def normalize_column(self ,column: str):
        try:
            scaler = StandardScaler()
            scaler.fit(self.train_df[column].values.reshape(-1, 1))
            self.train_df[column]=scaler.transform(self.train_df[column].values.reshape(-1, 1) )
            logger.info(f"normalized column : {column}")
        except Exception as e:
            raise e
        
    def drop_colums(self , column_list : list):
        self.train_df = self.train_df.drop(column_list, axis=1)
        logger.info(f"dropped columns:{column_list}")
        
    def rename_column(self ,columnNanme:str , newName : str):
        self.train_df = self.train_df.rename(columns={
            columnNanme: newName,
            })
        logger.info(f"renamed column:{columnNanme} to {newName}")
        
    
    

In [97]:


config = ConfigurationManager()
data_clean_config = config.get_data_clean_config()
data_clean = DataCleaning(config=data_clean_config)

data_clean.read_files()
data_clean.merge_csv()

data_clean.clean_text_column("project_subject_categories")
data_clean.clean_text_column("project_subject_subcategories")
data_clean.clean_text_column("project_grade_category")
data_clean.clean_text_column("teacher_prefix")
data_clean.clean_text_column("school_state")

data_clean.preprocess_text("project_title")

data_clean.add_two_column("essay" , "project_essay_1")
data_clean.add_two_column("essay" , "project_essay_2")
data_clean.add_two_column("essay" , "project_essay_3")
data_clean.add_two_column("essay" , "project_essay_4")
data_clean.preprocess_text("essay")
data_clean.normalize_column("price")
data_clean.normalize_column("quantity")
data_clean.rename_column("project_subject_categories" ,"clean_categories")
data_clean.rename_column("project_subject_subcategories" ,"clean_subcategories")
drop_columns_list = ["project_essay_1",
                "project_essay_2",
                "project_essay_3",
                "project_essay_4",
                "teacher_id",
                "project_submitted_datetime",
                "id"]
data_clean.drop_colums(drop_columns_list)
save_csv(data_clean.train_df ,data_clean_config.save_clean_datafile)


[2024-02-09 14:53:24,429: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-09 14:53:24,433: INFO: common: created directory at: artifacts]
{'root_dir': 'artifacts/data_cleaning', 'local_data_trainfile': 'artifacts\\data_ingestion\\dataset\\filtered_train_data1000.csv', 'local_data_resourcefile': 'artifacts\\data_ingestion\\dataset\\filtered_resource_data1000.csv', 'local_data_stopwordsfile': 'artifacts\\data_ingestion\\dataset\\stopwords-en.txt', 'save_clean_datafile': 'artifacts/data_cleaning/clean_data.csv'}
[2024-02-09 14:53:24,437: INFO: common: created directory at: artifacts/data_cleaning]
[2024-02-09 14:53:24,499: INFO: common: csv file: artifacts\data_ingestion\dataset\filtered_train_data1000.csv, df Shape:(1000, 16) loaded successfully]
[2024-02-09 14:53:24,516: INFO: common: csv file: artifacts\data_ingestion\dataset\filtered_resource_data1000.csv, df Shape:(5650, 4) loaded successfully]
[2024-02-09 14:53:24,531: INFO: 555177667: stopword file: artifa

100%|██████████| 1000/1000 [00:00<00:00, 2006.35it/s]

[2024-02-09 14:53:25,310: INFO: 3771689221: preprocessed column :project_title]



100%|██████████| 1000/1000 [00:05<00:00, 170.99it/s]

[2024-02-09 14:53:31,208: INFO: 3771689221: preprocessed column :essay]
[2024-02-09 14:53:31,215: INFO: 3771689221: normalized column : price]
[2024-02-09 14:53:31,219: INFO: 3771689221: normalized column : quantity]
[2024-02-09 14:53:31,233: INFO: 3771689221: renamed column:project_subject_categories to clean_categories]
[2024-02-09 14:53:31,234: INFO: 3771689221: renamed column:project_subject_subcategories to clean_subcategories]
[2024-02-09 14:53:31,248: INFO: 3771689221: dropped columns:['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4', 'teacher_id', 'project_submitted_datetime', 'id']]





[2024-02-09 14:53:31,534: INFO: 555177667: save dataframe in artifacts\data_cleaning\clean_data.csv]


In [85]:
data_clean.train_df.head(2)

Unnamed: 0,teacher_prefix,school_state,project_grade_category,clean_categories,clean_subcategories,project_title,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,price,quantity,essay
0,mrs,in,grades_prek_2,literacy_language,esl_literacy,educational support for english learners at home,My students need opportunities to practice beg...,0,0,-0.446746,0.236383,my students are english learners that are work...
1,mr,fl,grades_6_8,history_civics_health_sports,civics_government_team_sports,wanted projector for hungry learners,My students need a projector to help with view...,7,1,-0.040664,-0.542591,our students arrive to our school eager to lea...


In [47]:
# path_to_txt = Path(r"E:\parth\DonorsChoose-Application-Screening-app\artifacts\data_ingestion\dataset\stopwords.txt")
# with open(path_to_txt, "r") as f:
#     stopwords_string = f.read()

# # Split the single string by newline characters and store the elements in a list
# stopwords = [word.strip() for word in stopwords_string.split('\n') if word.strip()]
# stopwords[3]

"'ve"

In [None]:
def drop_colums(self , column_list : list):
    project_data = project_data.drop(column_list, axis=1)
    
def rename_column(columnNanme:str , newName : str):
    project_data = project_data.rename(columns={
        columnNanme: newName,
        })
    

In [None]:
project_data = project_data.drop(["project_essay_1",
                                  "project_essay_2",
                                  "project_essay_3",
                                  "project_essay_4",
                                  "teacher_id",
                                  "Unnamed: 0",
                                  "project_submitted_datetime",
                                  "id"], axis=1)



project_data = project_data.rename(columns={'project_subject_categories': 'clean_categories',
                        'project_subject_subcategories': 'clean_subcategories'})