In [1]:
import os
import sys

In [2]:
# Step one directory up so the relative paths and the `src.` package import used
# further down resolve — presumably the notebook lives one level below the
# project root (TODO confirm).
# NOTE: not idempotent — re-running this cell moves up one more level each time.
os.chdir("../")

In [3]:
import yaml
from ensure import ensure_annotations
from pathlib import Path
from typing import Any
import yaml
from types import SimpleNamespace
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.layers import TextVectorization
from pickle import dump, load
import pickle

In [4]:
def dict_to_namespace(d):
    """Turn a (possibly nested) dict into nested SimpleNamespace objects.

    Every value that is itself a dict is converted recursively, so keys can be
    read with attribute access (``cfg.section.key``). Anything that is not a
    dict - including lists, even lists that contain dicts - is returned
    unchanged.
    """
    if not isinstance(d, dict):
        return d

    converted = {}
    for key, value in d.items():
        converted[key] = dict_to_namespace(value)
    return SimpleNamespace(**converted)

In [5]:
import os
def create_directories(path_to_directories: list, verbose=True):
    """Make sure every directory in ``path_to_directories`` exists.

    Missing parent directories are created as well and directories that already
    exist are left untouched. ``verbose`` is accepted but currently unused.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)

def read_yaml(path_to_yaml: Path):
    """Parse a YAML file and expose its content through attribute access.

    Returns the parsed content passed through ``dict_to_namespace``.
    If the file does not exist, a message is printed and ``None`` is returned,
    so callers have to be prepared for a ``None`` result.
    """
    try:
        with open(path_to_yaml, "r") as stream:
            parsed = yaml.safe_load(stream)
    except FileNotFoundError:
        print(f"File not found: {path_to_yaml}")
        return None
    return dict_to_namespace(parsed)  # nested dicts become namespaces

In [6]:
def load_doc(file_name):
    """Return the full content of a UTF-8 text file as one string.

    ``file_name`` may be a string or a ``Path``.
    """
    return Path(file_name).read_text(encoding='utf-8')

In [7]:
# text = load_doc(r"U:\nlp_project\Image_Sharing_Plateform\data\processed\training_data.txt")

In [8]:
# text

In [9]:
from src.image_sharing_plateform.constants import *

In [10]:
import numpy as np

class TrainTestSplit():
    """Helper for splitting a caption dictionary into training and validation parts."""

    def __init__(self):
        pass

    @staticmethod
    def train_val_split(caption_data, train_size=0.8, shuffle=True, seed=None):
        """Split ``caption_data`` into a training and a validation dictionary.

        Declared as a static method so that it works both as
        ``TrainTestSplit.train_val_split(data)`` and on an instance.

        Args:
            caption_data: dict mapping an image name to its caption(s).
            train_size: fraction of the images that goes into the training part.
            shuffle: shuffle the image names before splitting.
            seed: optional seed for a reproducible shuffle. When ``None`` the
                global NumPy random state is used, as before.

        Returns:
            ``(training_data, validation_data)`` - two dicts holding the same
            values as ``caption_data``; every image ends up in exactly one.
        """
        # 1. Get the list of all image names
        all_images = list(caption_data.keys())

        # 2. Shuffle if necessary
        if shuffle:
            if seed is None:
                np.random.shuffle(all_images)
            else:
                np.random.default_rng(seed).shuffle(all_images)

        # 3. Split into training and validation sets
        n_train = int(len(all_images) * train_size)
        training_data = {name: caption_data[name] for name in all_images[:n_train]}
        validation_data = {name: caption_data[name] for name in all_images[n_train:]}

        # 4. Return the splits
        return training_data, validation_data

In [11]:
# 8092*0.8

In [12]:
# 8092-6473

In [13]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data transformation stage.

    The values are copied unchanged from the ``data_transformation`` section of
    the YAML configuration. NOTE(review): the location fields are annotated as
    ``Path``, but the code in this notebook joins them with ``+ "/"``, i.e. it
    treats them as plain strings - confirm which type is intended.
    """

    # Location of the original images.
    original_image_data_path: Path
    # Directory that holds the raw caption file.
    original_caption_data_path: Path
    # Directory the cleaned caption text files are written to and read from.
    preprocessed_data_path: Path
    # Directory that holds the pickled image features.
    extracted_features_path: Path
    # Directory the training pickle is written to.
    training_data: Path
    # Directory the validation pickle is written to.
    validation_data: Path
    # Directory the fitted vectorizer is saved in.
    vectorizer_path: Path
    # Output sequence length used by the text vectorizer.
    SEQ_LENGTH: int

In [14]:
class DataTransformationConfigurationManager:
    """Reads the project configuration and builds the data transformation settings."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        # read_yaml prints a message and returns None when the file is missing,
        # in which case get_data_transformation_config() fails on attribute access.
        self.config = read_yaml(config_filepath)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the output directories and return the stage configuration.

        Reads the ``data_transformation`` section of the loaded configuration,
        makes sure every directory this stage writes to exists and copies the
        values into a ``DataTransformationConfig``.
        """
        config = self.config.data_transformation

        # preprocessed_data_path is included because the cleaned caption text
        # files are written into it; without it that write depends on the
        # directory already existing.
        create_directories([
            config.preprocessed_data_path,
            config.training_data,
            config.validation_data,
            config.vectorizer_path,
        ])

        return DataTransformationConfig(
            original_image_data_path=config.original_image_data_path,
            original_caption_data_path=config.original_caption_data_path,
            preprocessed_data_path=config.preprocessed_data_path,
            extracted_features_path=config.extracted_features_path,
            training_data=config.training_data,
            validation_data=config.validation_data,
            vectorizer_path=config.vectorizer_path,
            SEQ_LENGTH=config.SEQ_LENGTH,
        )

In [None]:
class DataTransformation:
    """Caption cleaning, feature selection, tokenisation and persistence steps.

    All locations are taken from ``self.config``; they may be strings or
    ``Path`` objects.
    """

    def __init__(self, config: "DataTransformationConfig"):
        self.config = config

    def all_img_captions(self):
        """Group the raw captions by image.

        Reads ``Flickr8k.token.txt`` from ``original_caption_data_path``. Every
        line that contains a tab is split into an image token and a caption;
        lines without a tab are ignored.

        Returns:
            dict mapping the image name to the list of all its captions, in
            file order.
        """
        token_file = Path(self.config.original_caption_data_path) / "Flickr8k.token.txt"
        raw_text = load_doc(token_file)

        descriptions = {}
        for line in raw_text.split('\n'):
            if '\t' not in line:
                continue
            # Split on the first tab only so a tab inside the caption text does
            # not break the unpacking.
            img, caption = line.split('\t', 1)
            # The image token carries a caption-index suffix (presumably
            # "<name>#<n>"); cut at the last '#' instead of assuming that the
            # suffix is exactly two characters long.
            image_id = img.rsplit('#', 1)[0]
            descriptions.setdefault(image_id, []).append(caption)
        return descriptions

    def cleaning_text(self, captions):
        """Normalise every caption and wrap it in ``<start>`` / ``<end>``.

        Per caption: hyphens become spaces, words are lower-cased, punctuation
        is removed, and tokens that are a single character or contain
        non-alphabetic characters are dropped.

        The lists inside ``captions`` are updated in place and the same dict is
        returned. Not idempotent: a second pass would wrap the markers again.
        """
        table = str.maketrans('', '', string.punctuation)
        for img, caps in captions.items():
            for i, img_caption in enumerate(caps):
                # str.replace returns a new string; the result has to be used,
                # otherwise "black-and-white" is fused into "blackandwhite".
                words = img_caption.replace("-", " ").split()

                # lower-case and strip punctuation from each token
                words = [word.lower().translate(table) for word in words]

                # drop leftovers such as a hanging "s" or "a", and tokens with digits
                words = [word for word in words if len(word) > 1 and word.isalpha()]

                captions[img][i] = '<start> ' + " ".join(words) + ' <end>'
        return captions

    def save_descriptions(self, descriptions, filename):
        """Write ``descriptions`` as ``image<TAB>caption`` lines.

        The file is created in ``preprocessed_data_path``; lines are joined
        with a newline and there is no trailing newline.
        """
        lines = [
            key + '\t' + desc
            for key, desc_list in descriptions.items()
            for desc in desc_list
        ]
        cleaned_data_path = Path(self.config.preprocessed_data_path) / filename

        with open(cleaned_data_path, "w", encoding="utf-8") as file:
            file.write("\n".join(lines))

    def create_photos(self, filename):
        """Return the set of image names found in a saved description file.

        ``filename`` is looked up in ``preprocessed_data_path``; the image name
        is the text before the first tab of each line.
        """
        file_path = Path(self.config.preprocessed_data_path) / filename
        text = load_doc(file_path)
        # Skip blank lines instead of unconditionally dropping the last line:
        # save_descriptions writes no trailing newline, so the last line is data.
        return {line.split("\t")[0] for line in text.split("\n") if line.strip()}

    def create_features(self, photos, filename):
        """Select the stored features for the given images.

        Loads the pickled feature mapping ``filename`` from
        ``extracted_features_path``.

        Returns:
            dict with one entry per image in ``photos``; images without stored
            features map to an empty list.
        """
        file_path = Path(self.config.extracted_features_path) / filename
        # NOTE(security): unpickling can execute arbitrary code - only load
        # feature files that were produced by this project.
        with open(file_path, "rb") as handle:
            all_features = pickle.load(handle)
        return {ph: all_features.get(ph, []) for ph in photos}

    def clean_final_data(self, data_captions, data_images, data_features, feature_shape=(512,)):
        """Drop every image whose feature vector does not have the expected shape.

        An image listed in ``data_captions`` but missing from ``data_features``
        is treated as invalid as well. All three containers are modified in
        place.

        Returns:
            ``(data_captions, data_features, data_images)`` - note that this
            order differs from the parameter order (features come before
            images), so unpack accordingly.
        """
        expected_shape = tuple(feature_shape)
        invalid_ids = [
            img_id
            for img_id in data_captions
            if np.array(data_features.get(img_id, [])).shape != expected_shape
        ]

        # Delete only after the scan so the dict is not changed while iterating.
        for img_id in invalid_ids:
            del data_captions[img_id]
            data_features.pop(img_id, None)
            data_images.discard(img_id)

        return data_captions, data_features, data_images

    def get_vectorizer(self, train_data_captions, max_tokens=7151):
        """Fit a ``TextVectorization`` layer on all training captions.

        Args:
            train_data_captions: dict mapping image name to a list of captions.
            max_tokens: vocabulary size limit passed to the layer.

        Returns:
            the adapted layer, producing integer sequences of length
            ``self.config.SEQ_LENGTH``.
        """
        # The layer is adapted on one flat list of caption strings.
        all_desc = [cap for caps in train_data_captions.values() for cap in caps]

        vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens,
            output_mode="int",
            output_sequence_length=self.config.SEQ_LENGTH,
        )
        vectorizer.adapt(all_desc)

        return vectorizer

    def vectorize_data(self, vectorizer, data):
        """Tokenise the captions in ``data`` with ``vectorizer``.

        Returns:
            dict mapping image name to ``vectorizer`` applied to a one-element
            list holding the wrapped caption.
        """
        # NOTE(review): the key is the image id, so each caption overwrites the
        # previous one and only the LAST caption per image survives - confirm
        # whether all captions should be kept. Left unchanged because consumers
        # of the saved data may rely on the one-caption-per-image layout.
        # NOTE(review): captions produced by cleaning_text already carry
        # <start>/<end>, so they are wrapped a second time here.
        tokenized_data = {img_id: vectorizer([f"<start> {cap} <end>"]) for img_id, caps in data.items() for cap in caps}
        return tokenized_data

    def save_training_validation_data(self, data_caption, data_image, data_feature, data):
        """Pickle captions, images and features into one file.

        ``data == "train"`` writes ``train_data.pkl`` into the training
        directory; any other value writes ``validation_data.pkl`` into the
        validation directory.
        """
        if data == "train":
            path = Path(self.config.training_data) / "train_data.pkl"
        else:
            path = Path(self.config.validation_data) / "validation_data.pkl"

        payload = {
            "caption_data": data_caption,
            "image_data": data_image,
            "feature_data": data_feature,
        }
        with open(path, "wb") as file:
            pickle.dump(payload, file)

In [16]:
# STAGE 1
#   1. Create the configuration manager and read the data transformation config.
#   2. Build 'descriptions': image name -> list of all captions of that image.
#   3. Clean the captions (lower-case, strip punctuation, drop one-letter and
#      non-alphabetic tokens, add <start>/<end>).
#   4. Split the cleaned data into 'training_data' and 'validation_data'.
#   5. Save both splits as .txt files.
#
# STAGE 2
#   1. Read training_data.txt / validation_data.txt to get the image names.
#   2. Look up the extracted feature vector of every image.
#   3. Drop images whose feature vector does not have the expected shape.
#   4. Fit the vectorizer on the training captions and vectorize both splits.
#   5. Save captions, images and features of both splits.
#   6. Save the vectorizer for later use.

SPLIT_SEED = 42  # fixed so that Restart & Run All reproduces the same split

config = DataTransformationConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

descriptions = data_transformation.all_img_captions()
full_data = data_transformation.cleaning_text(descriptions)

np.random.seed(SPLIT_SEED)
training_data, validation_data = TrainTestSplit.train_val_split(full_data)

data_transformation.save_descriptions(training_data, "training_data.txt")
data_transformation.save_descriptions(validation_data, "validation_data.txt")

# --- training split ---
train_data_images = data_transformation.create_photos("training_data.txt")
train_data_features = data_transformation.create_features(train_data_images, "extracted_features.p")
# clean_final_data returns (captions, features, images) - in that order. The
# previous unpacking used (captions, images, features), which swapped the two
# and stored them under the wrong keys in the saved pickle. NOTE(review): files
# written by earlier runs still have "image_data" and "feature_data" swapped.
train_data_captions, train_data_features, train_data_images = data_transformation.clean_final_data(
    training_data, train_data_images, train_data_features
)

# clean_final_data filters in place, so training_data is already the cleaned dict.
vectorizer = data_transformation.get_vectorizer(training_data)
train_data_captions = data_transformation.vectorize_data(vectorizer, training_data)

# --- validation split ---
validation_data_images = data_transformation.create_photos("validation_data.txt")
validation_data_features = data_transformation.create_features(validation_data_images, "extracted_features.p")
validation_data_captions, validation_data_features, validation_data_images = data_transformation.clean_final_data(
    validation_data, validation_data_images, validation_data_features
)
validation_data_captions = data_transformation.vectorize_data(vectorizer, validation_data_captions)

# --- persist ---
data_transformation.save_training_validation_data(
    train_data_captions, train_data_images, train_data_features, "train"
)
data_transformation.save_training_validation_data(
    validation_data_captions, validation_data_images, validation_data_features, "validation"
)

# The layer is wrapped in a model so it can be written with Model.save();
# a separate name keeps `vectorizer` bound to the layer itself.
vectorizer_model = tf.keras.models.Sequential([vectorizer])
vectorizer_path = os.path.join(str(data_transformation_config.vectorizer_path), "vectorizer")
vectorizer_model.save(vectorizer_path)

INFO:tensorflow:Assets written to: data/processed/vectorizer/vectorizer\assets


In [17]:
# Sanity check: the three training containers should hold the same number of entries.
len(train_data_images),len(train_data_features), len(train_data_captions)

(6472, 6472, 6472)

In [19]:
# Sizes of the validation image collection next to the training caption dict and
# the training features, to compare the split sizes.
len(validation_data_images),len(training_data),len(train_data_features)

(1619, 6472, 6472)