In [None]:
from nlpaug.util.file.download import DownloadUtil

# Note: none of the resources below are fine-tuned models; they are word
# embeddings used to supply contextual information to the augmenters.
# Each call is given dest_dir='.', i.e. the current working directory.
# NOTE(review): these calls run again on every execution of this cell - confirm
# whether DownloadUtil skips files that are already present.
DownloadUtil.download_word2vec(dest_dir='.') # Download word2vec
DownloadUtil.download_glove(model_name='glove.6B', dest_dir='.') # Download GloVe
DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.') # Download fastText


import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.char as nac
import pandas as pd
import random
import os
# Presumably read by the Hugging Face `tokenizers` library behind the
# back-translation models to enable its internal parallelism - TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"]="true"
from joblib import Parallel, delayed

In [None]:
def flatten_list(mixed_list):
    """
    Collapse one level of nesting in a list of strings / lists of strings.

    Strings are kept as they are, the elements of each inner list are spliced
    in at that position, and items of any other type are silently dropped.
    Inner lists are expanded one level only; their elements are not inspected.

    Args:
        mixed_list (list): Items that are either strings or lists of strings.

    Returns:
        list: The flattened items, in their original order.
    """
    return [
        element
        for entry in mixed_list
        if isinstance(entry, (list, str))  # anything else is skipped
        # A bare string is wrapped in a 1-tuple so it is emitted whole rather
        # than iterated character by character.
        for element in (entry if isinstance(entry, list) else (entry,))
    ]


In [None]:
class AdvancedTextAugmenter:
    """
    Bundle of nlpaug augmenters (synonym, word-embedding, back-translation and
    sentence-level) that are applied together to produce variations of a text.
    """

    def __init__(self, languages=None, device="cuda",
                 word2vec_path='GoogleNews-vectors-negative300.bin'):
        """
        Initialize augmenters with optional language support

        Args:
            languages (list): List of language codes for translation augmentation.
                Defaults to ['fr', 'de', 'es'].
            device (str): Device handed to every back-translation augmenter.
                Defaults to "cuda", the value that used to be hard-coded.
            word2vec_path (str): Path of the word2vec model file handed to the
                word-embedding augmenter. Defaults to the previously hard-coded
                'GoogleNews-vectors-negative300.bin'.
        """
        self.languages = languages or ['fr', 'de', 'es']

        # Word-level augmenters
        self.synonym_aug = naw.SynonymAug(aug_src='wordnet')
        self.word_embedding_aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path=word2vec_path
        )

        # Back-translation augmenters: one en -> lang -> en model pair per language
        self.back_translation_augs = [
            naw.BackTranslationAug(
                from_model_name=f'Helsinki-NLP/opus-mt-en-{lang}',
                to_model_name=f'Helsinki-NLP/opus-mt-{lang}-en',
                device=device
            ) for lang in self.languages
        ]

        # Sentence-level augmenter
        self.sentence_aug = nas.RandomSentAug()

    def augment_text(self, text, num_augmentations=5, augmentation_techniques=None, categories=None):
        """
        Apply multiple augmentation techniques to the input text

        The synonym and word-embedding augmenters are each run
        ``num_augmentations // 2`` times; every back-translation augmenter and
        the sentence augmenter contribute one result each. The number of
        returned texts is therefore not guaranteed to equal
        ``num_augmentations``, and duplicates are removed.

        Args:
            text (str): Input text to augment
            num_augmentations (int): Number of augmentations to generate
            augmentation_techniques (list): Specific techniques to use, any of
                'synonym', 'word_embedding', 'back_translation',
                'sentence_swap'. Defaults to all four.
            categories: Label used only in the progress messages.

        Returns:
            list: Unique augmented text variations, in the order in which they
                were first generated.
        """
        # Default augmentation techniques if not specified
        if augmentation_techniques is None:
            augmentation_techniques = [
                'synonym',
                'word_embedding',
                'back_translation',
                'sentence_swap'
            ]
        runs_per_word_augmenter = num_augmentations // 2
        augmented_texts = []

        # Synonym Replacement
        if 'synonym' in augmentation_techniques:
            # Progress is reported only for techniques that actually run.
            print(f"Generating the data for categories {categories} using synonym")
            augmented_texts.extend(
                [self.synonym_aug.augment(text) for _ in range(runs_per_word_augmenter)]
            )

        # Word Embedding Augmentation
        if 'word_embedding' in augmentation_techniques:
            print(f"Generating the data for categories {categories} using word_embedding")
            try:
                augmented_texts.extend(
                    [self.word_embedding_aug.augment(text) for _ in range(runs_per_word_augmenter)]
                )
            except Exception as e:
                print(f"Word embedding augmentation failed: {e}")

        # Back Translation
        if 'back_translation' in augmentation_techniques:
            print(f"Generating the data for categories {categories} using back_translation")
            for translator in self.back_translation_augs:
                try:
                    augmented_texts.append(translator.augment(text)[0])
                except Exception as e:
                    # One failing language pair must not discard the others.
                    print(f"Back translation augmentation failed: {e}")

        # Sentence-level Augmentation
        if 'sentence_swap' in augmentation_techniques:
            print(f"Generating the data for categories {categories} using sentence_swapping")
            augmented_texts.append(self.sentence_aug.augment(text)[0])

        # The augmenters may hand back lists; bring everything to one flat list.
        augmented_texts = flatten_list(augmented_texts)
        # Remove duplicates while keeping first-seen order. dict.fromkeys is
        # used instead of set() so the output order is the same on every run.
        unique_augmented_texts = list(dict.fromkeys(augmented_texts))

        print(f"Total unique generated data text for {categories} is {len(unique_augmented_texts)}")
        return unique_augmented_texts

In [None]:
# Shared augmenter instance, read as a global by
# augment_original_mapped_sub_category. Built with the default languages
# ('fr', 'de', 'es'); every augmenter is constructed here, up front.
text_augmenter = AdvancedTextAugmenter()

In [None]:
def augment_original_mapped_sub_category(df, per_sample_target_count=1000, text_columns=None,category=None,category_column=None, random_state=None):
    """
    Generate augmented text rows for one category that has fewer than
    ``per_sample_target_count`` rows.

    One row of the category is sampled, its ``crimeaditionalinfo`` text is
    passed to the module-level ``text_augmenter``, and the returned variations
    are labelled with the category.

    Args:
        df (pd.DataFrame): Input DataFrame; must contain ``category_column``
            and a ``crimeaditionalinfo`` column.
        per_sample_target_count (int): Target number of rows for the category.
            The shortfall (target minus current row count) is passed to the
            augmenter as ``num_augmentations``.
        text_columns: Accepted for backward compatibility but not used; the
            text is always read from the ``crimeaditionalinfo`` column.
        category: Value of ``category_column`` selecting the rows to augment.
        category_column (str): Name of the column holding the category labels.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sampled base row can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        pd.DataFrame: Columns ``content`` (augmented text) and
            ``category_column`` (the category, repeated). Empty when the
            category already has at least ``per_sample_target_count`` rows.

    Raises:
        ValueError: If no row of ``df`` has ``category`` in ``category_column``.
    """
    print("*"*75)
    print(f"Generation starts for category: {category}")

    # Get subset of current category
    category_subset = df[df[category_column] == category]
    current_count = len(category_subset)
    if current_count == 0:
        # Without this check DataFrame.sample fails with a message that does
        # not mention which category/column combination was empty.
        raise ValueError(
            f"No rows found where {category_column!r} == {category!r}; nothing to sample from"
        )

    needed_count = per_sample_target_count - current_count
    if needed_count <= 0:
        # Category already meets the target: skip the (expensive) augmenters.
        print(f"Category {category} already has {current_count} rows; nothing to generate")
        return pd.DataFrame(columns=["content", category_column])

    # A single randomly chosen row serves as the seed text; to_list() yields a
    # one-element list, which is what gets handed to the augmenter.
    base_sample_text = (
        category_subset.sample(n=1, random_state=random_state)["crimeaditionalinfo"].to_list()
    )
    augmented_variations = text_augmenter.augment_text(
        base_sample_text,
        num_augmentations=needed_count,
        categories=category
    )
    augmented_variations_df = pd.DataFrame({
        "content": augmented_variations,
        category_column: [category] * len(augmented_variations)
    })
    return augmented_variations_df


In [2]:
import pandas as pd
# Load the dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("../data.csv")
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo,final_category,final_subcategory,category_justification,subcategory_justification,confidence_score
0,0.0,Online Cyber Trafficking,Online Trafficking,SIR I HAVE GET SMS WITH PRE APPORVED LOAN IJU...,online_financial_fraud,fraud_callvishing,The content describes a fraudulent loan scheme...,The user was contacted via WhatsApp and pressu...,
1,1.0,Online Cyber Trafficking,Online Trafficking,this number frauder call me I had ordered on a...,online_financial_fraud,debitcredit_card_fraudsim_swap_fraud,The content describes a situation where the us...,The user describes a debit card being used fra...,
2,3.0,Online Cyber Trafficking,Online Trafficking,I have received a notification by chrome he sa...,online_financial_fraud,debitcredit_card_fraudsim_swap_fraud,The content describes a user losing money afte...,The user's account balance being deducted afte...,
3,6.0,Online Cyber Trafficking,Online Trafficking,The app is in playstore with name of the five ...,online_financial_fraud,attacks_or_incidents_affecting_digital_payment...,The content describes a scenario where the use...,The mention of investing money through an app ...,
4,14.0,Online Cyber Trafficking,Online Trafficking,MY TR ID SBIUPI ID mpMTiRBaQPTVUZXBAwlwhDqcsUu...,online_financial_fraud,upi_related_frauds,"The content mentions SBIUPI ID and OTP, indica...",The mention of UPI ID suggests the fraud is re...,


In [None]:
# Generate augmented rows for the "online_financial_fraud" class.
# In the df.head() preview this label appears in the `final_category` column
# (`sub_category` holds values such as "Online Trafficking"), so the filter has
# to run on `final_category`; filtering `sub_category` selects no rows.
# `text_columns` is not used by the function; it is spelled like the real
# column (`crimeaditionalinfo`) only to avoid confusion.
agumented_data = augment_original_mapped_sub_category(
    df,
    text_columns="crimeaditionalinfo",
    category_column="final_category",
    category="online_financial_fraud",
)
# NOTE(review): to_csv also writes the index as an unnamed first column - confirm
# whether downstream readers expect it before adding index=False.
agumented_data.to_csv("Augumented_data_for_online_financial_fraud.csv")