In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global random state so the randomly generated rows are the
# same on every "Restart Kernel -> Run All". The augmentation step below draws
# its randomness from this global state unless told otherwise.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load the original dataset
# The path is relative to the notebook's working directory; change it here
# if the data lives elsewhere
original_df = pd.read_csv('./data.csv')

# Number of entries in the original dataset
original_size = len(original_df)

# Desired size range for the new dataset
desired_min_size = 3000
desired_max_size = 5000

# Number of new entries needed to reach the minimum size. Clamped at zero so
# a dataset that is already large enough never produces a negative count.
new_entries_needed = max(desired_min_size - original_size, 0)

# Function to create new entries by tweaking existing ones
def augment_data(df, num_new_entries, tweak_amount=0.0001,
                 exclude_cols=('id', 'diagnosis'), random_state=None):
    """Create synthetic rows by resampling ``df`` and jittering numeric values.

    Rows are drawn uniformly at random, with replacement, from ``df``. Each
    numeric cell in a column not listed in ``exclude_cols`` is then changed by
    an independent relative amount drawn from
    ``Uniform(-tweak_amount, tweak_amount)``, i.e. ``x + u * x``.
    Non-numeric columns and excluded columns are copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows to sample from. It is not modified.
    num_new_entries : int
        Number of synthetic rows to produce. Zero or a negative number
        yields an empty frame.
    tweak_amount : float, default 0.0001
        Maximum relative perturbation applied to each numeric cell.
    exclude_cols : iterable of column labels, default ('id', 'diagnosis')
        Columns that must never be perturbed.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. When ``None`` the
        global ``numpy.random`` state is used, so ``np.random.seed(...)``
        called beforehand still controls the result.

    Returns
    -------
    pandas.DataFrame
        ``num_new_entries`` rows with the same columns as ``df``. Row labels
        are those of the sampled source rows, so they can repeat; pass
        ``ignore_index=True`` when concatenating. Perturbed columns are
        returned as floats.

    Raises
    ------
    ValueError
        If rows are requested but ``df`` has no rows to sample from.
    """
    if num_new_entries <= 0:
        # Empty slice keeps the column order and dtypes of the source frame
        return df.iloc[0:0].copy()
    if len(df) == 0:
        raise ValueError("Cannot sample new entries from an empty DataFrame")

    # The numpy.random module and a RandomState instance expose the same
    # randint/uniform API, so either can serve as the generator here
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Pick all source rows in one shot instead of calling df.sample() per row
    row_positions = rng.randint(0, len(df), size=num_new_entries)
    new_rows = df.iloc[row_positions].copy()

    # Only numeric columns can be scaled; text/categorical columns are skipped
    # rather than raising a TypeError
    excluded = set(exclude_cols)
    tweak_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col not in excluded
    ]
    if tweak_cols:
        values = new_rows[tweak_cols].to_numpy(dtype=float, na_value=np.nan)
        noise = rng.uniform(-tweak_amount, tweak_amount, size=values.shape)
        # Assigning a plain ndarray is positional, so repeated row labels
        # in new_rows cannot cause alignment problems
        new_rows[tweak_cols] = values + noise * values

    return new_rows

# First pass: synthesize enough rows to lift the dataset to the minimum size
augmented_df = augment_data(df=original_df, num_new_entries=new_entries_needed)

# Stack the synthetic rows under the originals and renumber the index
final_df = pd.concat(objs=[original_df, augmented_df], ignore_index=True)

# Second pass: if the combined frame is still below the maximum size, top it
# up by sampling from the combined frame (originals plus first-pass rows)
if desired_max_size > len(final_df):
    additional_entries_needed = desired_max_size - len(final_df)
    final_df = pd.concat(
        objs=[
            final_df,
            augment_data(df=final_df, num_new_entries=additional_entries_needed),
        ],
        ignore_index=True,
    )

# Persist the result to the working directory, without the index column
final_df.to_csv(path_or_buf='augmented_dat.csv', index=False)

# Report the before/after row counts
print(
    f"Original dataset size: {original_size}",
    f"Augmented dataset size: {len(final_df)}",
    sep="\n",
)


Original dataset size: 569
Augmented dataset size: 5000
