## Bootstrapping the original dataset

In [1]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker with fixed seeds so that Restart & Run All regenerates
# the same synthetic rows on every run.
SEED = 42
Faker.seed(SEED)      # seeds the random generator shared by Faker instances
np.random.seed(SEED)  # seeds NumPy's legacy global RNG used by np.random.* calls
fake = Faker()


In [2]:
# Read the original user-interaction records from disk
df = pd.read_csv(filepath_or_buffer='user_interaction.csv')

In [3]:
# Peek at the last five rows of the original data
df.tail(n=5)

Unnamed: 0,uid,userName,catagory,sub_catagory,product,listing_amount,counts,total_bid_amount
3368,46f99726-e67a-43ea-9bde-1c3c2b2d1a00,talonjameson,Video Games & Consoles,Video Games,PS4 Black Fag signed by Matt Ryan,1,3,9
3369,5dce8cf3-36fc-4689-b13f-345d5d4fae2b,GameOfPops,Video Games & Consoles,Video Games,PS4 Black Fag signed by Matt Ryan,1,2,29
3370,919a3bf3-35d0-4256-91c0-c90f64a20991,ShylockDT,Video Games & Consoles,Video Games,PS4 Black Fag signed by Matt Ryan,1,1,4
3371,e0ce123a-86f5-47b0-b38d-1ccf4c27dea3,popculturejohn,Video Games & Consoles,Video Games,PS4 Black Fag signed by Matt Ryan,1,1,2
3372,5dce8cf3-36fc-4689-b13f-345d5d4fae2b,GameOfPops,Video Games & Consoles,Video Games,Xbox 360 Bkack flag singned by Matt Ryan,1,1,1


In [4]:
# Summarize the loaded frame: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3373 entries, 0 to 3372
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   uid               3373 non-null   object
 1   userName          3373 non-null   object
 2   catagory          3373 non-null   object
 3   sub_catagory      3373 non-null   object
 4   product           3373 non-null   object
 5   listing_amount    3373 non-null   int64 
 6   counts            3373 non-null   int64 
 7   total_bid_amount  3373 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 210.9+ KB


In [5]:
# Target row count for the expanded dataset (original + synthetic rows)
desired_data_points = 100_000

In [6]:
# Function to generate fake data
def generate_fake_data():
    """Build one synthetic user-interaction record.

    Text fields are produced by the module-level ``fake`` Faker instance;
    numeric fields are drawn from NumPy's global random state.

    Returns:
        dict: A record with the keys ``uid``, ``userName``, ``catagory``,
        ``sub_catagory``, ``product``, ``listing_amount``, ``counts`` and
        ``total_bid_amount`` (spelling matches the source CSV columns).
    """
    return {
        'uid': fake.uuid4(),
        'userName': fake.user_name(),
        'catagory': fake.word(),
        'sub_catagory': fake.word(),
        'product': fake.word(),
        # Uniform draw between 10 and 1000, kept to two decimals.
        # NOTE(review): the original listing_amount column is int64 (see
        # df.info()); these float values upcast the combined column to
        # float64 - confirm that is intended.
        'listing_amount': round(np.random.uniform(10, 1000), 2),
        # randint's upper bound is exclusive, so counts fall in 1..19.
        'counts': np.random.randint(1, 20),
        # Poisson samples are already integers, so no rounding is needed.
        'total_bid_amount': np.random.poisson(50)
    }


In [7]:
# Rows still missing to reach the target size
additional_data_points = desired_data_points - len(df)

# Synthesize one record per missing row
fake_data = []
for _ in range(additional_data_points):
    fake_data.append(generate_fake_data())

# Tabulate all synthetic records in a single DataFrame construction
fake_df = pd.DataFrame(data=fake_data)

In [8]:
# Stack the synthetic rows under the original ones and renumber the index
expanded_df = pd.concat(objs=[df, fake_df], axis=0, ignore_index=True)

In [9]:
# Check the combined frame: row count, column dtypes and non-null counts
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   uid               100000 non-null  object 
 1   userName          100000 non-null  object 
 2   catagory          100000 non-null  object 
 3   sub_catagory      100000 non-null  object 
 4   product           100000 non-null  object 
 5   listing_amount    100000 non-null  float64
 6   counts            100000 non-null  int64  
 7   total_bid_amount  100000 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 6.1+ MB


In [10]:
# Peek at the last five rows of the combined data (these are synthetic rows)
expanded_df.tail(n=5)

Unnamed: 0,uid,userName,catagory,sub_catagory,product,listing_amount,counts,total_bid_amount
99995,1e3002b8-e0aa-4394-a9c6-31c98c330c33,bonnieallison,play,born,house,28.49,14,38
99996,3f8c0fc6-3054-4709-ba3a-0874d7481096,christopher36,cultural,figure,way,31.44,5,39
99997,e10c8f5e-44a2-43a6-b91d-47f40c7b5696,lopezspencer,hope,heart,gun,825.51,7,50
99998,a6634bc3-c896-4842-8756-bc25a6324928,harolddavis,return,realize,sometimes,383.06,3,48
99999,3ed438d8-57d0-4dc0-bffe-ebd6747ad76b,markwhite,data,political,improve,890.16,6,41


In [11]:
# Persist the expanded dataset without writing the row index as a column
expanded_df.to_csv(path_or_buf='fake_user_interaction.csv', index=False)