# Generate Fake Data

In [41]:
import pandas as pd
from faker import Faker
import random
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import os
from datetime import timedelta

In [None]:
sample = pd.read_csv('sample_df.csv')
sample = sample[sample['module'] != 'GC']
sample = sample[sample['assignment'].str.contains(r'A0[0-9]+')]
sample.head()

In [46]:
sample['assignment'].unique()

array(['A04', 'A02', 'A01', 'A03', 'A01b'], dtype=object)

In [None]:
sample['assignment_full'] = sample['module'] + '.' + sample['assignment']
sample = sample.drop(['module', 'assignment'], axis=1)
sample.head()

In [49]:
# train on sample data
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(sample)
metadata.set_primary_key(column_name='filename')

model = CTGANSynthesizer(metadata)
model.fit(sample)



In [50]:
model.save("sdv-ctgan-uploads.pkl")

In [51]:
# generate fake data
uploads = model.sample(num_rows=10000)

In [52]:
# generate fake usernames
fake = Faker()
unique_usernames = {original: fake.user_name() for original in uploads['username'].unique()}
uploads['username'] = uploads['username'].map(unique_usernames)

uploads.head()

Unnamed: 0,username,datetime,identifier,filename,assignment_full
0,mccoynicholas,2021/05/10 17:43:10,sdv-pii-ag5je,sdv-id-yaNmDU,08.A01
1,lhill,2020/10/15 23:30:47,sdv-pii-a6b43,sdv-id-VJCHOc,02.A01
2,bestmichelle,2023/10/28 08:27:07,sdv-pii-xo3rj,sdv-id-imywbe,05.A04
3,victorkim,2021/05/08 08:58:38,sdv-pii-p16ax,sdv-id-GMdjgm,05.A03
4,justinhughes,2021/05/06 05:53:55,sdv-pii-8lsj0,sdv-id-piRkmB,04.A01


In [54]:
uploads[['module', 'assignment']] = uploads['assignment_full'].str.split('.', expand=True)
uploads.head()

Unnamed: 0,username,datetime,identifier,filename,assignment_full,module,assignment
0,mccoynicholas,2021/05/10 17:43:10,sdv-pii-ag5je,sdv-id-yaNmDU,08.A01,8,A01
1,lhill,2020/10/15 23:30:47,sdv-pii-a6b43,sdv-id-VJCHOc,02.A01,2,A01
2,bestmichelle,2023/10/28 08:27:07,sdv-pii-xo3rj,sdv-id-imywbe,05.A04,5,A04
3,victorkim,2021/05/08 08:58:38,sdv-pii-p16ax,sdv-id-GMdjgm,05.A03,5,A03
4,justinhughes,2021/05/06 05:53:55,sdv-pii-8lsj0,sdv-id-piRkmB,04.A01,4,A01


In [56]:
cohorts = pd.read_csv('data/cohorts.csv')

# Convert launch_start and term_end columns to datetime
cohorts['launch_start_dt'] = pd.to_datetime(cohorts['Launch Start'], format='%b %d, %Y')
cohorts['term_end_dt'] = pd.to_datetime(cohorts['Term End'], format='%b %d, %Y')
cohorts = cohorts[cohorts['Cohort ID'] != 'C11']

# Find terms and their corresponding start and end dates
terms = {row['Cohort ID']: (row['launch_start_dt'], row['term_end_dt']) for _, row in cohorts.iterrows()}
print(terms)

{'C01': (Timestamp('2020-03-01 00:00:00'), Timestamp('2020-05-20 00:00:00')), 'C02': (Timestamp('2020-06-15 00:00:00'), Timestamp('2020-09-02 00:00:00')), 'C03': (Timestamp('2020-08-09 00:00:00'), Timestamp('2020-10-28 00:00:00')), 'C04': (Timestamp('2020-10-04 00:00:00'), Timestamp('2020-12-30 00:00:00')), 'C05': (Timestamp('2021-03-07 00:00:00'), Timestamp('2021-06-02 00:00:00')), 'C06': (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-11-03 00:00:00')), 'C07': (Timestamp('2022-05-08 00:00:00'), Timestamp('2022-08-03 00:00:00')), 'C08': (Timestamp('2023-04-10 00:00:00'), Timestamp('2023-07-19 00:00:00')), 'C09': (Timestamp('2023-10-01 00:00:00'), Timestamp('2024-01-10 00:00:00')), 'C10': (Timestamp('2024-03-03 00:00:00'), Timestamp('2024-06-12 00:00:00'))}


In [57]:
# Restrict a subset of users to fewer terms
usernames = uploads['username'].unique()
subset_usernames = random.sample(list(usernames), int(len(usernames) * 0.5))
user_term_mapping = {username: random.choice(list(terms.keys())) for username in usernames}

# Function to generate a random date within a specified range
def get_random_date(start_date, end_date):
    date_range = (end_date - start_date).days
    random_days = random.randint(0, date_range)
    random_seconds = random.randint(0, date_range)
    random_datetime = start_date + timedelta(days=random_days, seconds=random_seconds)
    return random_datetime.strftime('%Y/%m/%d %H:%M:%S')

uploads2 = uploads.copy()
for username, term in user_term_mapping.items():
    start_date, end_date = terms[term]
    mask = uploads2['username'] == username
    uploads2.loc[mask, 'datetime'] = uploads2.loc[mask, 'datetime'].apply(lambda x: get_random_date(start_date, end_date))

print(uploads2.head())

        username             datetime     identifier       filename  \
0  mccoynicholas  2022/05/09 00:00:11  sdv-pii-ag5je  sdv-id-yaNmDU   
1          lhill  2023/10/09 00:00:53  sdv-pii-a6b43  sdv-id-VJCHOc   
2   bestmichelle  2021/04/04 00:01:13  sdv-pii-xo3rj  sdv-id-imywbe   
3      victorkim  2024/04/06 00:01:05  sdv-pii-p16ax  sdv-id-GMdjgm   
4   justinhughes  2023/07/01 00:00:25  sdv-pii-8lsj0  sdv-id-piRkmB   

  assignment_full module assignment  
0          08.A01     08        A01  
1          02.A01     02        A01  
2          05.A04     05        A04  
3          05.A03     05        A03  
4          04.A01     04        A01  


In [58]:
directory = 'data'
file_path = os.path.join(directory, 'uploads.csv')
uploads2.to_csv(file_path, index=False)