# Artificial Datasets

In [1]:
# code from chatgpt

import pandas as pd
import random
from datetime import datetime
import time

In [10]:
def generate_artificial_random_dataset(n_users,
                                       n_items, 
                                       ts,
                                       all_items_seen,
                                       random_seed,
                                       n_items_to_drift,
                                       sudden_drift_start,
                                       items_freq_list,
                                       save_path):
    
    def save_dataset_atomic_file(df, save_path):
        if save_path:
            # Output the dataset
            df.to_csv(save_path+'.csv', index=False)
            df.to_csv(save_path+'.inter',
                    header=['user_id:token','item_id:token','timestamp:float'], 
                    sep='\t', 
                    index=False)
            print("Dataset with sudden drift created and saved at "+save_path+".")


    users_list = [f'u_{i+1}' for i in range(n_users)]
    items_list = [f'i_{j+1}' for j in range(n_items)]


    if all_items_seen:

        data = []
        for user in users_list:
            for item in items_list:
                all_items_seen_df.append({'user_id': user, 'item_id': item, 'timestamp':ts})

        all_items_seen_df = pd.DataFrame(data)

        # Introduce sudden drift
        random.seed(random_seed)  # For reproducibility
        drift_items_list = random.sample(items_list, k=n_items_to_drift)  
        renamed_items = {item: f'drifted_{item}' for item in drift_items_list}


        def rename_item(row):
            if int(row['user_id'].split('_')[1]) > sudden_drift_start and row['item_id'] in renamed_items:
                return renamed_items[row['item_id']]
            return row['item_id']

        all_items_seen_df['item_id'] = all_items_seen_df.apply(rename_item, axis=1)
        # print(all_items_seen_df.item_id.groupby([all_items_seen_df.user_id, all_items_seen_df.item_id]).count().unstack().fillna(0).astype(int))

        save_dataset_atomic_file(all_items_seen_df, save_path)

    else:
        
        if len(items_freq_list) != len(items_list):
            print('Not all items frequency was specified!')
            return None


        random.seed(random_seed)  # For reproducibility
        sampled_df = pd.DataFrame({})
        for i, freq in enumerate(items_freq_list):
            # print('k ',k)
            user_sample = random.sample(users_list[:sudden_drift_start], k=freq) +\
                            random.sample(users_list[sudden_drift_start:], k=freq)
            temp_df = pd.DataFrame({'user_id': user_sample, 
                                    'item_id': items_list[i]})
            print(temp_df.item_id.groupby([temp_df.user_id, temp_df.item_id]).count().unstack().fillna(0).astype(int))
            sampled_df = pd.concat([sampled_df, temp_df])
        

        # Introduce sudden drift
        # random.seed(random_seed)  # For reproducibility
        # drift_items_list = random.sample(items_list, k=n_items_to_drift)  
        drift_items_list = [items_list[i] for i,x in enumerate(items_freq_list) if x == sudden_drift_start]
        renamed_items = {item: f'drifted_{item}' for item in drift_items_list}
        print('drift_items_list', drift_items_list)
        print('renamed_items', renamed_items)


        def rename_item(row):
            if int(row['user_id'].split('_')[1]) > sudden_drift_start and row['item_id'] in renamed_items:
                return renamed_items[row['item_id']]
            return row['item_id']

        sampled_df['item_id'] = sampled_df.apply(rename_item, axis=1)
        print(sampled_df.item_id.groupby([sampled_df.user_id, sampled_df.item_id]).count().unstack().fillna(0).astype(int))
        

        save_dataset_atomic_file(sampled_df, save_path)





In [6]:
# Parameters
n_users = 10
n_items = 5
sudden_drift_start = 5  # Starting user index for drift (1-indexed)

string = "24/12/2024 21:12:24"
ts = time.mktime(datetime.strptime(string, "%d/%m/%Y %H:%M:%S").timetuple())

all_items_seen = False

random_seed = 42

ratio_to_drift = 2 # Select 50% of items to rename
n_items_to_drift = n_items // ratio_to_drift

save_path = 'sudden_drift_all_items_seen_dataset'#.inter'

items_freq_list = [sudden_drift_start, sudden_drift_start, sudden_drift_start//4, sudden_drift_start//4, sudden_drift_start//4]

# non_drift_items_freq_list = [sudden_drift_start//2, sudden_drift_start//4, sudden_drift_start//3]
# non_drift_items_freq_list = [sudden_drift_start//4, sudden_drift_start//4, sudden_drift_start//4]
# non_drift_items_freq_list = [sudden_drift_start*3//4, sudden_drift_start*3//4, sudden_drift_start*3//4]
# drift_freq_item_list = [1, 1]
# drift_items_freq_list = [sudden_drift_start//2]*n_items_to_drift # yes, just n_items_to_drift (see that code only renames var after deleting lines)

In [11]:
generate_artificial_random_dataset(n_users,
                                    n_items, 
                                    ts,
                                    all_items_seen,
                                    random_seed,
                                    n_items_to_drift,
                                    sudden_drift_start,
                                    items_freq_list,
                                    save_path=None)

item_id  i_1
user_id     
u_1        1
u_10       1
u_2        1
u_3        1
u_4        1
u_5        1
u_6        1
u_7        1
u_8        1
u_9        1
item_id  i_2
user_id     
u_1        1
u_10       1
u_2        1
u_3        1
u_4        1
u_5        1
u_6        1
u_7        1
u_8        1
u_9        1
item_id  i_3
user_id     
u_5        1
u_9        1
item_id  i_4
user_id     
u_2        1
u_9        1
item_id  i_5
user_id     
u_5        1
u_8        1
drift_items_list ['i_1', 'i_2']
renamed_items {'i_1': 'drifted_i_1', 'i_2': 'drifted_i_2'}
item_id  drifted_i_1  drifted_i_2  i_1  i_2  i_3  i_4  i_5
user_id                                                   
u_1                0            0    1    1    0    0    0
u_10               1            1    0    0    0    0    0
u_2                0            0    1    1    0    1    0
u_3                0            0    1    1    0    0    0
u_4                0            0    1    1    0    0    0
u_5                0        

In [52]:
# code from chatgpt

import pandas as pd
import random

# Parameters
num_users = 800
num_items = 5
sudden_drift_start = 400  # Starting user index for drift (1-indexed)

In [51]:
from datetime import datetime
import time

string = "24/12/2024 21:12:24"
ts = time.mktime(datetime.strptime(string, "%d/%m/%Y %H:%M:%S").timetuple())
print(ts)

1735092744.0


In [53]:
# Generate initial dataset
users = [f'u_{i+1}' for i in range(num_users)]
items = [f'i_{j+1}' for j in range(num_items)]
data = []

for user in users:
    for item in items:
        data.append({'user_id': user, 'item_id': item, 'timestamp':ts})

df = pd.DataFrame(data)
df

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_2,1.735093e+09
2,u_1,i_3,1.735093e+09
3,u_1,i_4,1.735093e+09
4,u_1,i_5,1.735093e+09
...,...,...,...
3995,u_800,i_1,1.735093e+09
3996,u_800,i_2,1.735093e+09
3997,u_800,i_3,1.735093e+09
3998,u_800,i_4,1.735093e+09


In [54]:
# Introduce sudden drift
random.seed(42)  # For reproducibility
drift_items = random.sample(items, k=num_items // 2)  # Select 50% of items to rename
renamed_items = {item: f'drifted_{item}' for item in drift_items}

def rename_item(row):
    if int(row['user_id'].split('_')[1]) > sudden_drift_start and row['item_id'] in renamed_items:
        return renamed_items[row['item_id']]
    return row['item_id']

df['item_id'] = df.apply(rename_item, axis=1)

# Output the dataset
df.to_csv("sudden_drift_all_items_seen_dataset.csv", index=False)
print("Dataset with sudden drift created and saved as 'sudden_drift_all_items_seen_dataset.csv'.")


Dataset with sudden drift created and saved as 'sudden_drift_all_items_seen_dataset.csv'.


# different one

In [55]:
# Generate initial dataset
users = [f'u_{i+1}' for i in range(num_users)]
items = [f'i_{j+1}' for j in range(num_items)]
data = []

for user in users:
    for item in items:
        data.append({'user_id': user, 'item_id': item, 'timestamp':ts})

df = pd.DataFrame(data)
df

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_2,1.735093e+09
2,u_1,i_3,1.735093e+09
3,u_1,i_4,1.735093e+09
4,u_1,i_5,1.735093e+09
...,...,...,...
3995,u_800,i_1,1.735093e+09
3996,u_800,i_2,1.735093e+09
3997,u_800,i_3,1.735093e+09
3998,u_800,i_4,1.735093e+09


In [56]:
# Introduce sudden drift
random.seed(42)  # For reproducibility
drift_items = random.sample(items, k=num_items // 2)  # Select 50% of items to rename
renamed_items = {item: f'drifted_{item}' for item in drift_items}

non_drift_items = list(set(items) - set(drift_items))
non_drift_items

['i_4', 'i_3', 'i_2']

In [57]:
freq_item_list = [sudden_drift_start//2, sudden_drift_start//4, sudden_drift_start//3]
freq_item_list

[200, 100, 133]

In [58]:
random.seed(52)  # For reproducibility

to_exclude_df = pd.DataFrame({})

for i, k in enumerate(freq_item_list):
    users_sample = random.sample(users[:sudden_drift_start], k=k)
    item_id = non_drift_items[i]
    temp_df = pd.DataFrame({'user_id': users_sample, 'item_id':item_id})
    print(temp_df.shape)
    to_exclude_df = pd.concat([to_exclude_df, temp_df])

(200, 2)
(100, 2)
(133, 2)


In [59]:
org_sudden_drift_idx = df[df['user_id']=='u_'+str(sudden_drift_start+1)].index[0]
org_sudden_drift_idx

2000

In [60]:
df_till_drift = df.iloc[:org_sudden_drift_idx]
df_till_drift

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_2,1.735093e+09
2,u_1,i_3,1.735093e+09
3,u_1,i_4,1.735093e+09
4,u_1,i_5,1.735093e+09
...,...,...,...
1995,u_400,i_1,1.735093e+09
1996,u_400,i_2,1.735093e+09
1997,u_400,i_3,1.735093e+09
1998,u_400,i_4,1.735093e+09


In [61]:
df_till_drift = df_till_drift[~(df_till_drift.user_id.isin(to_exclude_df.user_id) & df_till_drift.item_id.isin(to_exclude_df.item_id))].reset_index(drop=True)
df_till_drift

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_5,1.735093e+09
2,u_2,i_1,1.735093e+09
3,u_2,i_5,1.735093e+09
4,u_3,i_1,1.735093e+09
...,...,...,...
1110,u_399,i_3,1.735093e+09
1111,u_399,i_4,1.735093e+09
1112,u_399,i_5,1.735093e+09
1113,u_400,i_1,1.735093e+09


In [62]:
new_drift_idx = df_till_drift.index.stop
new_drift_idx

1115

In [63]:
full_df = pd.concat([df_till_drift,df_till_drift]).reset_index(drop=True)
full_df

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_5,1.735093e+09
2,u_2,i_1,1.735093e+09
3,u_2,i_5,1.735093e+09
4,u_3,i_1,1.735093e+09
...,...,...,...
2225,u_399,i_3,1.735093e+09
2226,u_399,i_4,1.735093e+09
2227,u_399,i_5,1.735093e+09
2228,u_400,i_1,1.735093e+09


In [64]:
for i, u in enumerate(users[:sudden_drift_start]):
    _ = full_df.iloc[new_drift_idx:]
    # print(_[_['user_id']==u])
    print(users[sudden_drift_start:][i])
    _.loc[_['user_id']==u, 'user_id'] = users[sudden_drift_start:][i]
    # print(full_df.iloc[new_drift_idx:][full_df['user_id']==u]) # should be empty

u_401
u_402
u_403
u_404
u_405
u_406
u_407
u_408
u_409
u_410
u_411
u_412
u_413
u_414
u_415
u_416
u_417
u_418
u_419
u_420
u_421
u_422
u_423
u_424
u_425
u_426
u_427
u_428
u_429
u_430
u_431
u_432
u_433
u_434
u_435
u_436
u_437
u_438
u_439
u_440
u_441
u_442
u_443
u_444
u_445
u_446
u_447
u_448
u_449
u_450
u_451
u_452
u_453
u_454
u_455
u_456
u_457
u_458
u_459
u_460
u_461
u_462
u_463
u_464
u_465
u_466
u_467
u_468
u_469
u_470
u_471
u_472
u_473
u_474
u_475
u_476
u_477
u_478
u_479
u_480
u_481
u_482
u_483
u_484
u_485
u_486
u_487
u_488
u_489
u_490
u_491
u_492
u_493
u_494
u_495
u_496
u_497
u_498
u_499
u_500
u_501
u_502
u_503
u_504
u_505
u_506
u_507
u_508
u_509
u_510
u_511
u_512
u_513
u_514
u_515
u_516
u_517
u_518
u_519
u_520
u_521
u_522
u_523
u_524
u_525
u_526
u_527
u_528
u_529
u_530
u_531
u_532
u_533
u_534
u_535
u_536
u_537
u_538
u_539
u_540
u_541
u_542
u_543
u_544
u_545
u_546
u_547
u_548
u_549
u_550
u_551
u_552
u_553
u_554
u_555
u_556
u_557
u_558
u_559
u_560
u_561
u_562
u_563
u_564
u_565
u_566
u_56

In [65]:
full_df

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_5,1.735093e+09
2,u_2,i_1,1.735093e+09
3,u_2,i_5,1.735093e+09
4,u_3,i_1,1.735093e+09
...,...,...,...
2225,u_799,i_3,1.735093e+09
2226,u_799,i_4,1.735093e+09
2227,u_799,i_5,1.735093e+09
2228,u_800,i_1,1.735093e+09


In [66]:
full_df['item_id'] = full_df.apply(rename_item, axis=1)
full_df

Unnamed: 0,user_id,item_id,timestamp
0,u_1,i_1,1.735093e+09
1,u_1,i_5,1.735093e+09
2,u_2,i_1,1.735093e+09
3,u_2,i_5,1.735093e+09
4,u_3,i_1,1.735093e+09
...,...,...,...
2225,u_799,i_3,1.735093e+09
2226,u_799,i_4,1.735093e+09
2227,u_799,drifted_i_5,1.735093e+09
2228,u_800,drifted_i_1,1.735093e+09


In [67]:
# Output the dataset
full_df.to_csv("sudden_drift_dataset.csv", index=False)
print("Dataset with sudden drift created and saved as 'sudden_drift_dataset.csv'.")

Dataset with sudden drift created and saved as 'sudden_drift_dataset.csv'.
