In [1]:
import os
import pandas as pd
import random
from datetime import datetime
import time
import pickle

In [49]:
# import sys
# sys.path.append(os.path.abspath('') + '/..')
# sys.path

In [2]:
def validate_folderpath(folderpath):
    # print('im validating')
    if not os.path.exists(folderpath):
        os.makedirs(folderpath)
        print('Folder created: ', folderpath)


def save_picklefile(d, filepath):
    with open(filepath, 'wb') as f:
            pickle.dump(d, f)

    print('Saved file at '+filepath)

def load_picklefile(filepath):
    with open(filepath, 'rb') as f:
        loaded_dict = pickle.load(f)
    return loaded_dict

    


def generate_artificial_random_dataset(n_users,
                                       n_items, 
                                       ts,
                                       all_items_seen,
                                       random_seed,
                                       n_items_to_drift,
                                       sudden_drift_start,
                                       drift_items_freq_list,
                                       non_drift_items_freq_list,
                                       save_path,
                                       base_filename,
                                       bin_size):
    
    def add_zero_user(df):

        df['user_id_n'] = df['user_id'].apply(lambda x: x[2:])
        df['user_id_n'] = df['user_id_n'].astype(int)

        # df.loc[-1] = ['u_-1', 'i_1', ts, 0]
        # df.loc[-2] = ['u_-1', 'i_2', ts, 0]
        # df.loc[-3] = ['u_-1', 'i_3', ts, 0]
        # df.loc[-4] = ['u_-1', 'i_5', ts, 0]
        # df.loc[-5] = ['u_0', 'i_4', ts, 0]
        df.loc[-6] = ['u_0', 'drifted_i_1', ts, 0]
        df.loc[-7] = ['u_0', 'drifted_i_5', ts, 0]

        df.sort_values(by='user_id_n', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.drop(columns=['user_id_n'], inplace=True)

        # print('added zero user\n', df.head())
        return df

    
    def create_folderpath(save_path, base_filename, specs_str):
        return save_path+base_filename+'_'+specs_str+'/'
    

    def save_items_frequencies(n_items_to_drift,
                               sudden_drift_start,
                               drift_items_freq_list,
                               non_drift_items_freq_list,
                               save_path, 
                               base_filename,
                               specs_str):
        
        folderpath = create_folderpath(save_path, base_filename, specs_str)
        validate_folderpath(folderpath)

        d = {'n_items_to_drift': n_items_to_drift,
             'sudden_drift_start': sudden_drift_start,
             'drift_items_freq_list': drift_items_freq_list,
             'non_drift_items_freq_list': non_drift_items_freq_list}

        save_picklefile(d, folderpath+'saved_dictionary.pkl')
       

    
    def save_dataset_atomic_file(df, save_path, base_filename, specs_str):
        if save_path:
            folderpath = create_folderpath(save_path, base_filename, specs_str)
            validate_folderpath(folderpath)
            # Output the dataset
            filepath = folderpath+base_filename+'_'+specs_str
            df.to_csv(filepath+'.csv', index=False)
            df.to_csv(filepath+'.inter',
                        header=['user_id:token','item_id:token','timestamp:float'], 
                        sep='\t', 
                        index=False)
            print("Dataset with sudden drift created and saved at "+filepath+".")



    def split_dataset_into_4_and_save_atomic_file(df, bin_size, save_path, base_filename, specs_str):
        print(df.head())
        save_dataset_atomic_file(df, save_path, base_filename, specs_str)
        save_dataset_atomic_file(df[:bin_size].reset_index(drop=True), save_path, base_filename, specs_str+'_pt1')
        save_dataset_atomic_file(df[:bin_size*2].reset_index(drop=True), save_path, base_filename, specs_str+'_pt2')
        save_dataset_atomic_file(df[:bin_size*3].reset_index(drop=True), save_path, base_filename, specs_str+'_pt3')

        
        save_dataset_atomic_file(df[bin_size:bin_size*2].reset_index(drop=True), save_path, base_filename, specs_str+'_pt5')
        save_dataset_atomic_file(df[bin_size*2:bin_size*3].reset_index(drop=True), save_path, base_filename, specs_str+'_pt6')
        save_dataset_atomic_file(df[bin_size*3:].reset_index(drop=True), save_path, base_filename, specs_str+'_pt7')
        save_dataset_atomic_file(df[bin_size:bin_size*3].reset_index(drop=True), save_path, base_filename, specs_str+'_pt8')

    def calculate_sparsity(df):
        # df.item_id.groupby([df.user_id, df.item_id]).count().sum() == df.user_id.count()
        sparsity = 1 - df.user_id.count()/(df.user_id.nunique()*df.item_id.nunique())
        specs_str = str(df.user_id.nunique())+'x'+str(df.item_id.nunique())+'_'+str(round(sparsity, 2))
        print('specs_str', specs_str)
        return sparsity, specs_str
    

    def rename_item(row):
        if int(row['user_id'].split('_')[1]) > sudden_drift_start and row['item_id'] in renamed_items:
            return renamed_items[row['item_id']]
        return row['item_id']


    users_list = [f'u_{i+1}' for i in range(n_users)]
    items_list = [f'i_{j+1}' for j in range(n_items)]


    if all_items_seen:

        data = []
        for user in users_list:
            for item in items_list:
                data.append({'user_id': user, 'item_id': item, 'timestamp':ts})

        all_items_seen_df = pd.DataFrame(data)

        # Introduce sudden drift
        random.seed(random_seed)  # For reproducibility
        drift_items_list = random.sample(items_list, k=n_items_to_drift)  
        renamed_items = {item: f'drifted_{item}' for item in drift_items_list}       
        

        all_items_seen_df['item_id'] = all_items_seen_df.apply(rename_item, axis=1)
        # print(all_items_seen_df.item_id.groupby([all_items_seen_df.user_id, all_items_seen_df.item_id]).count().unstack().fillna(0).astype(int))

        
        sparsity , specs_str = calculate_sparsity(all_items_seen_df)
        print('sparsity: ',sparsity)
        

        all_items_seen_df = add_zero_user(all_items_seen_df)

        split_dataset_into_4_and_save_atomic_file(all_items_seen_df, bin_size, save_path, specs_str)
        # save_dataset_atomic_file(all_items_seen_df, save_path, specs_str)
        save_items_frequencies(n_items_to_drift, sudden_drift_start, drift_items_freq_list, non_drift_items_freq_list, save_path, base_filename, specs_str)
        

    else:
        
        if len(drift_items_freq_list) != n_items_to_drift:
            print('Not all items frequency was specified!')
            return None
        elif len(non_drift_items_freq_list) != len(items_list)-n_items_to_drift:
            print('Not all items frequency was specified!')
            return None


        def sample_with_repetition_of_pattern(users_list, items_list, items_freq_list):
            random.seed(random_seed)
            sampled_df = pd.DataFrame({})
            for i, freq in enumerate(items_freq_list):
                # print('k ',k)
                user_sample = random.sample(users_list[:sudden_drift_start], k=freq) +\
                                random.sample(users_list[sudden_drift_start:], k=freq)
                temp_df = pd.DataFrame({'user_id': user_sample, 
                                        'item_id': items_list[i]})
                # print(temp_df.item_id.groupby([temp_df.user_id, temp_df.item_id]).count().unstack().fillna(0).astype(int))
                sampled_df = pd.concat([sampled_df, temp_df])
            
            return sampled_df
        

        # Introduce sudden drift
        # No need to random sample, bc the list will have the frequencies for each item
        random.seed(random_seed)  # For reproducibility
        drift_items_list = random.sample(items_list, k=n_items_to_drift)  
        # drift_items_list = [items_list[i] for i,x in enumerate(items_freq_list) if x == sudden_drift_start]
        renamed_items = {item: f'drifted_{item}' for item in drift_items_list}
        non_drift_items_list = list(set(items_list) - set(drift_items_list))
        
        print('drift_items_list', drift_items_list)
        print('renamed_items', renamed_items)
        print('non_drift_items_list', non_drift_items_list)

        
        random.seed(random_seed)  # For reproducibility
        sampled_df = sample_with_repetition_of_pattern(users_list,
                                                       non_drift_items_list,
                                                       non_drift_items_freq_list)
        
        sampled_df = pd.concat([sampled_df,
                                sample_with_repetition_of_pattern(users_list,
                                                                    drift_items_list,
                                                                    drift_items_freq_list)])

        if sampled_df.user_id.nunique() < n_users:
            # print(sampled_df.head())
            users_not_sampled = list(set(users_list) - set(sampled_df.user_id))
            print('users_not_sampled', len(users_not_sampled))
            # print('drift_items_list', drift_items_list)
            for user in users_not_sampled:
                for item in drift_items_list:
                    # print(sampled_df.loc[sampled_df['user_id']==user, 'item_id'].count())
                    sampled_df.loc[len(sampled_df)] = [user, item]


        sampled_df['item_id'] = sampled_df.apply(rename_item, axis=1)
        # print(sampled_df.item_id.groupby([sampled_df.user_id, sampled_df.item_id]).count().unstack().fillna(0).astype(int))

        sampled_df['timestamp'] = ts



        sparsity, specs_str = calculate_sparsity(sampled_df)
        print('sparsity: ',sparsity)
        # print(specs_str)
        print(sampled_df.item_id.groupby([sampled_df.user_id, sampled_df.item_id]).count().unstack().fillna(0).astype(int))
        # print(sampled_df.head())


        sampled_df = add_zero_user(sampled_df)


        # save_dataset_atomic_file(sampled_df, save_path, specs_str)
        split_dataset_into_4_and_save_atomic_file(sampled_df, bin_size, save_path, base_filename, specs_str)
        save_items_frequencies(n_items_to_drift, sudden_drift_start, drift_items_freq_list, non_drift_items_freq_list, save_path, base_filename, specs_str)


        return sampled_df

In [3]:
# Parameters
n_users = 4000 # bc of random sample, some users dont have occurrencies
n_items = 5
sudden_drift_start = 2000  # Starting user index for drift (1-indexed)
bin_size = 1000

string = "24/12/2024 21:12:24"
ts = time.mktime(datetime.strptime(string, "%d/%m/%Y %H:%M:%S").timetuple())

all_items_seen = False

random_seed = 42

ratio_to_drift = 2 # Select 50% of items to rename
n_items_to_drift = n_items // ratio_to_drift

base_filename = 'sudden_drift_dataset'
save_path = 'processed_datasets/artificial_data/'

drift_items_freq_list = [sudden_drift_start//2, sudden_drift_start//2]
non_drift_items_freq_list = [sudden_drift_start//4, sudden_drift_start//4, sudden_drift_start//4]

In [52]:
df = generate_artificial_random_dataset(n_users=n_users,
                                    n_items=n_items, 
                                    ts=ts,
                                    all_items_seen=all_items_seen,
                                    n_items_to_drift=n_items_to_drift,
                                    random_seed=random_seed,
                                    sudden_drift_start=sudden_drift_start,
                                    drift_items_freq_list=drift_items_freq_list,
                                    non_drift_items_freq_list=non_drift_items_freq_list,
                                    save_path=save_path,
                                    base_filename=base_filename,
                                    bin_size=bin_size)

drift_items_list ['i_1', 'i_5']
renamed_items {'i_1': 'drifted_i_1', 'i_5': 'drifted_i_5'}
non_drift_items_list ['i_3', 'i_2', 'i_4']
users_not_sampled 571
specs_str 4000x7_0.71
sparsity:  0.7092142857142858
item_id  drifted_i_1  drifted_i_5  i_1  i_2  i_3  i_4  i_5
user_id                                                   
u_1                0            0    1    0    0    0    0
u_10               0            0    1    0    0    0    1
u_100              0            0    0    0    0    0    1
u_1000             0            0    1    1    0    0    0
u_1001             0            0    1    1    1    0    1
...              ...          ...  ...  ...  ...  ...  ...
u_995              0            0    1    0    1    0    0
u_996              0            0    1    0    1    1    0
u_997              0            0    1    0    1    0    0
u_998              0            0    1    0    0    0    1
u_999              0            0    1    1    0    0    0

[4000 rows x 7 columns]


In [53]:
load_picklefile('processed_datasets/artificial_data/sudden_drift_dataset_4000x7_0.71/saved_dictionary.pkl')

{'n_items_to_drift': 2,
 'sudden_drift_start': 2000,
 'drift_items_freq_list': [1000, 1000],
 'non_drift_items_freq_list': [500, 500, 500]}

# base folderpath

In [5]:
def get_filepath(base_folderpath, filename, pt):
    return base_folderpath+filename+pt+'/'+filename+pt

In [6]:
FILENAME = 'sudden_drift_dataset_4000x7_0.71'

# df pt 1

In [7]:
df_pt1 = pd.read_csv(get_filepath(save_path, FILENAME, '_pt1')+'.csv')
df_pt1

Unnamed: 0,user_id,item_id,timestamp
0,u_0,drifted_i_5,1.735093e+09
1,u_0,drifted_i_1,1.735093e+09
2,u_1,i_1,1.735093e+09
3,u_2,i_1,1.735093e+09
4,u_2,i_3,1.735093e+09
...,...,...,...
995,u_484,i_4,1.735093e+09
996,u_485,i_1,1.735093e+09
997,u_485,i_3,1.735093e+09
998,u_485,i_5,1.735093e+09


In [57]:
df_pt1

Unnamed: 0,user_id,item_id,timestamp
0,u_0,drifted_i_5,1.735093e+09
1,u_0,drifted_i_1,1.735093e+09
2,u_1,i_1,1.735093e+09
3,u_2,i_1,1.735093e+09
4,u_2,i_3,1.735093e+09
...,...,...,...
995,u_484,i_4,1.735093e+09
996,u_485,i_1,1.735093e+09
997,u_485,i_3,1.735093e+09
998,u_485,i_5,1.735093e+09


In [8]:
df_pt1.item_id.value_counts()

item_id
i_1            327
i_5            312
i_3            132
i_4            116
i_2            111
drifted_i_5      1
drifted_i_1      1
Name: count, dtype: int64

# df pt 5

In [10]:
df_pt5 = pd.read_csv(get_filepath(save_path, FILENAME, '_pt5')+'.csv')
df_pt5.item_id.value_counts()

item_id
i_1    314
i_5    312
i_2    135
i_4    130
i_3    109
Name: count, dtype: int64

# test code trying to solve the problem with 'some users have seen all items' by adding user 0

In [58]:
df_full = pd.read_csv(get_filepath(save_path, FILENAME, '')+'.csv')
df_full

Unnamed: 0,user_id,item_id,timestamp
0,u_0,drifted_i_5,1.735093e+09
1,u_0,drifted_i_1,1.735093e+09
2,u_1,i_1,1.735093e+09
3,u_2,i_1,1.735093e+09
4,u_2,i_3,1.735093e+09
...,...,...,...
8139,u_3998,i_4,1.735093e+09
8140,u_3998,drifted_i_5,1.735093e+09
8141,u_3999,drifted_i_1,1.735093e+09
8142,u_3999,i_3,1.735093e+09


In [59]:
df_full.item_id.unique()

array(['drifted_i_5', 'drifted_i_1', 'i_1', 'i_3', 'i_2', 'i_4', 'i_5'],
      dtype=object)

In [60]:
list(df_full.item_id)

['drifted_i_5',
 'drifted_i_1',
 'i_1',
 'i_1',
 'i_3',
 'i_2',
 'i_1',
 'i_4',
 'i_5',
 'i_1',
 'i_3',
 'i_1',
 'i_2',
 'i_3',
 'i_5',
 'i_1',
 'i_1',
 'i_5',
 'i_3',
 'i_1',
 'i_5',
 'i_1',
 'i_5',
 'i_4',
 'i_5',
 'i_1',
 'i_1',
 'i_5',
 'i_1',
 'i_3',
 'i_1',
 'i_3',
 'i_1',
 'i_5',
 'i_5',
 'i_3',
 'i_1',
 'i_5',
 'i_1',
 'i_5',
 'i_3',
 'i_2',
 'i_1',
 'i_1',
 'i_2',
 'i_1',
 'i_5',
 'i_5',
 'i_1',
 'i_5',
 'i_3',
 'i_1',
 'i_5',
 'i_1',
 'i_1',
 'i_2',
 'i_1',
 'i_5',
 'i_1',
 'i_5',
 'i_1',
 'i_5',
 'i_2',
 'i_5',
 'i_1',
 'i_3',
 'i_1',
 'i_2',
 'i_5',
 'i_5',
 'i_4',
 'i_5',
 'i_5',
 'i_1',
 'i_2',
 'i_5',
 'i_1',
 'i_4',
 'i_4',
 'i_5',
 'i_3',
 'i_1',
 'i_1',
 'i_5',
 'i_1',
 'i_4',
 'i_4',
 'i_2',
 'i_1',
 'i_3',
 'i_5',
 'i_5',
 'i_1',
 'i_5',
 'i_5',
 'i_1',
 'i_1',
 'i_5',
 'i_1',
 'i_3',
 'i_1',
 'i_2',
 'i_1',
 'i_5',
 'i_3',
 'i_1',
 'i_5',
 'i_4',
 'i_1',
 'i_1',
 'i_4',
 'i_5',
 'i_1',
 'i_2',
 'i_1',
 'i_5',
 'i_1',
 'i_3',
 'i_5',
 'i_5',
 'i_1',
 'i_5',
 'i_1',


In [61]:
df_full.item_id.value_counts()

item_id
drifted_i_5    1288
drifted_i_1    1288
i_1            1284
i_5            1284
i_3            1000
i_2            1000
i_4            1000
Name: count, dtype: int64

In [62]:
df_full.sort_values(by='user_id')

Unnamed: 0,user_id,item_id,timestamp
0,u_0,drifted_i_5,1.735093e+09
1,u_0,drifted_i_1,1.735093e+09
2,u_1,i_1,1.735093e+09
22,u_10,i_5,1.735093e+09
21,u_10,i_1,1.735093e+09
...,...,...,...
2058,u_997,i_3,1.735093e+09
2059,u_998,i_5,1.735093e+09
2060,u_998,i_1,1.735093e+09
2062,u_999,i_1,1.735093e+09


In [63]:
df_full.user_id.value_counts()

user_id
u_3250    5
u_601     5
u_2563    5
u_494     5
u_1040    5
         ..
u_2831    1
u_1133    1
u_2828    1
u_1138    1
u_4000    1
Name: count, Length: 4001, dtype: int64

In [64]:
df_full

Unnamed: 0,user_id,item_id,timestamp
0,u_0,drifted_i_5,1.735093e+09
1,u_0,drifted_i_1,1.735093e+09
2,u_1,i_1,1.735093e+09
3,u_2,i_1,1.735093e+09
4,u_2,i_3,1.735093e+09
...,...,...,...
8139,u_3998,i_4,1.735093e+09
8140,u_3998,drifted_i_5,1.735093e+09
8141,u_3999,drifted_i_1,1.735093e+09
8142,u_3999,i_3,1.735093e+09


In [65]:
df_full['user_id_n'] = df_full['user_id'].apply(lambda x: x[2:])

In [66]:
df_full['user_id_n'] = df_full['user_id_n'].astype(int)

In [67]:
df_full.loc[-1] = ['u_0', 'drifted_i_5', ts, 0]
df_full.loc[-2] = ['u_0', 'drifted_i_1', ts, 0]

df_full.sort_values(by='user_id_n').reset_index(drop=True)

Unnamed: 0,user_id,item_id,timestamp,user_id_n
0,u_0,drifted_i_5,1.735093e+09,0
1,u_0,drifted_i_5,1.735093e+09,0
2,u_0,drifted_i_1,1.735093e+09,0
3,u_0,drifted_i_1,1.735093e+09,0
4,u_1,i_1,1.735093e+09,1
...,...,...,...,...
8141,u_3998,i_4,1.735093e+09,3998
8142,u_3998,drifted_i_5,1.735093e+09,3998
8143,u_3999,drifted_i_1,1.735093e+09,3999
8144,u_3999,i_3,1.735093e+09,3999


In [68]:
df_full.sort_values(by='user_id_n').reset_index(drop=True)

Unnamed: 0,user_id,item_id,timestamp,user_id_n
0,u_0,drifted_i_5,1.735093e+09,0
1,u_0,drifted_i_5,1.735093e+09,0
2,u_0,drifted_i_1,1.735093e+09,0
3,u_0,drifted_i_1,1.735093e+09,0
4,u_1,i_1,1.735093e+09,1
...,...,...,...,...
8141,u_3998,i_4,1.735093e+09,3998
8142,u_3998,drifted_i_5,1.735093e+09,3998
8143,u_3999,drifted_i_1,1.735093e+09,3999
8144,u_3999,i_3,1.735093e+09,3999


In [69]:
df_full[df_full.user_id_n<=sudden_drift_start].item_id.value_counts()

item_id
i_1            1284
i_5            1284
i_3             500
i_2             500
i_4             500
drifted_i_5       2
drifted_i_1       2
Name: count, dtype: int64

In [70]:
df_full[df_full.user_id_n>sudden_drift_start].item_id.value_counts()

item_id
drifted_i_5    1287
drifted_i_1    1287
i_3             500
i_2             500
i_4             500
Name: count, dtype: int64