In [2]:
from math import ceil
import numpy as np
import pandas as pd
import random

In [3]:
# Importing the data
R_df = pd.read_csv('InputData/R.csv',).drop('user_id\movie_id',axis=1)
R_df.columns = range(len(R_df.columns))
R_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679
0,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0
939,0,0,0,2,0,0,4,5,3,0,...,0,0,0,0,0,0,0,0,0,0
940,5,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Household and Accounts Statistical Model

In [4]:
# Shared Datasets: 67% of all accounts are single_user/ 
                  #31% are two_users/1.6% are Three_users/0.4% are 4use
p_one_user_accounts = 0.67
p_two_users_accounts = 0.31
p_three_users_accounts = 0.016
p_four_users_accounts = 0.004

# MovieLens 100K
num_users = R_df.shape[0] # total number of users

# Number of users in each account in 100K dataset
n_one_users = 1+int(num_users*p_one_user_accounts)
n_two_users = int(num_users*p_two_users_accounts)
n_three_users = -1+ceil(num_users*p_three_users_accounts)
n_four_users = ceil(num_users*p_four_users_accounts)

total = n_one_users+n_two_users+n_three_users+n_four_users

n_one_user_accounts = n_one_users
n_two_users_accounts = n_two_users/2
n_three_users_accounts = n_three_users/3
n_four_users_accounts = n_four_users/4

In [5]:
table = pd.DataFrame(
    [[n_one_users, n_two_users, n_three_users, n_four_users, total],
     [n_one_user_accounts,n_two_users_accounts,n_three_users_accounts,n_four_users_accounts,
      n_one_user_accounts+n_two_users_accounts+n_three_users_accounts+n_four_users_accounts]]
    , index=["#users", '#accounts'],columns=[1,2,3,4,'Total']).T.astype(int)

table['#accounts']['Total']

784

In [6]:
# Function to merge user rows
def merge_users(user_indices):
    return R_df.iloc[user_indices, :].sum(axis=0)

In [7]:
num_accounts = table['#accounts']['Total']
num_users = R_df.shape[0]
n = R_df.shape[1]

# Allocate users to shared accounts based on the calculated ratios and merge their ratings
np.random.seed(42)  # For reproducibility
user_indices = np.random.permutation(num_users)
account_indices = np.random.permutation(num_accounts)

# Create a new DataFrame for R_sa
R_sa = np.zeros((num_accounts,n),dtype=float)
merge_mapping = []
start_idx = 0
i=0

In [8]:
for size, row in table[:-1].iterrows():
    for _ in range(row['#accounts']):
        selected_users = user_indices[start_idx:start_idx+size]
        R_sa[account_indices[i]] = merge_users(selected_users)
        merge_mapping.append((account_indices[i], list(selected_users))) 
        start_idx += size
        i = i+1

In [9]:
selected_users

array([270, 860, 435, 102])

In [10]:
# Reset the index of the new DataFrame
# R_sa.reset_index(drop=True, inplace=True)
pd.DataFrame(R_sa).to_csv('InputData/R_sa.csv')
# Convert the merge mapping to a DataFrame
merge_mapping_df = pd.DataFrame(merge_mapping,columns=['are mixed in row of R_sa','rows of R'])
merge_mapping_df.to_csv('InputData/SA.csv', index=False)
# Show the shape of the new matrix R_sa and a sample of the data

In [13]:
account_indices

array([300, 372, 535, 511, 560, 191, 405, 632,  23, 368, 190, 426, 543,
       778, 526,  42, 352, 247, 273, 545, 493, 149, 504, 629, 188,  93,
       548, 598, 527, 667, 411, 509, 709, 433, 407, 690,  20, 582, 524,
       559, 550, 491, 276, 720, 671, 579, 202, 357, 425, 115, 449, 628,
       219, 422, 777, 297, 185, 271, 204, 640, 533, 438, 314, 394, 694,
       519, 554, 492, 508, 254, 429, 134, 735, 364, 575, 207, 414, 462,
        49, 651, 551, 733, 307, 324, 647, 610, 371, 165, 203, 388,  21,
       737,  78, 611, 534, 770, 154, 621,  70, 749,  82,  69,  34, 437,
       473, 725, 494, 635, 783, 334, 696, 685, 367, 275, 604,  65, 125,
       759, 569, 376, 514, 484, 135, 239, 289, 264,  29, 464, 241, 648,
        30,  57, 137, 166, 303,  99, 398, 259, 351, 256,  47, 251, 661,
       656,  91, 523, 211,  32,  90, 293, 383, 416,  55, 505, 448, 298,
       481,  44, 173, 539, 614,  27,  59, 338, 736, 742, 613, 678,  37,
       240, 738, 283, 706, 444, 205, 278, 415, 229, 330, 305, 77

In [269]:
merge_mapping

[(300, [96]),
 (372, [265]),
 (535, [810]),
 (511, [23]),
 (560, [30]),
 (191, [280]),
 (405, [568]),
 (632, [259]),
 (23, [331]),
 (368, [323]),
 (190, [422]),
 (426, [467]),
 (543, [286]),
 (778, [893]),
 (526, [868]),
 (42, [638]),
 (352, [538]),
 (247, [499]),
 (273, [481]),
 (545, [334]),
 (493, [848]),
 (149, [770]),
 (504, [925]),
 (629, [39]),
 (188, [363]),
 (93, [764]),
 (548, [307]),
 (598, [444]),
 (527, [713]),
 (667, [704]),
 (411, [76]),
 (509, [817]),
 (709, [595]),
 (433, [371]),
 (407, [165]),
 (690, [755]),
 (20, [250]),
 (582, [882]),
 (524, [436]),
 (559, [239]),
 (550, [107]),
 (491, [67]),
 (276, [174]),
 (720, [158]),
 (671, [139]),
 (579, [754]),
 (202, [731]),
 (357, [306]),
 (425, [532]),
 (115, [694]),
 (449, [802]),
 (628, [63]),
 (219, [235]),
 (422, [765]),
 (777, [899]),
 (297, [86]),
 (185, [692]),
 (271, [723]),
 (204, [778]),
 (640, [516]),
 (533, [660]),
 (438, [587]),
 (314, [326]),
 (394, [218]),
 (694, [209]),
 (519, [656]),
 (554, [70]),
 (492, [

### Generating a dataset that accounts are built with users from different clusters

### Input data

In [11]:
# reading data from clustered users in '00 Weight Matrix Production.ipynb'
%store -r user_indices_of_Type

In [12]:
user_indices_of_Type

[[2,
  3,
  7,
  8,
  13,
  16,
  18,
  24,
  25,
  27,
  28,
  29,
  30,
  32,
  36,
  38,
  39,
  40,
  44,
  46,
  47,
  49,
  50,
  52,
  53,
  54,
  56,
  60,
  62,
  65,
  66,
  67,
  68,
  70,
  72,
  73,
  74,
  75,
  76,
  77,
  80,
  83,
  85,
  92,
  96,
  99,
  100,
  102,
  103,
  104,
  105,
  106,
  107,
  109,
  110,
  111,
  112,
  113,
  114,
  117,
  120,
  123,
  126,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  140,
  141,
  142,
  145,
  147,
  152,
  153,
  155,
  156,
  160,
  161,
  162,
  163,
  165,
  166,
  167,
  171,
  172,
  175,
  178,
  181,
  182,
  186,
  189,
  190,
  191,
  192,
  194,
  195,
  198,
  204,
  207,
  210,
  216,
  217,
  218,
  219,
  225,
  226,
  230,
  231,
  234,
  236,
  241,
  242,
  244,
  246,
  247,
  250,
  251,
  254,
  256,
  258,
  260,
  264,
  271,
  281,
  282,
  283,
  284,
  286,
  287,
  288,
  301,
  305,
  308,
  309,
  314,
  316,
  318,
  321,
  322,
  328,
  330,
  332,
  334,
  335,
  336,
  339,


In [13]:
account_size =[]
number_of_users_in_accounts = []
for size, row in table[:-1].iterrows():
    account_size.append(size)
    number_of_users_in_accounts.append(row['#accounts'])

________________________

### Process

In [14]:
np.random.seed(42)  # For reproducibility
user_shuffled_indices_of_Type = []
for i in range(4):
    user_shuffled_indices_of_Type.append(list(np.random.permutation(list(user_indices_of_Type[i]))))

In [15]:
all_accounts = []
new_account = []
random.seed(42)
Types = [0,1,2,3]
i = 0
for i in range(4):
    for account in range(number_of_accounts[i]):
        size = sizes[i]
        Type = random.sample(Types,size)
#         print('account',account,'of size',size, 'availible types',Type)

        new_account = []
        for t in Type:
#             print('from type',t)
            new_account.append(user_shuffled_indices_of_Type[t][0])
            user_shuffled_indices_of_Type[t].pop(0)

        if len(user_shuffled_indices_of_Type[t]) == 0 :
            Types.remove(t)

#         print(new_account)
#         print('lenT0',len(user_shuffled_indices_of_Type[0]),'lenT1',len(user_shuffled_indices_of_Type[1]),
#               'lenT2',len(user_shuffled_indices_of_Type[2]),'lenT3',len(user_shuffled_indices_of_Type[3]))
#         print('remaining types',Types)
        all_accounts.append(new_account)

NameError: name 'number_of_accounts' is not defined

### Dataset Generation

In [16]:
num_accounts = table['#accounts']['Total']
num_users = R_df.shape[0]
n = R_df.shape[1]

np.random.seed(42)  # For reproducibility
account_indices = np.random.permutation(num_accounts)

# Create a new DataFrame for R_sa
R_SAD_4C = np.zeros((num_accounts,n),dtype=float)
merge_mapping = []

# Function to merge user rows
def merge_users(user_indices):
    return R_df.iloc[user_indices,:].sum(axis=0)

for i in range(len(all_accounts)):
    R_SAD_4C[account_indices[i]] = merge_users(all_accounts[i])
    merge_mapping.append(all_accounts[i])

In [17]:
merge_mapping

[]

In [317]:
pd.DataFrame(R_SAD_4C).to_csv('InputData/R_SAD_4Clusters.csv')
# Convert the merge mapping to a DataFrame
merge_mapping_df = pd.DataFrame(merge_mapping,columns=['are mixed in row of R_sa','rows of R'])
merge_mapping_df.to_csv('InputData/GT_SAD_4Clusters.csv', index=False)

ValueError: 2 columns passed, passed data had 4 columns

In [316]:
merge_mapping

[[111, 424, 261, 177],
 [276, 318, 863],
 [401, 649, 638],
 [112, 212, 879],
 [622, 770, 84],
 [211, 392, 850],
 [692, 885],
 [896, 672],
 [291, 803],
 [765, 313],
 [605, 840],
 [205, 789],
 [939, 317],
 [882, 402],
 [338, 836],
 [42, 742],
 [465, 797],
 [897, 329],
 [309, 748],
 [91, 775],
 [150, 660],
 [375, 233],
 [695, 380],
 [938, 513],
 [186, 547],
 [153, 33],
 [560, 252],
 [221, 862],
 [436, 461],
 [754, 372],
 [931, 620],
 [178, 386],
 [45, 333],
 [630, 679],
 [787, 667],
 [895, 188],
 [438, 434],
 [362, 394],
 [490, 332],
 [415, 616],
 [591, 917],
 [201, 804],
 [845, 27],
 [870, 347],
 [553, 141],
 [275, 727],
 [718, 806],
 [388, 714],
 [644, 471],
 [631, 409],
 [590, 142],
 [912, 474],
 [516, 55],
 [128, 531],
 [208, 311],
 [829, 406],
 [162, 687],
 [143, 647],
 [63, 641],
 [270, 747],
 [352, 456],
 [507, 285],
 [267, 90],
 [94, 572],
 [307, 365],
 [786, 391],
 [66, 496],
 [915, 652],
 [164, 587],
 [771, 888],
 [398, 552],
 [224, 138],
 [681, 287],
 [70, 300],
 [93, 920],
 [5