In [1]:
import pandas as pd
import os

In [2]:
def get_root_dir_path(curr_working_dir, no_folders_above):
    """Climb ``no_folders_above`` levels up from ``curr_working_dir``.

    The starting directory and the resulting directory are printed before
    the result is returned.

    Args:
        curr_working_dir: Path the climb starts from.
        no_folders_above: How many times ``os.path.dirname`` is applied.

    Returns:
        The path left after stripping that many trailing components.
    """
    root_path = curr_working_dir
    for _ in range(no_folders_above):
        root_path = os.path.dirname(root_path)

    report = (f"Initial directory: {curr_working_dir}\n"
              f"Root directory: {root_path}.")
    print(report)
    return root_path

In [3]:
def group_data_by_column(df, column_to_keep, column_to_group_data):
    """Count the entries of one column for every distinct value of another.

    Args:
        df: Source dataframe.
        column_to_keep: Column whose distinct values define the groups.
        column_to_group_data: Column whose non-null entries are counted per group.

    Returns:
        A new dataframe with two columns: ``user_id`` (the group keys) and
        ``no_reviews_user`` (the count for each key). The output column names
        are fixed regardless of the input column names.
    """
    counts = df.groupby(column_to_keep)[column_to_group_data].count()
    columns = {
        'user_id': counts.index,
        'no_reviews_user': counts.values,
    }
    return pd.DataFrame(columns)

def cut_at_x_y(x_val, y_val, df, target_column):
    """Filter the rows for a specific Cut@x_y scenario and print a summary.

    Keeps the rows whose ``target_column`` value lies in ``[x_val, y_val]``
    (both bounds inclusive) and prints, for every integer in that range, how
    many distinct ``user_id`` values have exactly that count.

    Args:
        x_val: Lower bound of the cut (inclusive).
        y_val: Upper bound of the cut (inclusive).
        df: Dataframe holding a ``user_id`` column and ``target_column``.
        target_column: Name of the column holding the per-user count.

    Returns:
        The filtered dataframe.
    """
    print(f">>>>> Cut@{x_val}_{y_val} Scenario <<<<<\n"
          f"=============================")
    tmp_df = df[(df[target_column] >= x_val) & (df[target_column] <= y_val)]

    print(f"We have a total of {len(tmp_df)} users split as follows:")
    for review_no in range(x_val, y_val + 1):
        # Use the same column as the filter above so the breakdown stays
        # correct when target_column is not 'no_reviews_user'.
        users_with_count = tmp_df.loc[tmp_df[target_column] == review_no, 'user_id'].nunique()
        print(f"- {users_with_count} users with {review_no} reviews")
    print("=============================")

    return tmp_df

def count_users_by_no_review(df, review_no):
    """Return how many distinct users have exactly ``review_no`` reviews.

    Args:
        df: Dataframe with ``user_id`` and ``no_reviews_user`` columns.
        review_no: Value of ``no_reviews_user`` to match.

    Returns:
        Number of unique ``user_id`` values among the matching rows.
    """
    has_exact_count = df['no_reviews_user'] == review_no
    matching_users = df.loc[has_exact_count, 'user_id']
    return matching_users.nunique()

In [4]:
# Columns compared when dropping duplicate rows (passed as `subset` to drop_duplicates).
COLS_SUBSET = ['user_id', 'item_id', 'rating', 'timestamp', 'title', 'brand', 'category']

In [5]:
def retrieve_user_data_from_dataframe(main_df, user_list, path_store_data):
    """Extract the rows of ``main_df`` for the given users and store them as CSV.

    The filtered rows are de-duplicated on ``COLS_SUBSET``, any column whose
    name matches ``"Unname"`` is dropped, and the result is written to
    ``path_store_data`` without the index. Row, user and item counts are
    printed before and after de-duplication.

    Args:
        main_df: Dataframe with at least ``user_id``, ``item_id`` and the
            columns listed in ``COLS_SUBSET``.
        user_list: Collection of user ids to keep (anything ``Series.isin`` accepts).
        path_store_data: Destination of the CSV file. When it is a filesystem
            path, its parent directory is created if missing.

    Returns:
        The filtered dataframe as it was *before* de-duplication and column
        clean-up, i.e. not necessarily identical to what was written to disk.
    """
    # Filter dataframe
    tmp_df = main_df[main_df['user_id'].isin(user_list)]
    print(f"The current dataframe contains {len(tmp_df)} rows!\n"
          f"It contains {tmp_df['user_id'].nunique()} users!\n"
          f"It contains {tmp_df['item_id'].nunique()} items!\n"
          f"Storing dataframe...")

    # drop_duplicates returns a new frame, so tmp_df is left untouched.
    df_cpy = tmp_df.drop_duplicates(subset=COLS_SUBSET)

    if len(tmp_df) != len(df_cpy):
        print("Duplicates have been found!")

    print(f"The current dataframe contains {len(df_cpy)} rows!\n"
          f"It contains {df_cpy['user_id'].nunique()} users!\n"
          f"It contains {df_cpy['item_id'].nunique()} items!\n")

    # Remove leftover index columns (e.g. "Unnamed: 0") from earlier CSV round-trips.
    unnamed_cols = df_cpy.filter(regex="Unname").columns
    df_cpy = df_cpy.drop(columns=unnamed_cols)

    # Store the new dataframe; make sure the target directory exists first so
    # that to_csv does not fail on a fresh checkout.
    if isinstance(path_store_data, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path_store_data))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df_cpy.to_csv(path_store_data, index=False)
    print("The dataframe has been stored!")
    print("=================================================")

    return tmp_df

In [6]:
# Column name -> dtype mapping for the dataset columns.
# NOTE(review): not passed to any pd.read_csv call visible in this notebook — confirm whether it is still needed.
COLUMNS_TYPES = {
    'user_id': 'string',
    'item_id': 'string',
    'rating': 'int',
    'timestamp': 'string',
    'title': 'string',
    'brand': 'string',
    'category': 'string',
}

# Project root: two levels above the directory the notebook is run from.
ROOT_DIR = get_root_dir_path(os.getcwd(), 2)

# ==================== DATA DIRECTORIES ==================== #
# Built with os.path.join to avoid doubled or mixed path separators. The final ''
# component keeps a trailing separator, because later cells build file paths by
# plain string concatenation (e.g. BOOKS35_DIR + "cds_5_10.csv").
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed', '')

BOOKS35_DIR = os.path.join(DATA_DIR, 'Books3_5', '')
CDS35_DIR = os.path.join(DATA_DIR, 'CDs3_5', '')
MOVIES35_DIR = os.path.join(DATA_DIR, 'Movies3_5', '')

EXTRA_DATA_DIR = os.path.join(DATA_DIR, 'extra_cut', '')

EXTRA_BOOKS_DIR = os.path.join(EXTRA_DATA_DIR, 'Books1', '')
EXTRA_CDS_DIR = os.path.join(EXTRA_DATA_DIR, 'CDs1', '')
EXTRA_MOVIES_DIR = os.path.join(EXTRA_DATA_DIR, 'Movies1', '')
# No trailing separator here: users of RAW_DIR prepend "/" to the file name themselves.
RAW_DIR = os.path.join(ROOT_DIR, 'src', 'data', 'raw')

Initial directory: d:\CrossDomain_RecSys_LLM-main\src\data
Root directory: d:\CrossDomain_RecSys_LLM-main.


# Books

In [7]:
# Load the two CSV files stored under BOOKS35_DIR.
cds_5_10__BOOKS = pd.read_csv(f"{BOOKS35_DIR}cds_5_10.csv")
movies_5_10__BOOKS = pd.read_csv(f"{BOOKS35_DIR}movies_5_10.csv")
# Disabled loading of the raw filtered datasets, kept as a string literal.
'''filtered_books = pd.read_csv(RAW_DIR + "/filtered_books.csv")
filtered_movies = pd.read_csv(RAW_DIR + "/filtered_movies.csv")
filtered_cds = pd.read_csv(RAW_DIR + "/filtered_cds.csv")'''

In [None]:
# For testing only: this cell contains nothing but two string literals, so no
# code in it is executed.
# The disabled code relies on filtered_books / filtered_movies / filtered_cds,
# whose loading is itself disabled (wrapped in a string literal) in the cell above.
'''filtered_books_grouped = group_data_by_column(filtered_books, 'user_id', 'item_id')
filtered_movies_grouped = group_data_by_column(filtered_movies, 'user_id', 'item_id')
filtered_cds_grouped = group_data_by_column(filtered_cds, 'user_id', 'item_id')'''


'''# 5_10 base + 10_20 10_30

books_5_10 = cut_at_x_y(5, 10, filtered_books_grouped, 'no_reviews_user')
movies_5_10 = cut_at_x_y(5, 10, filtered_movies_grouped, 'no_reviews_user')
cds_5_10 = cut_at_x_y(5, 10, filtered_cds_grouped, 'no_reviews_user')

#10_20 e 10_30 target

books_10_20 = cut_at_x_y(10, 20, filtered_books_grouped, 'no_reviews_user')
books_10_30 = cut_at_x_y(10, 30, filtered_books_grouped, 'no_reviews_user')
movies_10_20 = cut_at_x_y(10, 20, filtered_movies_grouped, 'no_reviews_user')
movies_10_30 = cut_at_x_y(10, 30, filtered_movies_grouped, 'no_reviews_user')
cds_10_20 = cut_at_x_y(10, 20, filtered_cds_grouped, 'no_reviews_user')
cds_10_30 = cut_at_x_y(10, 30, filtered_cds_grouped, 'no_reviews_user')

#5_20 e 5_30 base

books_5_20 = cut_at_x_y(5, 20, filtered_books_grouped, 'no_reviews_user')
books_5_30 = cut_at_x_y(5, 30, filtered_books_grouped, 'no_reviews_user')
movies_5_20 = cut_at_x_y(5, 20, filtered_movies_grouped, 'no_reviews_user')
movies_5_30 = cut_at_x_y(5, 30, filtered_movies_grouped, 'no_reviews_user')
cds_5_20 = cut_at_x_y(5, 20, filtered_cds_grouped, 'no_reviews_user')
cds_5_30 = cut_at_x_y(5, 30, filtered_cds_grouped, 'no_reviews_user')'''

In [8]:
# Number of reviews per user in each of the two Books-section frames.
cds__BOOKS_grouped = group_data_by_column(cds_5_10__BOOKS, 'user_id', 'item_id')
movies__BOOKS_grouped = group_data_by_column(movies_5_10__BOOKS, 'user_id', 'item_id')

# The stray `books_5_10__MOVIES` read that used to sit here was removed: nothing
# in the Books section uses it, and the Movies section loads the same file itself
# before its first use.

# Users with 8 to 10 reviews (bounds inclusive).
cds_5_10__BOOKS_8_10 = cut_at_x_y(8, 10, cds__BOOKS_grouped, 'no_reviews_user')
movies_5_10__BOOKS_8_10 = cut_at_x_y(8, 10, movies__BOOKS_grouped, 'no_reviews_user')

# Users with exactly 10 reviews.
cds_5_10__BOOKS_10_10 = cut_at_x_y(10, 10, cds__BOOKS_grouped, 'no_reviews_user')
movies_5_10__BOOKS_10_10 = cut_at_x_y(10, 10, movies__BOOKS_grouped, 'no_reviews_user')

>>>>> Cut@5_20 Scenario <<<<<
We have a total of 5657 users split as follows:
- 847 users with 5 reviews
- 711 users with 6 reviews
- 600 users with 7 reviews
- 533 users with 8 reviews
- 420 users with 9 reviews
- 377 users with 10 reviews
- 321 users with 11 reviews
- 301 users with 12 reviews
- 259 users with 13 reviews
- 244 users with 14 reviews
- 212 users with 15 reviews
- 190 users with 16 reviews
- 192 users with 17 reviews
- 156 users with 18 reviews
- 175 users with 19 reviews
- 119 users with 20 reviews
>>>>> Cut@5_30 Scenario <<<<<
We have a total of 6620 users split as follows:
- 847 users with 5 reviews
- 711 users with 6 reviews
- 600 users with 7 reviews
- 533 users with 8 reviews
- 420 users with 9 reviews
- 377 users with 10 reviews
- 321 users with 11 reviews
- 301 users with 12 reviews
- 259 users with 13 reviews
- 244 users with 14 reviews
- 212 users with 15 reviews
- 190 users with 16 reviews
- 192 users with 17 reviews
- 156 users with 18 reviews
- 175 users wi

In [None]:
# For testing only: the whole cell body is a single string literal, so none of
# the code below is executed. It records the output paths and the
# retrieve_user_data_from_dataframe calls for the 5_10 / 5_20 / 5_30 base cuts
# combined with the 10_20 / 10_30 target cuts.
# NOTE(review): the "con base 5_10" section refers to books_1020, books_1030,
# movies_1020, movies_1030, cds_1020 and cds_1030, while the frames loaded just
# before it are named books_10_20, books_10_30, ... — the names must be aligned
# before this code is re-enabled.
# NOTE(review): path12..path23 are assigned twice in a row (5_10 variants, then
# 5_20 variants) before any call uses them, so the "con base 5_10" calls would
# write to the 5_20 file names.
'''#base domain normali
path3 = EXTRA_MOVIES_DIR + 'movies_5_10.csv'
path4 = EXTRA_BOOKS_DIR + 'books_5_10.csv'
path5 = EXTRA_CDS_DIR + 'cds_5_10.csv'

#base domain
path6 = EXTRA_MOVIES_DIR + 'movies_5_20.csv'
path7 = EXTRA_MOVIES_DIR + 'movies_5_30.csv'
path8 = EXTRA_BOOKS_DIR + 'books_5_20.csv'
path9 = EXTRA_BOOKS_DIR + 'books_5_30.csv'
path10 = EXTRA_CDS_DIR + 'cds_5_20.csv'
path11 = EXTRA_CDS_DIR + 'cds_5_30.csv'

path24 = EXTRA_CDS_DIR + 'movies_5_20.csv'
path25 = EXTRA_CDS_DIR + 'movies_5_30.csv'
path26 = EXTRA_CDS_DIR + 'books_5_20.csv'
path27 = EXTRA_CDS_DIR + 'books_5_30.csv'
path28 = EXTRA_BOOKS_DIR + 'cds_5_20.csv'
path29 = EXTRA_BOOKS_DIR + 'cds_5_30.csv'

#base domain joinati
path12 = EXTRA_MOVIES_DIR + 'movies_5_10_1.csv' # books_10_20
path13 = EXTRA_MOVIES_DIR + 'movies_5_10_2.csv' # books_10_30
path14 = EXTRA_CDS_DIR + 'cds_5_10_1.csv' # books_10_20
path15 = EXTRA_CDS_DIR + 'cds_5_10_2.csv' # books_10_30
path16 = EXTRA_BOOKS_DIR + 'books_5_10_1.csv' #  movies_10_20
path17 = EXTRA_BOOKS_DIR + 'books_5_10_2.csv' # movies_10_30
path18 = EXTRA_CDS_DIR + 'cds_5_10_3.csv' # movies_10_20
path19 = EXTRA_CDS_DIR + 'cds_5_10_4.csv' # movies_10_30
path20 = EXTRA_MOVIES_DIR + 'movies_5_10_3.csv' # cds_10_20
path21 = EXTRA_MOVIES_DIR + 'movies_5_10_4.csv' # cds_10_30
path22 = EXTRA_BOOKS_DIR + 'books_5_10_3.csv' #  cds_10_20
path23 = EXTRA_BOOKS_DIR + 'books_5_10_4.csv' # cds_10_30

#base domain joinati 10_20 10_30
path12 = EXTRA_MOVIES_DIR + 'movies_5_20_1.csv' # books_10_20
path13 = EXTRA_MOVIES_DIR + 'movies_5_20_2.csv' # books_10_30
path14 = EXTRA_CDS_DIR + 'cds_5_20_1.csv' # books_10_20
path15 = EXTRA_CDS_DIR + 'cds_5_20_2.csv' # books_10_30
path16 = EXTRA_BOOKS_DIR + 'books_5_20_1.csv' #  movies_10_20
path17 = EXTRA_BOOKS_DIR + 'books_5_20_2.csv' # movies_10_30
path18 = EXTRA_CDS_DIR + 'cds_5_20_3.csv' # movies_10_20
path19 = EXTRA_CDS_DIR + 'cds_5_20_4.csv' # movies_10_30
path20 = EXTRA_MOVIES_DIR + 'movies_5_20_3.csv' # cds_10_20
path21 = EXTRA_MOVIES_DIR + 'movies_5_20_4.csv' # cds_10_30
path22 = EXTRA_BOOKS_DIR + 'books_5_20_3.csv' #  cds_10_20
path23 = EXTRA_BOOKS_DIR + 'books_5_20_4.csv' # cds_10_30

movies_510 = retrieve_user_data_from_dataframe(filtered_movies, movies_5_10['user_id'], path3)
books_510 = retrieve_user_data_from_dataframe(filtered_books, books_5_10['user_id'], path4)
cds_510 = retrieve_user_data_from_dataframe(filtered_cds, cds_5_10['user_id'], path5)


movies_520 = retrieve_user_data_from_dataframe(filtered_movies, movies_5_20['user_id'], path6)
movies_530 = retrieve_user_data_from_dataframe(filtered_movies, movies_5_30['user_id'], path7)
books_520 = retrieve_user_data_from_dataframe(filtered_books, books_5_20['user_id'], path8)
books_530 = retrieve_user_data_from_dataframe(filtered_books, books_5_30['user_id'], path9)
cds_520 = retrieve_user_data_from_dataframe(filtered_cds, cds_5_20['user_id'], path10)
cds_530 = retrieve_user_data_from_dataframe(filtered_cds, cds_5_30['user_id'], path11)
movies_520 = retrieve_user_data_from_dataframe(filtered_movies, movies_5_20['user_id'], path24)
movies_530 = retrieve_user_data_from_dataframe(filtered_movies, movies_5_30['user_id'], path25)
books_520 = retrieve_user_data_from_dataframe(filtered_books, books_5_20['user_id'], path26)
books_530 = retrieve_user_data_from_dataframe(filtered_books, books_5_30['user_id'], path27)
cds_520 = retrieve_user_data_from_dataframe(filtered_cds, cds_5_20['user_id'], path28)
cds_530 = retrieve_user_data_from_dataframe(filtered_cds, cds_5_30['user_id'], path29)

#base domain 5_10

movies_5_10 = pd.read_csv(EXTRA_MOVIES_DIR + "movies_5_10.csv")
cds_5_10 = pd.read_csv(EXTRA_CDS_DIR + "cds_5_10.csv")
books_5_10 = pd.read_csv(EXTRA_BOOKS_DIR + "books_5_10.csv")

#base domain 5_20 5_30

movies_5_20 = pd.read_csv(EXTRA_MOVIES_DIR + "movies_5_20.csv")
movies_5_30 = pd.read_csv(EXTRA_MOVIES_DIR + "movies_5_30.csv")
books_5_20 = pd.read_csv(EXTRA_BOOKS_DIR + "books_5_20.csv")
books_5_30 = pd.read_csv(EXTRA_BOOKS_DIR + "books_5_30.csv")
cds_5_20 = pd.read_csv(EXTRA_CDS_DIR + "cds_5_20.csv")
cds_5_30 = pd.read_csv(EXTRA_CDS_DIR + "cds_5_30.csv")

movies_10_20 = pd.read_csv(EXTRA_BOOKS_DIR + "movies_10_20.csv")
movies_10_30 = pd.read_csv(EXTRA_BOOKS_DIR + "movies_10_30.csv")
books_10_20 = pd.read_csv(EXTRA_MOVIES_DIR + "books_10_20.csv")
books_10_30 = pd.read_csv(EXTRA_MOVIES_DIR + "books_10_30.csv")
cds_10_20 = pd.read_csv(EXTRA_BOOKS_DIR + "cds_10_20.csv")
cds_10_30 = pd.read_csv(EXTRA_BOOKS_DIR + "cds_10_30.csv")

#target domain 10_20 10_30 con base 5_10
movies_510_1 = retrieve_user_data_from_dataframe(movies_5_10, books_1020['user_id'], path12)
movies_510_2 = retrieve_user_data_from_dataframe(movies_5_10, books_1030['user_id'], path13)
cds_510_1 = retrieve_user_data_from_dataframe(cds_5_10, books_1020['user_id'], path14)
cds_510_2 = retrieve_user_data_from_dataframe(cds_5_10, books_1030['user_id'], path15)
books_510_1 = retrieve_user_data_from_dataframe(books_5_10, movies_1020['user_id'], path16)
books_510_2 = retrieve_user_data_from_dataframe(books_5_10, movies_1030['user_id'], path17)
cds_510_3 = retrieve_user_data_from_dataframe(cds_5_10, movies_1020['user_id'], path18)
cds_510_4 = retrieve_user_data_from_dataframe(cds_5_10, movies_1030['user_id'], path19)
movies_510_3 = retrieve_user_data_from_dataframe(movies_5_10, cds_1020['user_id'], path20)
movies_510_4 = retrieve_user_data_from_dataframe(movies_5_10, cds_1030['user_id'], path21)
books_510_3 = retrieve_user_data_from_dataframe(books_5_10, cds_1020['user_id'], path22)
books_510_4 = retrieve_user_data_from_dataframe(books_5_10, cds_1030['user_id'], path23)

#target domain 10_20 10_30 con base 10_20 10_30

movies_520_1 = retrieve_user_data_from_dataframe(movies_5_20, books_10_20['user_id'], path12)
movies_520_2 = retrieve_user_data_from_dataframe(movies_5_20, books_10_30['user_id'], path13)
cds_520_1 = retrieve_user_data_from_dataframe(cds_5_20, books_10_20['user_id'], path14)
cds_520_2 = retrieve_user_data_from_dataframe(cds_5_20, books_10_30['user_id'], path15)
books_520_1 = retrieve_user_data_from_dataframe(books_5_20, movies_10_20['user_id'], path16)
books_520_2 = retrieve_user_data_from_dataframe(books_5_20, movies_10_30['user_id'], path17)
cds_520_3 = retrieve_user_data_from_dataframe(cds_5_20, movies_10_20['user_id'], path18)
cds_520_4 = retrieve_user_data_from_dataframe(cds_5_20, movies_10_30['user_id'], path19)
movies_520_3 = retrieve_user_data_from_dataframe(movies_5_20, cds_10_20['user_id'], path20)
movies_520_4 = retrieve_user_data_from_dataframe(movies_5_20, cds_10_30['user_id'], path21)
books_520_3 = retrieve_user_data_from_dataframe(books_5_20, cds_10_20['user_id'], path22)
books_520_4 = retrieve_user_data_from_dataframe(books_5_20, cds_10_30['user_id'], path23)

path12 = EXTRA_MOVIES_DIR + 'movies_5_30_1.csv' # books_10_20
path13 = EXTRA_MOVIES_DIR + 'movies_5_30_2.csv' # books_10_30
path14 = EXTRA_CDS_DIR + 'cds_5_30_1.csv' # books_10_20
path15 = EXTRA_CDS_DIR + 'cds_5_30_2.csv' # books_10_30
path16 = EXTRA_BOOKS_DIR + 'books_5_30_1.csv' #  movies_10_20
path17 = EXTRA_BOOKS_DIR + 'books_5_30_2.csv' # movies_10_30
path18 = EXTRA_CDS_DIR + 'cds_5_30_3.csv' # movies_10_20
path19 = EXTRA_CDS_DIR + 'cds_5_30_4.csv' # movies_10_30
path20 = EXTRA_MOVIES_DIR + 'movies_5_30_3.csv' # cds_10_20
path21 = EXTRA_MOVIES_DIR + 'movies_5_30_4.csv' # cds_10_30
path22 = EXTRA_BOOKS_DIR + 'books_5_30_3.csv' #  cds_10_20
path23 = EXTRA_BOOKS_DIR + 'books_5_30_4.csv' # cds_10_30

movies_530_1 = retrieve_user_data_from_dataframe(movies_5_30, books_10_20['user_id'], path12)
movies_530_2 = retrieve_user_data_from_dataframe(movies_5_30, books_10_30['user_id'], path13)
cds_530_1 = retrieve_user_data_from_dataframe(cds_5_30, books_10_20['user_id'], path14)
cds_530_2 = retrieve_user_data_from_dataframe(cds_5_30, books_10_30['user_id'], path15)
books_530_1 = retrieve_user_data_from_dataframe(books_5_30, movies_10_20['user_id'], path16)
books_530_2 = retrieve_user_data_from_dataframe(books_5_30, movies_10_30['user_id'], path17)
cds_530_3 = retrieve_user_data_from_dataframe(cds_5_30, movies_10_20['user_id'], path18)
cds_530_4 = retrieve_user_data_from_dataframe(cds_5_30, movies_10_30['user_id'], path19)
movies_530_3 = retrieve_user_data_from_dataframe(movies_5_30, cds_10_20['user_id'], path20)
movies_530_4 = retrieve_user_data_from_dataframe(movies_5_30, cds_10_30['user_id'], path21)
books_530_3 = retrieve_user_data_from_dataframe(books_5_30, cds_10_20['user_id'], path22)
books_530_4 = retrieve_user_data_from_dataframe(books_5_30, cds_10_30['user_id'], path23)'''

In [12]:
# Rows of cds_5_10__BOOKS for the users of the 8_10 cut and of the 10_10 cut.
path1 = f"{EXTRA_BOOKS_DIR}cds_8_10.csv"
path2 = f"{EXTRA_BOOKS_DIR}cds_10_10.csv"
books__cd810 = retrieve_user_data_from_dataframe(
    main_df=cds_5_10__BOOKS,
    user_list=cds_5_10__BOOKS_8_10['user_id'],
    path_store_data=path1,
)
books__cd10 = retrieve_user_data_from_dataframe(
    main_df=cds_5_10__BOOKS,
    user_list=cds_5_10__BOOKS_10_10['user_id'],
    path_store_data=path2,
)

# Same two extractions for movies_5_10__BOOKS (path1/path2 are reused).
path1 = f"{EXTRA_BOOKS_DIR}movies_8_10.csv"
path2 = f"{EXTRA_BOOKS_DIR}movies_10_10.csv"
books__movies810 = retrieve_user_data_from_dataframe(
    main_df=movies_5_10__BOOKS,
    user_list=movies_5_10__BOOKS_8_10['user_id'],
    path_store_data=path1,
)
books__movies10 = retrieve_user_data_from_dataframe(
    main_df=movies_5_10__BOOKS,
    user_list=movies_5_10__BOOKS_10_10['user_id'],
    path_store_data=path2,
)

The current dataframe contains 53869 rows!
It contains 5494 users!
It contains 16237 items!
Storing dataframe...
Duplicates have been found!
The current dataframe contains 43057 rows!
It contains 5494 users!
It contains 16237 items!

The dataframe has been stored!
The current dataframe contains 73446 rows!
It contains 6278 users!
It contains 19045 items!
Storing dataframe...
Duplicates have been found!
The current dataframe contains 58560 rows!
It contains 6278 users!
It contains 19045 items!

The dataframe has been stored!
The current dataframe contains 56438 rows!
It contains 5657 users!
It contains 41402 items!
Storing dataframe...
Duplicates have been found!
The current dataframe contains 56434 rows!
It contains 5657 users!
It contains 41402 items!

The dataframe has been stored!
The current dataframe contains 80557 rows!
It contains 6620 users!
It contains 55394 items!
Storing dataframe...
Duplicates have been found!
The current dataframe contains 80551 rows!
It contains 6620 user

'path1 = EXTRA_BOOKS_DIR + "cds_8_10.csv"\npath2 = EXTRA_BOOKS_DIR + "cds_10_10.csv"\n\nbooks__cd810 = retrieve_user_data_from_dataframe(cds_5_10__BOOKS, cds_5_10__BOOKS_8_10[\'user_id\'], path1)\nbooks__cd10 = retrieve_user_data_from_dataframe(cds_5_10__BOOKS, cds_5_10__BOOKS_10_10[\'user_id\'], path2)\n\npath1 = EXTRA_BOOKS_DIR + "movies_8_10.csv"\npath2 = EXTRA_BOOKS_DIR + "movies_10_10.csv"\n\nbooks__movies810 = retrieve_user_data_from_dataframe(movies_5_10__BOOKS, movies_5_10__BOOKS_8_10[\'user_id\'], path1)\nbooks__movies10 = retrieve_user_data_from_dataframe(movies_5_10__BOOKS, movies_5_10__BOOKS_10_10[\'user_id\'], path2)'

# CDs

In [47]:
# Load the two CSV files stored under CDS35_DIR.
books_5_10__CDS = pd.read_csv(f"{CDS35_DIR}books5_10.csv")
movies_5_10__CDS = pd.read_csv(f"{CDS35_DIR}movies_5_10.csv")

In [48]:
# Number of reviews per user in each CDs-section frame.
books__CDS_grouped = group_data_by_column(
    df=books_5_10__CDS, column_to_keep='user_id', column_to_group_data='item_id')
movies__CDS_grouped = group_data_by_column(
    df=movies_5_10__CDS, column_to_keep='user_id', column_to_group_data='item_id')

# Users with 8 to 10 reviews (bounds inclusive).
books_5_10__CDS_8_10 = cut_at_x_y(
    x_val=8, y_val=10, df=books__CDS_grouped, target_column='no_reviews_user')
movies_5_10__CDS_8_10 = cut_at_x_y(
    x_val=8, y_val=10, df=movies__CDS_grouped, target_column='no_reviews_user')

# Users with exactly 10 reviews.
books_5_10__CDS_10_10 = cut_at_x_y(
    x_val=10, y_val=10, df=books__CDS_grouped, target_column='no_reviews_user')
movies_5_10__CDS_10_10 = cut_at_x_y(
    x_val=10, y_val=10, df=movies__CDS_grouped, target_column='no_reviews_user')

>>>>> Cut@8_10 Scenario <<<<<
We have a total of 459 users split as follows:
- 179 users with 8 reviews
- 150 users with 9 reviews
- 130 users with 10 reviews
>>>>> Cut@8_10 Scenario <<<<<
We have a total of 413 users split as follows:
- 169 users with 8 reviews
- 132 users with 9 reviews
- 112 users with 10 reviews
>>>>> Cut@10_10 Scenario <<<<<
We have a total of 130 users split as follows:
- 130 users with 10 reviews
>>>>> Cut@10_10 Scenario <<<<<
We have a total of 112 users split as follows:
- 112 users with 10 reviews


In [49]:
# Rows of books_5_10__CDS for the users of the 8_10 cut and of the 10_10 cut.
path1 = f"{EXTRA_CDS_DIR}books_8_10.csv"
path2 = f"{EXTRA_CDS_DIR}books_10_10.csv"
cds__books810 = retrieve_user_data_from_dataframe(
    main_df=books_5_10__CDS,
    user_list=books_5_10__CDS_8_10['user_id'],
    path_store_data=path1,
)
cds__books10 = retrieve_user_data_from_dataframe(
    main_df=books_5_10__CDS,
    user_list=books_5_10__CDS_10_10['user_id'],
    path_store_data=path2,
)

# Same two extractions for movies_5_10__CDS (path1/path2 are reused).
path1 = f"{EXTRA_CDS_DIR}movies_8_10.csv"
path2 = f"{EXTRA_CDS_DIR}movies_10_10.csv"
cds__movies810 = retrieve_user_data_from_dataframe(
    main_df=movies_5_10__CDS,
    user_list=movies_5_10__CDS_8_10['user_id'],
    path_store_data=path1,
)
cds__movies10 = retrieve_user_data_from_dataframe(
    main_df=movies_5_10__CDS,
    user_list=movies_5_10__CDS_10_10['user_id'],
    path_store_data=path2,
)

The current dataframe contains 4082 rows!
It contains 459 users!
It contains 3926 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 1300 rows!
It contains 130 users!
It contains 1286 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 3660 rows!
It contains 413 users!
It contains 2891 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 1120 rows!
It contains 112 users!
It contains 1020 items!
Storing dataframe...
The dataframe has been stored!


# Movies

In [51]:
# Load the two CSV files stored under MOVIES35_DIR.
books_5_10__MOVIES = pd.read_csv(f"{MOVIES35_DIR}books5_10.csv")
cds_5_10__MOVIES = pd.read_csv(f"{MOVIES35_DIR}cds_5_10.csv")

In [52]:
# Number of reviews per user in each Movies-section frame.
books__MOVIES_grouped = group_data_by_column(books_5_10__MOVIES, 'user_id', 'item_id')
# Named after its source frame (cds_5_10__MOVIES); this variable used to be
# called `movies__MOVIES_grouped`, which did not match the data it holds.
cds__MOVIES_grouped = group_data_by_column(cds_5_10__MOVIES, 'user_id', 'item_id')

# Users with 8 to 10 reviews (bounds inclusive).
books_5_10__MOVIES_8_10 = cut_at_x_y(8, 10, books__MOVIES_grouped, 'no_reviews_user')
cds_5_10__MOVIES_8_10 = cut_at_x_y(8, 10, cds__MOVIES_grouped, 'no_reviews_user')

# Users with exactly 10 reviews.
books_5_10__MOVIES_10_10 = cut_at_x_y(10, 10, books__MOVIES_grouped, 'no_reviews_user')
cds_5_10__MOVIES_10_10 = cut_at_x_y(10, 10, cds__MOVIES_grouped, 'no_reviews_user')

>>>>> Cut@8_10 Scenario <<<<<
We have a total of 461 users split as follows:
- 192 users with 8 reviews
- 144 users with 9 reviews
- 125 users with 10 reviews
>>>>> Cut@8_10 Scenario <<<<<
We have a total of 371 users split as follows:
- 176 users with 8 reviews
- 109 users with 9 reviews
- 86 users with 10 reviews
>>>>> Cut@10_10 Scenario <<<<<
We have a total of 125 users split as follows:
- 125 users with 10 reviews
>>>>> Cut@10_10 Scenario <<<<<
We have a total of 86 users split as follows:
- 86 users with 10 reviews


In [54]:
# Rows of books_5_10__MOVIES for the users of the 8_10 cut and of the 10_10 cut.
path1 = f"{EXTRA_MOVIES_DIR}books_8_10.csv"
path2 = f"{EXTRA_MOVIES_DIR}books_10_10.csv"
movies__books810 = retrieve_user_data_from_dataframe(
    main_df=books_5_10__MOVIES,
    user_list=books_5_10__MOVIES_8_10['user_id'],
    path_store_data=path1,
)
movies__books10 = retrieve_user_data_from_dataframe(
    main_df=books_5_10__MOVIES,
    user_list=books_5_10__MOVIES_10_10['user_id'],
    path_store_data=path2,
)

# Same two extractions for cds_5_10__MOVIES (path1/path2 are reused).
path1 = f"{EXTRA_MOVIES_DIR}cds_8_10.csv"
path2 = f"{EXTRA_MOVIES_DIR}cds_10_10.csv"
movies__cds810 = retrieve_user_data_from_dataframe(
    main_df=cds_5_10__MOVIES,
    user_list=cds_5_10__MOVIES_8_10['user_id'],
    path_store_data=path1,
)
movies__cds10 = retrieve_user_data_from_dataframe(
    main_df=cds_5_10__MOVIES,
    user_list=cds_5_10__MOVIES_10_10['user_id'],
    path_store_data=path2,
)

The current dataframe contains 4082 rows!
It contains 461 users!
It contains 3897 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 1250 rows!
It contains 125 users!
It contains 1228 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 3249 rows!
It contains 371 users!
It contains 2971 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 860 rows!
It contains 86 users!
It contains 836 items!
Storing dataframe...
The dataframe has been stored!


# Base Domain

In [56]:
# Load two frames from each *3_5 directory and report their row counts.
books_3_5__1 = pd.read_csv(f"{BOOKS35_DIR}books_3_5__1.csv")
books_3_5__2 = pd.read_csv(f"{BOOKS35_DIR}books_3_5__2.csv")
books_report = (f"Books DFs len:\n"
                f"- books_3_5__1 {len(books_3_5__1)} || "
                f"- books_3_5__2 {len(books_3_5__2)}")
print(books_report)

cds_3_5__1 = pd.read_csv(f"{CDS35_DIR}cds_3_5__1.csv")
cds_3_5__3 = pd.read_csv(f"{CDS35_DIR}cds_3_5__3.csv")
cds_report = (f"CDs DFs len:\n"
              f"- cds_3_5__1 {len(cds_3_5__1)} || "
              f"- cds_3_5__3 {len(cds_3_5__3)}")
print(cds_report)

movies_3_5__1 = pd.read_csv(f"{MOVIES35_DIR}movies_3_5__1.csv")
movies_3_5__3 = pd.read_csv(f"{MOVIES35_DIR}movies_3_5__3.csv")
movies_report = (f"Movies DFs len:\n"
                 f"- movies_3_5__1 {len(movies_3_5__1)} || "
                 f"- movies_3_5__3 {len(movies_3_5__3)}")
print(movies_report)

Books DFs len:
- books_3_5__1 3397 || - books_3_5__2 3030
CDs DFs len:
- cds_3_5__1 4958 || - cds_3_5__3 4853
Movies DFs len:
- movies_3_5__1 4815 || - movies_3_5__3 4794


In [32]:
# books_3_5__1 restricted to the users present in the CDs 8_10 / 10_10 extractions.
path1 = f"{EXTRA_BOOKS_DIR}books_3_5__4.csv"  # 8_10 cds
path2 = f"{EXTRA_BOOKS_DIR}books_3_5__5.csv"  # 10_10 cds
books_3_5__4 = retrieve_user_data_from_dataframe(
    main_df=books_3_5__1,
    user_list=books__cd810['user_id'],
    path_store_data=path1,
)
books_3_5__5 = retrieve_user_data_from_dataframe(
    main_df=books_3_5__1,
    user_list=books__cd10['user_id'],
    path_store_data=path2,
)

# books_3_5__2 restricted to the users present in the Movies 8_10 / 10_10 extractions.
path1 = f"{EXTRA_BOOKS_DIR}books_3_5__6.csv"  # 8_10 movies
path2 = f"{EXTRA_BOOKS_DIR}books_3_5__7.csv"  # 10_10 movies
books_3_5__6 = retrieve_user_data_from_dataframe(
    main_df=books_3_5__2,
    user_list=books__movies810['user_id'],
    path_store_data=path1,
)
books_3_5__7 = retrieve_user_data_from_dataframe(
    main_df=books_3_5__2,
    user_list=books__movies10['user_id'],
    path_store_data=path2,
)

The current dataframe contains 1088 rows!
It contains 262 users!
It contains 1061 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 306 rows!
It contains 74 users!
It contains 302 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 1033 rows!
It contains 246 users!
It contains 1005 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 313 rows!
It contains 75 users!
It contains 309 items!
Storing dataframe...
The dataframe has been stored!


In [50]:
# cds_3_5__1 restricted to the users present in the Books 8_10 / 10_10 extractions.
path1 = EXTRA_CDS_DIR + "cds_3_5__5.csv"  # 8_10 books
path2 = EXTRA_CDS_DIR + "cds_3_5__6.csv"  # 10_10 books 

cds_3_5__5 = retrieve_user_data_from_dataframe(cds_3_5__1, cds__books810['user_id'], path1)
cds_3_5__6 = retrieve_user_data_from_dataframe(cds_3_5__1, cds__books10['user_id'], path2)

# cds_3_5__3 restricted to the users present in the Movies 8_10 / 10_10 extractions.
path1 = EXTRA_CDS_DIR + "cds_3_5__7.csv"  # 8_10 movies
path2 = EXTRA_CDS_DIR + "cds_3_5__8.csv"  # 10_10 movies

# Results are bound to cds_* names matching the files written; they were
# previously assigned to books_3_5__7 / books_3_5__8, which overwrote the
# Books frame of the same name with CDs data.
cds_3_5__7 = retrieve_user_data_from_dataframe(cds_3_5__3, cds__movies810['user_id'], path1)
cds_3_5__8 = retrieve_user_data_from_dataframe(cds_3_5__3, cds__movies10['user_id'], path2)

The current dataframe contains 1846 rows!
It contains 459 users!
It contains 1733 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 528 rows!
It contains 130 users!
It contains 521 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 1643 rows!
It contains 413 users!
It contains 1558 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 455 rows!
It contains 112 users!
It contains 449 items!
Storing dataframe...
The dataframe has been stored!


In [55]:
# movies_3_5__1 restricted to the users present in the Books 8_10 / 10_10 extractions.
path1 = f"{EXTRA_MOVIES_DIR}movies_3_5__5.csv"  # 8_10 books
path2 = f"{EXTRA_MOVIES_DIR}movies_3_5__6.csv"  # 10_10 books
movies_3_5__5 = retrieve_user_data_from_dataframe(
    main_df=movies_3_5__1,
    user_list=movies__books810['user_id'],
    path_store_data=path1,
)
movies_3_5__6 = retrieve_user_data_from_dataframe(
    main_df=movies_3_5__1,
    user_list=movies__books10['user_id'],
    path_store_data=path2,
)

# movies_3_5__3 restricted to the users present in the CDs 8_10 / 10_10 extractions.
path1 = f"{EXTRA_MOVIES_DIR}movies_3_5__7.csv"  # 8_10 cds
path2 = f"{EXTRA_MOVIES_DIR}movies_3_5__8.csv"  # 10_10 cds
movies_3_5__7 = retrieve_user_data_from_dataframe(
    main_df=movies_3_5__3,
    user_list=movies__cds810['user_id'],
    path_store_data=path1,
)
movies_3_5__8 = retrieve_user_data_from_dataframe(
    main_df=movies_3_5__3,
    user_list=movies__cds10['user_id'],
    path_store_data=path2,
)

The current dataframe contains 1808 rows!
It contains 461 users!
It contains 1569 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 501 rows!
It contains 125 users!
It contains 473 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 1457 rows!
It contains 371 users!
It contains 1259 items!
Storing dataframe...
The dataframe has been stored!
The current dataframe contains 337 rows!
It contains 86 users!
It contains 323 items!
Storing dataframe...
The dataframe has been stored!
