# Mapping and Parsing
In this notebook we map each users' True Ranking items to their rating, assigning:

- 0 to all ratings from 1 to 4 
- 1 to ratings equal to 5

For example:

- for the user "A3HACSQO6Y6YOX"
- we have ['B000021YBS', 'B00004T8RK', 'B000021XR5', 'B00000J7S8', 'B00000ILQQ'] as TrueRanking
- knowing that he rated the items as follows: 5, 5, 5, 5, 5
- we obtain [1, 1, 1, 1, 1] as TrueRanking_Mapping

 Moreover, we will check and fix any type of issues in Explanations computed during the primary tests.

In [13]:
import pandas as pd
import re
import os
import sys
sys.path.append('src')

In [14]:
def get_root_dir_path(curr_working_dir, no_folders_above):
    dir_path = curr_working_dir
    for i in range(no_folders_above):
        dir_path = os.path.dirname(dir_path)
    print(f"Initial directory: {curr_working_dir}\n"
          f"Root directory: {dir_path}."
          )
    return dir_path

def load_data_pickle_csv(data_to_load):

    print(f"The following files be loaded:\n"
          f"- {data_to_load[0]}.pkl\n"
          f"- {data_to_load[1]}.csv\n"
          f"===================\nLoading datasets.."
          )
    
    results_df = pd.read_pickle(data_to_load[0] + ".pkl")
    target_domain_df = pd.read_csv(data_to_load[1] + ".csv")

    print(f"Dropping some columns from the target domain df..")
    target_domain_df.drop(columns=["timestamp", "title", "brand", "category"], inplace=True)

    print(f"Datasets loaded!\n===================")

    return results_df, target_domain_df

def map_ranking_to_user_rating(results_data, target_domain_data, true_ranking_mapping):
    
    print(f"Starting computation....")
    for i in range(len(results_data)):

        user_true_rating_map = []
        
        # Data from the results <<< .pkl file
        curr_user = results_data['UserId'].iloc[i]
        curr_user_true_ranking = results_data['TrueRanking'].iloc[i]

        # Data from the processed <<< .csv file
        curr_user_ratings = target_domain_data[target_domain_data['user_id'] == curr_user]

        # Map True Ranking
        for item_id in curr_user_true_ranking:
            
            item_rating = curr_user_ratings[curr_user_ratings['item_id'] == item_id]

            if item_rating['rating'].values[0] == 5:
                user_true_rating_map.append(1)
            else:
                user_true_rating_map.append(0)
        
        true_ranking_mapping.append(user_true_rating_map) 

    print(f"Computation ended!\n{len(true_ranking_mapping)} mappings have been computed!")

def find_df_rows_by_sentence(df, column_to_search="Explanation", sentence="Items ranking:"):
    return df[df[column_to_search].str.startswith(sentence)]

In [16]:
COLUMNS_TYPES =  {
        'user_id': 'string',
        'item_id': 'string',
        'rating': 'int',
        'timestamp': 'string',
        'title': 'string',
        'brand': 'string',
        'category': 'string'
    }

ROOT_DIR = get_root_dir_path(os.getcwd(), 2)

# ==================== DATA DIRECTORIES ==================== #
DATA_DIR = ROOT_DIR + '/data/processed/'

BOOKS35_DIR = DATA_DIR + '/Books3_5/'
CDS35_DIR = DATA_DIR + '/CDs3_5/'
MOVIES35_DIR = DATA_DIR + '/Movies3_5/'

# ==================== PREDICTION DIRECTORIES ==================== #
PREDICTIONS_DIR = ROOT_DIR + '/models/predictions/'

BOOKS35_CDS510_DIR = PREDICTIONS_DIR + '/books35_cds510/'
BOOKS35_MOVIES510_DIR = PREDICTIONS_DIR + '/books35_movies510/'

CDS35_BOOKS510_DIR = PREDICTIONS_DIR + '/cds35_books510/'
CDS35_MOVIES510_DIR = PREDICTIONS_DIR + '/cds35_movies510/'

MOVIES35_BOOKS510_DIR = PREDICTIONS_DIR + '/movies35_books510/'
MOVIES35_CDS510_DIR = PREDICTIONS_DIR + '/movies35_cds510/'

Initial directory: c:\Users\ivanr\Desktop\LLM_Github\CrossDomain_RecSys_LLM\src\data
Root directory: c:\Users\ivanr\Desktop\LLM_Github\CrossDomain_RecSys_LLM.


In [17]:
PICKLE_DICT = {

    "books35_cds510_0s": [BOOKS35_CDS510_DIR + "books35_cds510_GPT_0s", BOOKS35_DIR + "cds_5_10"], 
    "books35_cds510_1s": [BOOKS35_CDS510_DIR + "books35_cds510_GPT_1s", BOOKS35_DIR + "cds_5_10"], 

    "books35_movies510_0s": [BOOKS35_MOVIES510_DIR + "books35_movies510_GPT_0s", BOOKS35_DIR + "movies_5_10"], 
    "books35_movies510_1s": [BOOKS35_MOVIES510_DIR + "books35_movies510_GPT_1s", BOOKS35_DIR + "movies_5_10"], 

    "cds35_books510_0s": [CDS35_BOOKS510_DIR + "cds35_books510_GPT_0s", CDS35_DIR + "books5_10"], 
    "cds35_books510_1s": [CDS35_BOOKS510_DIR + "cds35_books510_GPT_1s", CDS35_DIR + "books5_10"], 

    "cds35_movies510_0s": [CDS35_MOVIES510_DIR + "cds35_movies510_GPT_0s", CDS35_DIR + "movies_5_10"], 
    "cds35_movies510_1s": [CDS35_MOVIES510_DIR + "cds35_movies510_GPT_1s", CDS35_DIR + "movies_5_10"], 

    "movies35_books510_0s": [MOVIES35_BOOKS510_DIR + "movies35_books510_GPT_0s", MOVIES35_DIR + "books5_10"], 
    "movies35_books510_1s": [MOVIES35_BOOKS510_DIR + "movies35_books510_GPT_1s", MOVIES35_DIR + "books5_10"],

    "movies35_cds510_0s": [MOVIES35_CDS510_DIR + "movies35_cds510_GPT_0s", MOVIES35_DIR + "cds_5_10"], 
    "movies35_cds510_1s": [MOVIES35_CDS510_DIR + "movies35_cds510_GPT_1s", MOVIES35_DIR + "cds_5_10"],  

}

all_scenario = ['books35_cds510_0s', 'books35_cds510_1s',           # 0,1
                'books35_movies510_0s', 'books35_movies510_1s',     # 2,3
                'cds35_books510_0s', 'cds35_books510_1s',           # 4,5
                'cds35_movies510_0s', 'cds35_movies510_1s',         # 6,7
                'movies35_books510_0s', 'movies35_books510_1s',     # 8,9
                'movies35_cds510_0s', 'movies35_cds510_1s'          # 10,11      
                ]

# Compute TrueRanking Mapping

In [80]:
def map_ranking_rating():
      
      for scenario in all_scenario:

            print(f"Current scenario >>>> {scenario} <<<<")
            scenario_datasets = PICKLE_DICT[scenario]

            res_df, target_dom_df = load_data_pickle_csv(scenario_datasets)
            true_ranking_mapping_lists = []

            map_ranking_to_user_rating(res_df, target_dom_df, true_ranking_mapping_lists)

            res_df['TrueRanking_Mapping'] = true_ranking_mapping_lists

            res_df.to_pickle(scenario_datasets[0] + ".pkl")

            print(f"Completed scenario: {scenario}.\n=====================\n")
      
      print(f"All scenarios have been completed!")

In [81]:
map_ranking_rating()

Current scenario >>>> books35_cds510_0s <<<<
The following files be loaded:
- c:\Users\ivanr\Desktop\LLM_Github\CrossDomain_RecSys_LLM/models/predictions//books35_cds510/books35_cds510_GPT_0s.pkl
- c:\Users\ivanr\Desktop\LLM_Github\CrossDomain_RecSys_LLM/data/processed//Books3_5/cds_5_10.csv
Loading datasets..
Dropping some columns from the target domain df..
Datasets loaded!
Starting computation....
Computation ended!
825 mappings have been computed!
Completed scenario: books35_cds510_0s.

Current scenario >>>> books35_cds510_1s <<<<
The following files be loaded:
- c:\Users\ivanr\Desktop\LLM_Github\CrossDomain_RecSys_LLM/models/predictions//books35_cds510/books35_cds510_GPT_1s.pkl
- c:\Users\ivanr\Desktop\LLM_Github\CrossDomain_RecSys_LLM/data/processed//Books3_5/cds_5_10.csv
Loading datasets..
Dropping some columns from the target domain df..
Datasets loaded!
Starting computation....
Computation ended!
825 mappings have been computed!
Completed scenario: books35_cds510_1s.

Current 

In [79]:
# # Data Scenario 
# scenario_to_use = all_scenario[0]
# scenario_datasets = PICKLE_DICT[scenario_to_use]
# 
# # Results DF and Target Domain DF 
# true_ranking_mapping_lists = []
# 
# res_df, target_dom_df = load_data_pickle_csv(scenario_datasets)
# res_df.head(3)
# 
# # Mapping Ranked Items >> Items Rating
# map_ranking_to_user_rating(res_df, target_dom_df, true_ranking_mapping_lists)
# 
# # Store Mapping and Updated Results DF
# res_df['TrueRanking_Mapping'] = true_ranking_mapping_lists
# # res_df.to_pickle(scenario_datasets[0] + ".pkl")
# res_df.head(3)

# Check for Issues in Explanations

In [None]:
files_with_issues = []

for scenario in all_scenario:

    try:
        print(f"\n=======================================================")

        print(f"Current scenario >>>> {scenario} <<<<")
        scenario_datasets = PICKLE_DICT[scenario]

        res_df, _ = load_data_pickle_csv(scenario_datasets)

        rows_with_issue = find_df_rows_by_sentence(res_df)
        
        if len(rows_with_issue) > 0:
            print(f"=========== WATCH OUT ===========\n"
                  f"Current df has {len(rows_with_issue)} rows with issues!\n"
                  f"=================================================================\n"
                  )
            files_with_issues.append(scenario_datasets[0])
        else:
            print(f"No issues found.")

    except Exception as e:
        print(f"--------------->>> Error: file not found <<<---------------")

In [9]:
files_with_issues

['c:\\Users\\ivanr\\Desktop\\LLM_Github\\CrossDomain_RecSys_LLM/models/predictions//books35_movies510/books35_movies510_GPT_0s',
 'c:\\Users\\ivanr\\Desktop\\LLM_Github\\CrossDomain_RecSys_LLM/models/predictions//cds35_movies510/cds35_movies510_GPT_0s']

In [10]:
books35_movies510_0s_df = pd.read_pickle(files_with_issues[0] + '.pkl')
books35_movies510_0s_issues = find_df_rows_by_sentence(books35_movies510_0s_df)

cds35_movies510_0s_df = pd.read_pickle(files_with_issues[1] + '.pkl')
cds35_movies510_0s_issues = find_df_rows_by_sentence(cds35_movies510_0s_df)

In [23]:
def _extract_ranking_and_explanation(answer, split_token):
    """
        Extracts the item ranking and the explanation from the answer of the LLM.
    """
    
    sentences = answer.split(split_token)
    print(f"Numer of split sentences: {len(sentences)}")

    # Extract the explanation
    if len(sentences) > 1 and sentences[1]:
        
        if split_token == "\nExplanation: ":
            explanation = sentences[1]
        else:
            explanation_match = re.search(r"Explanation: (.+)", sentences[1])
            explanation = explanation_match.group(1).strip() if explanation_match else "" 
    else:
        print(f"ERROR!")
        explanation = answer

    return explanation

In [25]:
for i in range(len(books35_movies510_0s_issues)):
    print("======================")
    sentence = books35_movies510_0s_issues['Explanation'].iloc[i]

    if i == 0:
        split_tok = "\n"
    elif i == 1 or i == 3:
        split_tok = "\n\n"
    elif i == 2:
        split_tok = "\nExplanation: "

    explanation = _extract_ranking_and_explanation(sentence, split_tok)

    print(f"{i}° Explanation: {explanation}")
    books35_movies510_0s_issues['Explanation'].iloc[i] = explanation

Numer of split sentences: 2
0° Explanation: Based on the user's liked books, which are mainly in the categories of 'Books', 'Biographies & Memoirs', 'True Crime', 'Business & Money', 'Skills', 'Computers & Technology', 'Networking & Cloud Computing', and 'Politics & Social Sciences', the recommender system suggests movies that are in the categories of 'Movies & TV', 'Genre for Featured Categories', 'Drama', 'Westerns', 'Studio Specials', 'Lionsgate Home Entertainment', 'Hallmark Home Video', 'Paramount Home Entertainment', and 'All Lionsgate Titles'. These movies are likely to be of interest to the user as they share similar themes or genres with the liked books.
Numer of split sentences: 2
1° Explanation: Based on the user's liked books, which are mainly in the categories of 'Books', 'Christian Books & Bibles', 'Christian Living', and 'History', the recommended movies are selected from the candidate items based on their categories. The top 5 recommended movies have categories such as 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books35_movies510_0s_issues['Explanation'].iloc[i] = explanation


In [40]:
s = (cds35_movies510_0s_issues['Explanation'].iloc[0]).split("\nExplanation: ")[1]
cds35_movies510_0s_issues['Explanation'].iloc[0] = s

s = (cds35_movies510_0s_issues['Explanation'].iloc[1]).split("\nExplanation: ")[1]
cds35_movies510_0s_issues['Explanation'].iloc[1] = s

s = (cds35_movies510_0s_issues['Explanation'].iloc[2]).split("\nExplanation: ")[1]
cds35_movies510_0s_issues['Explanation'].iloc[2] = s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds35_movies510_0s_issues['Explanation'].iloc[0] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds35_movies510_0s_issues['Explanation'].iloc[1] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds35_movies510_0s_issues['Explanation'].iloc[2] = s


## Update Explanations

In [66]:
for i in range(len(books35_movies510_0s_issues)):

    user_id = books35_movies510_0s_issues['UserId'].iloc[i]
    parsed_explanation = books35_movies510_0s_issues['Explanation'].iloc[i]

    index_of_user = books35_movies510_0s_df.index[books35_movies510_0s_df['UserId'] == user_id].tolist()[0]
    print(f"Old Explanation: {books35_movies510_0s_df['Explanation'].iloc[index_of_user]}")
    books35_movies510_0s_df['Explanation'].iloc[index_of_user] = parsed_explanation
    print(f"New Explanation: {books35_movies510_0s_df['Explanation'].iloc[index_of_user]}"
          f"\n=====================================================\n"
          )   

Old Explanation: Items ranking: ['B00EU8RBXW', 'B000V5EYXS', 'B00HRYH6F4', 'B00008K77M', 'B00FOLGW48']
Explanation: Based on the user's liked books, which are mainly in the categories of 'Books', 'Biographies & Memoirs', 'True Crime', 'Business & Money', 'Skills', 'Computers & Technology', 'Networking & Cloud Computing', and 'Politics & Social Sciences', the recommender system suggests movies that are in the categories of 'Movies & TV', 'Genre for Featured Categories', 'Drama', 'Westerns', 'Studio Specials', 'Lionsgate Home Entertainment', 'Hallmark Home Video', 'Paramount Home Entertainment', and 'All Lionsgate Titles'. These movies are likely to be of interest to the user as they share similar themes or genres with the liked books.
New Explanation: Based on the user's liked books, which are mainly in the categories of 'Books', 'Biographies & Memoirs', 'True Crime', 'Business & Money', 'Skills', 'Computers & Technology', 'Networking & Cloud Computing', and 'Politics & Social Sciences'

In [68]:
for i in range(len(cds35_movies510_0s_issues)):

    user_id = cds35_movies510_0s_issues['UserId'].iloc[i]
    parsed_explanation = cds35_movies510_0s_issues['Explanation'].iloc[i]

    index_of_user = cds35_movies510_0s_df.index[cds35_movies510_0s_df['UserId'] == user_id].tolist()[0]
    print(f"Old Explanation: {cds35_movies510_0s_df['Explanation'].iloc[index_of_user]}")
    cds35_movies510_0s_df['Explanation'].iloc[index_of_user] = parsed_explanation
    print(f"New Explanation: {cds35_movies510_0s_df['Explanation'].iloc[index_of_user]}"
          f"\n=====================================================\n"
          )   

Old Explanation: Items ranking: [6305078599, 6304039565, 6302098440, B000HCO8KE, B000X07TLA]
Explanation: Based on the user's liked CDs, which include genres like Pop, Alternative Rock, and World Music, and the disliked movies, which include genres like Action, Adventure, and Comedy, I have re-ranked the candidate movies. 

The top recommended movie is "Cannibal Women in the Avocado Jungle of Death" (Id: 6305078599). This movie falls under the "Cult Movies" category, which aligns with the user's interest in alternative genres like Alternative Rock. 

The second recommended movie is "Ninth Configuration VHS" (Id: 6304039565). This movie falls under the "Studio Specials" category and has elements of drama, which aligns with the user's interest in different genres like Pop and World Music. 

The third recommended movie is "Prince Valiant VHS" (Id: 6302098440). Although it falls under the "Action" category, it is a Studio Specials movie and might have elements that appeal to the user's int

In [70]:
books35_movies510_0s_df.to_pickle(files_with_issues[0] + '.pkl')
cds35_movies510_0s_df.to_pickle(files_with_issues[1] + '.pkl')