In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import re
from sklearn.preprocessing import LabelEncoder


In [12]:
cd Downloads

/home/asehgal/Downloads


In [13]:
df = pd.read_csv("data-set(2).csv" ,sep=',')
df = df.dropna()
df = df.reset_index(drop=True)

In [14]:
df

Unnamed: 0,UserId,Actions,FilePath
0,7.0,PREVIEW,/Shared/Sample Docs/CIMs/CIM-03-Bar-Wash.pdf
1,8.0,DOWNLOAD,/Shared/Sample Docs/CIMs/CIM-01-Consolidated-...
2,5.0,PREVIEW,/Shared/Sample Docs/CIMs/CIM-02-American-Casi...
3,7.0,ADD_FILE,/Shared/Sample Docs/CIMs/CIM-06-Pizza-Hut.pdf
4,1.0,ADD_FILE,/Shared/Sample Docs/CIMs/CIM-04-Alcatel-Lucen...
...,...,...,...
654,5.0,ADD_FILE,/Shared/Sample Docs/Term Sheets/ILPA-Model-LP...
655,1.0,DOWNLOAD,/Shared/Sample Docs/Term Sheets/TERM_SHEET_EQ...
656,6.0,PREVIEW,/Shared/Sample Docs/Term Sheets/ILPA-Model-LP...
657,8.0,DOWNLOAD,/Shared/Sample Docs/Term Sheets/ILPA-Model-LP...


In [15]:
import random

def duplicate_rows_randomly(df, n_duplicates, n_delete_actions):
    duplicated_rows = pd.DataFrame(columns=df.columns)

    for user_id in df['UserId'].unique():
        user_data = df[df['UserId'] == user_id]

        # Duplicate existing rows
        for _ in range(n_duplicates):
            random_row = user_data.sample(n=1).iloc[0]
            duplicated_rows = pd.concat([duplicated_rows, random_row.to_frame().T], ignore_index=True)

        # Add 10 random file paths with action value set to "DELETE"
        for _ in range(n_delete_actions):
            random_file_path = "/Shared/FilePath/" + ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + ".pdf"
            delete_row = {'UserId': user_id, 'Actions': 'DELETE', 'FilePath': random_file_path}
            duplicated_rows = pd.concat([duplicated_rows, pd.DataFrame(delete_row, index=[0])], ignore_index=True)

    new_df = pd.concat([df, duplicated_rows], ignore_index=True)
    new_df['Actions'] = new_df['Actions'].str.strip()

    return new_df
new_df = duplicate_rows_randomly(df, n_duplicates=100, n_delete_actions=20)

In [23]:
df = new_df
df["Actions"].unique()

array(['PREVIEW', 'DOWNLOAD', 'ADD_FILE', 'DELETE'], dtype=object)

In [24]:
def process_data_and_fit_algo(df, user_id=None, actions=None, filepath=None):
    # If user_id, actions, and filepath are provided, concatenate them to df
    if user_id is not None and actions is not None and filepath is not None:
        new_data = {'UserId': [user_id], 'Actions': [actions], 'FilePath': [filepath]}
        new_df = pd.DataFrame(new_data)
        df = pd.concat([df, new_df], ignore_index=True)

    # Apply label encoding to 'Actions' column
    label_encoder = LabelEncoder()
    df['Actions_Encoded'] = label_encoder.fit_transform(df['Actions'])
    encoded_action = df[df["Actions"] == actions]["Actions_Encoded"].values[0]
    df["Actions"] = df['Actions_Encoded']
    
    df = df.drop(["Actions_Encoded"], axis = 1)

    df['Rating'] = df.groupby(['UserId', 'FilePath'])['FilePath'].transform('count')

    # Content-Based Filtering using TF-IDF
# Content-Based Filtering using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['FilePath'])
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Collaborative Filtering using Surprise library
    reader = Reader(rating_scale=(1, 10))
    data_surprise = Dataset.load_from_df(df[['UserId', 'FilePath', 'Rating']], reader)
    trainset, testset = train_test_split(data_surprise, test_size=0.2, random_state=42)


    algo = SVD()
    algo.fit(trainset)

    return df, algo, cosine_sim, encoded_action

In [31]:
def get_recommendations(df, user_id, user_given_file_path, user_actions):
    # Call the process_data_and_fit_algo function to preprocess data and fit collaborative filtering algorithm
    df, algo, cosine_sim, encoded_action = process_data_and_fit_algo(df, user_id=user_id, actions=user_actions, filepath=user_given_file_path)

    # Content-Based Filtering
    res = df[df['FilePath'].str.contains(user_given_file_path, case=False, regex=True)].index[0]

    sim_scores = list(enumerate(cosine_sim[res]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    excluded_file_path = df['FilePath'].iloc[res]
    sim_scores = [i for i in sim_scores if df['FilePath'].iloc[i[0]] != excluded_file_path]

    # Select top unique 5 similar files
    unique_similar_files = set()
    top_similar_files = []
    for i in sim_scores:
        if df['FilePath'].iloc[i[0]] not in unique_similar_files:
            unique_similar_files.add(df['FilePath'].iloc[i[0]])
            top_similar_files.append(i)
        if len(top_similar_files) == 5:
            break

    file_indices = [i[0] for i in top_similar_files]
    content_based_recommendations = df['FilePath'].iloc[file_indices].tolist()

    # Collaborative-Based Filtering
    
    # collab_based_recommendations = []
    # for item in df['FilePath'].unique():
    #     predicted_rating = algo.predict(user_id, item).est
    #     if predicted_rating > 7.0:
    #         collab_based_recommendations.append(item)
    
    collab_based_recommendations = []

    df_small = df[df["Actions"] == encoded_action]
    for item in df_small['FilePath'].unique():
        # actions_match = set(df[df['FilePath'] == item]['Actions'])
        predicted_rating = algo.predict(user_id, item).est

        if predicted_rating > 5.0:
            collab_based_recommendations.append(item)


    return content_based_recommendations, collab_based_recommendations, encoded_action



In [32]:
user_id = 1
content_based_recs, collab_based_recs, encoded_action = get_recommendations( df,user_id, "NDA","DOWNLOAD")
print(f"Content-Based Recommendations for User {user_id}: {content_based_recs}")
print(f"Collaborative Filtering Recommendations for User {user_id}: {collab_based_recs}")

Content-Based Recommendations for User 1: ['NDA', ' /Shared/Sample Docs/NDA/Bublup_mNDA.docx', ' /Shared/Sample Docs/NDA/Reciprocal NDA.doc', ' /Shared/Sample Docs/NDA/Cocoon Data Holdings Pty Ltd - NDA.docx', ' /Shared/Sample Docs/NDA/Tenna NDA - 2022.doc']
Collaborative Filtering Recommendations for User 1: [' /Shared/Sample Docs/CIMs/CIM-06-Pizza-Hut.pdf', ' /Shared/Sample Docs/CIMs/CIM-04-Alcatel-Lucent.pdf', ' /Shared/Sample Docs/CIMs/CIM-02-American-Casino.pdf', ' /Shared/Sample Docs/IMAS/GoldmanSachs - IMA.pdf', ' /Shared/Sample Docs/Investor Suitability Questionnaire/Suitability_Assessment_Form_Corporate.pdf', ' /Shared/Sample Docs/IPSs/IPS - Sample1.pdf', ' /Shared/Sample Docs/IPSs/Sample-Investment-Policy-Statement.pdf', ' /Shared/Sample Docs/LPAs/ILPA-Model-Limited-Partnership-Agreement-WOF.pdf', ' /Shared/Sample Docs/PPMs/PPM-IIFCL-MF-FINAL-23.03.2017-FOR-WEBSITE-1.pdf', ' /Shared/Sample Docs/PPMs/PPM - Blackcommerce LLC.pdf', ' /Shared/Sample Docs/PPMs/private-placement-me

In [33]:
user_id = 1
content_based_recs, collab_based_recs, encoded_action = get_recommendations( df,user_id, "NDA","PREVIEW")
print(f"Content-Based Recommendations for User {user_id}: {content_based_recs}")
print(f"Collaborative Filtering Recommendations for User {user_id}: {collab_based_recs}")

Content-Based Recommendations for User 1: ['NDA', ' /Shared/Sample Docs/NDA/Bublup_mNDA.docx', ' /Shared/Sample Docs/NDA/Reciprocal NDA.doc', ' /Shared/Sample Docs/NDA/Cocoon Data Holdings Pty Ltd - NDA.docx', ' /Shared/Sample Docs/NDA/Tenna NDA - 2022.doc']
Collaborative Filtering Recommendations for User 1: [' /Shared/Sample Docs/CIMs/CIM-02-American-Casino.pdf', ' /Shared/Sample Docs/CIMs/CIM-04-Alcatel-Lucent.pdf', ' /Shared/Sample Docs/CIMs/CIM-06-Pizza-Hut.pdf', ' /Shared/Sample Docs/IMAS/GoldmanSachs - IMA.pdf', ' /Shared/Sample Docs/Investor Suitability Questionnaire/Suitability_Assessment_Form_Corporate.pdf', ' /Shared/Sample Docs/IPSs/IPS - Sample1.pdf', ' /Shared/Sample Docs/IPSs/Sample-Investment-Policy-Statement.pdf', ' /Shared/Sample Docs/LPAs/ILPA-Model-Limited-Partnership-Agreement-WOF.pdf', ' /Shared/Sample Docs/PPMs/PPM-IIFCL-MF-FINAL-23.03.2017-FOR-WEBSITE-1.pdf', ' /Shared/Sample Docs/PPMs/private-placement-memorandum.pdf', ' /Shared/Sample Docs/PPMs/PPM - Blackcomm

In [34]:
user_id = 1
content_based_recs, collab_based_recs, encoded_action = get_recommendations( df,user_id, "NDA","DELETE")
print(f"Content-Based Recommendations for User {user_id}: {content_based_recs}")
print(f"Collaborative Filtering Recommendations for User {user_id}: {collab_based_recs}")

Content-Based Recommendations for User 1: ['NDA', ' /Shared/Sample Docs/NDA/Bublup_mNDA.docx', ' /Shared/Sample Docs/NDA/Reciprocal NDA.doc', ' /Shared/Sample Docs/NDA/Cocoon Data Holdings Pty Ltd - NDA.docx', ' /Shared/Sample Docs/NDA/Tenna NDA - 2022.doc']
Collaborative Filtering Recommendations for User 1: ['/Shared/FilePath/tdkzxqxsou.pdf', '/Shared/FilePath/kjssfaztst.pdf', '/Shared/FilePath/yaejacmahq.pdf', '/Shared/FilePath/chmhevbxbe.pdf']
