In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

In [5]:
cd Downloads

/home/asehgal/Downloads


### Data Loading

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l|

In [None]:
# Load the raw data and discard every row that has a missing value.
df = pd.read_csv("data-set.csv").dropna()

In [None]:
# Renumber the rows 0..n-1 and discard the old index, which has gaps after dropna().
df = df.reset_index(drop=True)

In [None]:
def duplicate_rows_randomly(df, n_duplicates, random_state=None):
    """Append randomly chosen copies of each user's rows to ``df``.

    For every distinct value in ``df['UserId']``, ``n_duplicates`` rows are
    drawn with replacement from that user's rows and appended after the
    original rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``'UserId'`` column. It is not modified.
    n_duplicates : int
        Number of rows to sample per user. Zero or a negative value adds
        nothing.
    random_state : int, numpy RandomState/Generator, or None, default None
        Seed (or generator) forwarded to pandas' sampler. ``None`` keeps the
        original unseeded behaviour; pass an int for reproducible output.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the sampled rows, with a fresh
        0..n-1 index.
    """
    # Nothing to sample: return an independent, re-indexed copy, which is what
    # concatenating with an empty frame produced before.
    if n_duplicates <= 0 or df.empty:
        return df.reset_index(drop=True)

    # One vectorised draw per user group replaces the per-row Python loop.
    # replace=True mirrors the original, which drew one row at a time and
    # could therefore pick the same row repeatedly.
    duplicated_df = df.groupby('UserId', sort=False).sample(
        n=n_duplicates, replace=True, random_state=random_state
    )
    return pd.concat([df, duplicated_df], ignore_index=True)

# Example usage:
# The function groups on the 'UserId' column; all other columns are carried along unchanged.
# n_duplicates is the number of randomly sampled rows appended per user (not per row).
new_df = duplicate_rows_randomly(df, n_duplicates=100)

In [None]:
# From here on `df` refers to the augmented frame (original rows plus the sampled duplicates).
df = new_df

### Creating Features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct value of 'Actions' with an integer code. The fitted
# encoder is kept so the codes can be mapped back via label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(df['Actions'])
df['Actions'] = label_encoder.transform(df['Actions'])

In [None]:
# Implicit rating: the number of rows recorded for each (UserId, FilePath) pair,
# broadcast back onto every row of that pair.
df['Rating'] = df.groupby(['UserId', 'FilePath'])['FilePath'].transform('count')

In [None]:
# Inspect which distinct rating values (interaction counts) occur in the data.
df["Rating"].unique()

In [None]:
# Content-Based Filtering using TF-IDF
# Each row's file path is treated as a short text document.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['FilePath'])
# Row-by-row similarity matrix; with a single argument linear_kernel compares
# the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Collaborative Filtering using Surprise library
# 'Rating' is an interaction count, so its range depends on the data. The scale
# is taken from the observed values instead of a fixed (1, 10), so that
# predictions are clipped to a range that matches the data.
rating_scale = (df['Rating'].min(), df['Rating'].max())
reader = Reader(rating_scale=rating_scale)
data_surprise = Dataset.load_from_df(df[['UserId', 'FilePath', 'Rating']], reader)
# Hold out 20% of the ratings; the seed keeps the split reproducible.
trainset, testset = train_test_split(data_surprise, test_size=0.2, random_state=42)


In [None]:
# SVD starts from randomly initialised factors; seeding it (with the same seed
# as the train/test split) makes the fitted model reproducible across runs.
algo = SVD(random_state=42)
algo.fit(trainset)

In [None]:
# Stand-alone collaborative-filtering check for one example user.
# `user_id` is assigned here because no earlier cell defines it; without this
# the cell raises NameError on a fresh kernel.
user_id = 1
# Keep every distinct file whose predicted rating for this user exceeds 7.0.
collab_based_recommendations = [
    item
    for item in df['FilePath'].unique()
    if algo.predict(user_id, item).est > 7.0
]

In [None]:
def get_recommendations(user_id, top_n=5, rating_threshold=9.0):
    """Return content-based and collaborative recommendations for one user.

    Uses the notebook-level objects ``df`` (interaction rows), ``cosine_sim``
    (row-by-row similarity matrix built from ``df['FilePath']``) and ``algo``
    (fitted rating predictor).

    Parameters
    ----------
    user_id
        Value matched against ``df['UserId']`` and passed to ``algo.predict``.
    top_n : int, default 5
        Maximum number of content-based recommendations.
    rating_threshold : float, default 9.0
        A file is recommended collaboratively when its predicted rating is
        strictly greater than this value.

    Returns
    -------
    tuple[list, list]
        ``(content_based_recommendations, collab_based_recommendations)``,
        both lists of file paths. The content-based list is empty when the
        user has no rows in ``df``.
    """
    # Content-Based Filtering
    # Positional mask of this user's rows; cosine_sim is aligned with df by
    # row position, so the mask selects the matching similarity rows directly.
    user_mask = (df['UserId'] == user_id).to_numpy()
    content_based_recommendations = []
    if user_mask.any():
        user_files = df.loc[user_mask, 'FilePath'].unique()
        # Score every row by its average similarity to the user's rows. The
        # earlier version sorted the user's own rows by their similarity to
        # row 0 and then used positions within that slice as df positions.
        mean_similarity = cosine_sim[user_mask].mean(axis=0)
        file_scores = (
            pd.Series(mean_similarity, index=df['FilePath'].to_numpy())
            .groupby(level=0)
            .max()  # one score per distinct path; df repeats paths
            .drop(labels=user_files, errors='ignore')  # skip files the user already has
        )
        content_based_recommendations = file_scores.nlargest(top_n).index.tolist()

    # Collaborative Filtering
    collab_based_recommendations = [
        item
        for item in df['FilePath'].unique()
        if algo.predict(user_id, item).est > rating_threshold
    ]

    return content_based_recommendations, collab_based_recommendations

In [None]:
# Show both recommendation lists for one example user. This cell was present
# twice with identical content; a single copy is kept.
user_id = 1
content_based_recs, collab_based_recs = get_recommendations(user_id)
print(f"Content-Based Recommendations for User {user_id}: {content_based_recs}")
print(f"Collaborative Filtering Recommendations for User {user_id}: {collab_based_recs}")