Web scraping to build the data set

In [None]:
# Mount Google Drive when running inside Google Colab. The import is guarded so
# that the notebook still runs top to bottom in an environment where the
# `google.colab` package is not installed (for example a local Jupyter kernel).
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

# Example URL (replace with a different URL you have permission to scrape)
url = 'https://www.amazon.eg/'  # Replace with the actual URL of a permitted site

# Define headers with a user-agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

# Seconds to wait for the server. requests.get() has no default timeout, so
# without this an unresponsive host could hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30
# Number of simulated users that rate every scraped product.
N_SIMULATED_USERS = 50
# Fixed seed so that re-running the cell regenerates the same simulated ratings.
RANDOM_SEED = 42

# Send a request to the website
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Initialize lists to store data (one entry per user/product pair)
    user_ids = []
    product_ids = []
    ratings = []

    # Find the relevant sections in the HTML
    # Modify the selectors based on the website's structure
    products = soup.find_all('div', class_='product')  # Example class for product container

    rng = np.random.default_rng(RANDOM_SEED)
    seen_product_ids = set()

    for product in products:
        title_tag = product.find('h2', class_='product-title')  # Example for product title
        if title_tag is None:
            # Container without a title element: nothing to use as an identifier.
            continue

        product_name = title_tag.text.strip()
        product_id = product_name  # Use the product name as the product ID for simplicity

        # Skip blank and repeated titles. A repeated title would yield duplicate
        # (User_ID, Product_ID) rows, which DataFrame.pivot cannot reshape.
        if not product_id or product_id in seen_product_ids:
            continue
        seen_product_ids.add(product_id)

        # Simulate user ratings: the page is only used for product names, the
        # ratings themselves are random integers from 1 to 5.
        for user_index in range(1, N_SIMULATED_USERS + 1):
            rating = int(rng.integers(1, 6))  # upper bound is exclusive

            user_ids.append(f'User_{user_index}')
            product_ids.append(product_id)
            ratings.append(rating)

    if not product_ids:
        print("Warning: no products matched the selectors; the saved file contains no ratings.")

    # Create a DataFrame from the scraped data
    df_ratings = pd.DataFrame({
        'User_ID': user_ids,
        'Product_ID': product_ids,
        'Rating': ratings
    })

    # Save the DataFrame to a CSV file
    df_ratings.to_csv('scraped_ratings.csv', index=False)
    print("Scraped data saved as 'scraped_ratings.csv'.")

else:
    print(f"Failed to retrieve data: {response.status_code}")

Scraped data saved as 'scraped_ratings.csv'.


Item-based matrix

In [None]:

import pandas as pd

# Read the long-format ratings file (columns User_ID, Product_ID, Rating)
ratings_csv_path = 'scraped_ratings.csv'  # Replace with your actual CSV file path
df_ratings = pd.read_csv(ratings_csv_path)

# Preview the first rows so the column layout can be checked by eye
print("Dataset:")
print(df_ratings.head())

# Reshape to one row per product and one column per user.
# User/product pairs that have no rating are filled with 0.
ratings_wide = df_ratings.pivot(index='Product_ID', columns='User_ID', values='Rating')
item_based_matrix = ratings_wide.fillna(0)

# Show the reshaped matrix
print("\nItem-Based Matrix:")
print(item_based_matrix)

# Write the matrix to disk (the Product_ID index is kept as the first CSV column)
matrix_csv_path = 'item_based_matrix.csv'
item_based_matrix.to_csv(matrix_csv_path)
print("\nItem-Based Matrix saved as 'item_based_matrix.csv'")

Dataset:
  User_ID            Product_ID  Rating
0  User_1        Wireless Mouse       5
1  User_1  Bluetooth Headphones       4
2  User_1         USB-C Charger       5
3  User_1            Smartwatch       4
4  User_1                 4K TV       1

Item-Based Matrix:
User_ID               User_1  User_10  User_11  User_12  User_13  User_14  \
Product_ID                                                                  
4K TV                    1.0      5.0      4.0      3.0      4.0      2.0   
Action Camera            2.0      4.0      5.0      4.0      4.0      0.0   
Air Fryer                0.0      0.0      3.0      1.0      1.0      2.0   
Bluetooth Headphones     4.0      2.0      5.0      0.0      1.0      5.0   
Coffee Maker             1.0      3.0      3.0      4.0      2.0      0.0   
Electric Kettle          2.0      2.0      3.0      2.0      0.0      3.0   
Fitness Tracker          2.0      3.0      5.0      4.0      5.0      0.0   
Gaming Laptop            4.0      0.0 

Average rating

In [None]:
import pandas as pd

# Read the product-by-user matrix; the first CSV column supplies the row labels
# (replace 'item_based_matrix.csv' with your file path)
df_ratings = pd.read_csv('item_based_matrix.csv', index_col=0)


def _mean_of_positive_entries(row):
    """Return the mean of the strictly positive values in one matrix row."""
    return row[row > 0].mean()


# One average per product (row); entries that are not > 0 are left out of the mean
average_ratings = df_ratings.apply(_mean_of_positive_entries, axis=1)

# Show the per-product averages
print("Average Ratings for Each Product:")
print(average_ratings)

Average Ratings for Each Product:
Product_ID
4K TV                   2.861111
Action Camera           2.975000
Air Fryer               2.800000
Bluetooth Headphones    2.977273
Coffee Maker            3.025000
Electric Kettle         2.861111
Fitness Tracker         3.083333
Gaming Laptop           3.108108
Gaming Monitor          3.388889
Home Router             3.097561
Mechanical Keyboard     2.700000
Office Chair            3.289474
Portable SSD            3.295455
Robot Vacuum            3.027778
Smart Light Bulbs       2.886364
Smart Speaker           2.295455
Smartwatch              2.690476
USB-C Charger           2.976190
VR Headset              2.522727
Wireless Mouse          2.974359
dtype: float64


Calculate Cosine Similarity and Pearson Correlation

In [3]:
import pandas as pd
import numpy as np

# Read the item-by-user rating matrix; the first CSV column is used as the row index
# (point matrix_csv_path at the actual file location if it differs)
matrix_csv_path = 'item_based_matrix.csv'
df_ratings = pd.read_csv(matrix_csv_path, index_col=0)

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two equally long numeric vectors.

    The result is the dot product divided by the product of the two Euclidean
    norms. When either norm is zero the function returns 0 instead of dividing
    by zero.
    """
    inner = np.dot(vec1, vec2)
    norms = [np.sqrt(np.dot(vec, vec)) for vec in (vec1, vec2)]
    # A zero-length vector has no direction, so the similarity is defined as 0.
    if any(norm == 0 for norm in norms):
        return 0
    return inner / (norms[0] * norms[1])

# Function to calculate Pearson correlation
def pearson_correlation(vec1, vec2):
    """Return the Pearson correlation of two rating vectors over co-rated positions.

    Only positions where both vectors hold a value greater than 0 are used;
    each vector is centred on its own mean over those positions.

    Parameters
    ----------
    vec1, vec2 : array-like of numbers
        Equally long rating vectors. Lists, tuples, NumPy arrays and pandas
        Series are accepted; they are converted to float arrays first.

    Returns
    -------
    float or int
        The correlation over the co-rated positions, or 0 when there are no
        co-rated positions or when either centred vector has zero variance
        (which includes the case of a single co-rated position).
    """
    # Convert up front so the element-wise comparisons below also work for
    # plain Python sequences, which do not support `> 0` themselves.
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)

    mask = (vec1 > 0) & (vec2 > 0)  # Only consider positions rated in both vectors
    if not mask.any():
        return 0  # No common ratings

    adjusted_vec1 = vec1[mask] - np.mean(vec1[mask])
    adjusted_vec2 = vec2[mask] - np.mean(vec2[mask])

    numerator = np.sum(adjusted_vec1 * adjusted_vec2)
    denominator = np.sqrt(np.sum(adjusted_vec1 ** 2)) * np.sqrt(np.sum(adjusted_vec2 ** 2))
    if denominator == 0:
        return 0  # At least one side is constant over the common positions
    return numerator / denominator

# Calculate similarity matrices
# The values are collected in float arrays that start out as NaN, so the
# resulting DataFrames have a numeric dtype (not object) and keep NaN on the
# diagonal: an item is never compared with itself.
item_ids = df_ratings.index
rating_rows = df_ratings.to_numpy()  # one row of ratings per item, extracted once
n_items = len(item_ids)

cosine_values = np.full((n_items, n_items), np.nan)
pearson_values = np.full((n_items, n_items), np.nan)

# Fill the similarity matrices
# Both measures give the same value for (a, b) and (b, a), so each unordered
# pair is computed once and written to both positions.
for row_pos in range(n_items):
    for col_pos in range(row_pos + 1, n_items):
        cosine_value = cosine_similarity(rating_rows[row_pos], rating_rows[col_pos])
        pearson_value = pearson_correlation(rating_rows[row_pos], rating_rows[col_pos])

        cosine_values[row_pos, col_pos] = cosine_value
        cosine_values[col_pos, row_pos] = cosine_value
        pearson_values[row_pos, col_pos] = pearson_value
        pearson_values[col_pos, row_pos] = pearson_value

cosine_similarity_matrix = pd.DataFrame(cosine_values, index=item_ids, columns=item_ids)
pearson_correlation_matrix = pd.DataFrame(pearson_values, index=item_ids, columns=item_ids)

print("Cosine Similarity Matrix:")
print(cosine_similarity_matrix)

print("\nPearson Correlation Matrix:")
print(pearson_correlation_matrix)

Cosine Similarity Matrix:
Product_ID               4K TV Action Camera Air Fryer Bluetooth Headphones  \
Product_ID                                                                    
4K TV                      NaN      0.590851  0.585802             0.528678   
Action Camera         0.590851           NaN  0.641739             0.658593   
Air Fryer             0.585802      0.641739       NaN             0.712276   
Bluetooth Headphones  0.528678      0.658593  0.712276                  NaN   
Coffee Maker          0.679588      0.677353   0.68461             0.692736   
Electric Kettle        0.61328      0.549826  0.639021             0.633526   
Fitness Tracker       0.589379       0.73315  0.658812             0.697892   
Gaming Laptop         0.508084       0.57731  0.586601             0.705414   
Gaming Monitor        0.571951      0.646099  0.615794             0.650577   
Home Router           0.559498      0.643226  0.652149             0.708595   
Mechanical Keyboard   0.56

Rating Prediction and Top-N Recommendations

In [4]:
import pandas as pd
import numpy as np

# Reload the item-by-user rating matrix; the first CSV column is used as the row index
matrix_csv_path = 'item_based_matrix.csv'
df_ratings = pd.read_csv(matrix_csv_path, index_col=0)

# cosine_similarity_matrix and pearson_correlation_matrix are not rebuilt in this cell;
# they must already exist in the kernel from the similarity cell.
# The example below uses cosine similarity; the Pearson matrix can be passed the same way.

def predict_rating(user_id, item_id, similarity_matrix, ratings_matrix):
    """
    Predicts the rating for a specific user-item pair based on item-based collaborative filtering.

    The prediction is the similarity-weighted average of the user's existing
    ratings: sum(sim * rating) / sum(|sim|), taken over the neighbour items.

    Parameters
    ----------
    user_id : label
        Column label of the user in ``ratings_matrix``.
    item_id : label
        Row label of the target item in ``similarity_matrix``.
    similarity_matrix : pandas.DataFrame
        Item-by-item similarities, indexed by item label on both axes.
        Entries may be NaN (for example on the diagonal).
    ratings_matrix : pandas.DataFrame
        Items as rows, users as columns; a value that is not > 0 means "not rated".

    Returns
    -------
    float
        The predicted rating, or 0.0 when the user has no rated neighbour with
        a defined, non-zero similarity to the target item.
    """
    # Get the ratings for the specific user
    user_ratings = ratings_matrix.loc[:, user_id]

    # Get similarity scores for the target item with other items; coerce to a
    # numeric dtype in case the matrix was built with object dtype.
    similarities = pd.to_numeric(similarity_matrix.loc[item_id], errors='coerce')

    # Neighbours are the items the user has rated, excluding the target itself.
    # Without this exclusion, a NaN self-similarity would turn the whole
    # prediction into NaN whenever the user has already rated the target item.
    rated_items = user_ratings[user_ratings > 0].index
    neighbour_items = rated_items.drop([item_id], errors='ignore')

    # Keep only neighbours whose similarity to the target is defined
    weights = similarities.reindex(neighbour_items).dropna()
    if weights.empty:
        return 0.0  # No usable neighbours

    denominator = weights.abs().sum()
    if denominator == 0:
        return 0.0  # Every neighbour has zero similarity

    # Calculate the weighted sum of ratings
    numerator = (weights * user_ratings.loc[weights.index]).sum()
    return numerator / denominator

def get_top_n_recommendations(user_id, similarity_matrix, ratings_matrix, n=5):
    """
    Return up to ``n`` item labels for ``user_id``, best predicted rating first.

    Candidates are the items whose entry in the user's column of
    ``ratings_matrix`` equals 0. Each candidate is scored with
    ``predict_rating`` and the labels of the highest-scoring ones are returned
    as a list.
    """
    # Items this user has not rated yet
    user_column = ratings_matrix.loc[:, user_id]
    candidate_items = user_column[user_column == 0].index

    # Score every candidate
    scores_by_item = {}
    for candidate in candidate_items:
        scores_by_item[candidate] = predict_rating(
            user_id, candidate, similarity_matrix, ratings_matrix
        )

    # Highest score first; ties keep their original order
    ranked = sorted(scores_by_item.items(), key=lambda pair: pair[1], reverse=True)
    return [item for item, _score in ranked[:n]]

# Example: the five best cosine-similarity recommendations for User_1
top_n_recommendations_user_1 = get_top_n_recommendations(
    user_id='User_1',
    similarity_matrix=cosine_similarity_matrix,
    ratings_matrix=df_ratings,
    n=5,
)
print("Top 5 Recommendations for User_1:", top_n_recommendations_user_1)

Top 5 Recommendations for User_1: ['Air Fryer']


Comparing recommendations from cosine similarity and Pearson correlation

In [5]:
# Top-5 list for User_1 from the cosine similarity matrix
top_n_cosine = get_top_n_recommendations(
    user_id='User_1',
    similarity_matrix=cosine_similarity_matrix,
    ratings_matrix=df_ratings,
    n=5,
)
print("Top 5 Recommendations for User_1 using Cosine Similarity:", top_n_cosine)

# Top-5 list for the same user from the Pearson correlation matrix
top_n_pearson = get_top_n_recommendations(
    user_id='User_1',
    similarity_matrix=pearson_correlation_matrix,
    ratings_matrix=df_ratings,
    n=5,
)
print("Top 5 Recommendations for User_1 using Pearson Correlation:", top_n_pearson)

# Overlap and differences between the two lists, as sets (order is ignored)
cosine_items = set(top_n_cosine)
pearson_items = set(top_n_pearson)
common_recommendations = cosine_items.intersection(top_n_pearson)
unique_to_cosine = cosine_items.difference(pearson_items)
unique_to_pearson = pearson_items.difference(cosine_items)

print("\nCommon Recommendations:", common_recommendations)
print("Unique to Cosine Similarity:", unique_to_cosine)
print("Unique to Pearson Correlation:", unique_to_pearson)

Top 5 Recommendations for User_1 using Cosine Similarity: ['Air Fryer']
Top 5 Recommendations for User_1 using Pearson Correlation: ['Air Fryer']

Common Recommendations: {'Air Fryer'}
Unique to Cosine Similarity: set()
Unique to Pearson Correlation: set()
