In [9]:
import pandas as pd, json, numpy as np, random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
import json

# Parse the newline-delimited JSON file: one business record per non-blank line.
businesses = []

# Explicit UTF-8 so decoding does not depend on the platform's default locale encoding.
with open('business.json', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # ignore blank lines
        try:
            businesses.append(json.loads(line))
        except json.JSONDecodeError:
            # Report and skip malformed lines instead of aborting the whole load.
            print(f"Error decoding JSON from line: {line}")

# Keep only Philadelphia businesses; .get() tolerates records that lack a 'city' key.
data = [business for business in businesses if business.get('city') == 'Philadelphia']

# Project each record onto the four columns used downstream (missing keys become None/NaN).
columns = ['name', 'stars', 'review_count', 'categories']
df = pd.DataFrame(
    [[business.get(column) for column in columns] for business in data],
    columns=columns,
)

# Persist as a semicolon-separated file without the index column.
df.to_csv('business.csv', index=False, sep=';')
df.head()



Hello


Unnamed: 0,name,stars,review_count,categories
0,St Honore Pastries,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
1,Tuna Bar,4.0,245,"Sushi Bars, Restaurants, Japanese"
2,BAP,4.5,205,"Korean, Restaurants"
3,Bar One,4.0,65,"Cocktail Bars, Bars, Italian, Nightlife, Resta..."
4,DeSandro on Main,3.0,41,"Pizza, Restaurants, Salad, Soup"


In [46]:
# Read the semicolon-separated business table from disk into a DataFrame
csv_path = 'business.csv'
df = pd.read_csv(csv_path, sep=';')

# Broad category -> fine-grained category labels that roll up into it.
# Built once at definition time instead of on every map_categories() call.
_CATEGORY_MAPPING = {
    'Food': ['Restaurants', 'Bakeries', 'Cafes', 'Food Trucks', 'Grocery', 'Specialty Food', 'Bars', 'Breweries', 'Wineries', 'Desserts', 'Ice Cream', 'Coffee & Tea', 'Juice Bars', 'Bubble Tea', 'Smoothies', 'Sushi', 'Korean', 'Japanese', 'Chinese', 'Italian', 'Mexican', 'Thai', 'Indian', 'French', 'American'],
    'Shopping': ['Retail', 'Clothing', 'Shoes', 'Accessories', 'Electronics', 'Books', 'Music', 'Home & Garden', 'Antiques', 'Thrift Stores', 'Department Stores'],
    'Beauty & Wellness': ['Hair Salons', 'Nail Salons', 'Spas', 'Massage', 'Gyms', 'Yoga', 'Fitness', 'Cosmetics', 'Barbers'],
    'Health & Medical': ['Doctors', 'Dentists', 'Hospitals', 'Pharmacies', 'Optometrists', 'Chiropractors', 'Mental Health'],
    'Automotive': ['Car Dealers', 'Auto Repair', 'Car Wash', 'Gas Stations', 'Parking', 'Auto Parts'],
    'Home Services': ['Contractors', 'Plumbers', 'Electricians', 'HVAC', 'Landscaping', 'Cleaning', 'Moving', 'Pest Control'],
    'Professional Services': ['Lawyers', 'Accountants', 'Insurance', 'Real Estate', 'Financial Services', 'Marketing', 'Printing'],
    'Education': ['Schools', 'Colleges', 'Tutoring', 'Language Schools', 'Art Classes', 'Music Lessons'],
    'Pets': ['Pet Stores', 'Veterinarians', 'Pet Grooming', 'Pet Boarding', 'Pet Training'],
    'Arts & Entertainment': ['Museums', 'Theaters', 'Music Venues', 'Art Galleries', 'Nightlife', 'Casinos'],
    'Travel & Lodging': ['Hotels', 'Vacation Rentals', 'Travel Agencies', 'Tours', 'Transportation'],
    'Sports & Recreation': ['Gyms', 'Parks', 'Sports Clubs', 'Golf', 'Swimming', 'Bowling', 'Martial Arts'],
    'Religious Organizations': ['Churches', 'Temples', 'Mosques', 'Synagogues'],
    'Public Services': ['Libraries', 'Post Offices', 'Government Offices', 'Community Centers'],
    'Event Services': ['Wedding Planning', 'Photographers', 'Caterers', 'Event Spaces', 'DJs'],
    'Other': []  # Catch-all for categories that don't fit elsewhere
}

# Lower-cased label -> broad category, for case-insensitive lookup.
# 'Gyms' is listed under two broad categories; the later entry wins, so it
# resolves to 'Sports & Recreation'.
_REVERSE_CATEGORY_MAPPING = {
    subcat.lower(): broad
    for broad, subcats in _CATEGORY_MAPPING.items()
    for subcat in subcats
}


def map_categories(categories):
    """Collapse a comma-separated category string into broad categories.

    Args:
        categories: Comma-separated category labels, e.g. "Korean, Restaurants".
            Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        The distinct broad categories, sorted alphabetically and joined with
        ", ". Labels not present in the mapping contribute 'Other'. Input that
        is not a string (None, NaN) or contains no labels yields 'Other'.
    """
    if not isinstance(categories, str):
        return 'Other'

    # Skip empty tokens (trailing or doubled commas) so they do not add a
    # spurious 'Other' next to real categories.
    labels = [label.strip() for label in categories.split(',')]
    labels = [label for label in labels if label]
    if not labels:
        return 'Other'

    broad_categories = {
        _REVERSE_CATEGORY_MAPPING.get(label.lower(), 'Other') for label in labels
    }
    return ', '.join(sorted(broad_categories))


# Drop businesses without categories, then renumber rows 0..n-1 so DataFrame
# index labels coincide with row positions in `features` built below
# (get_user_profile looks up a features row through a DataFrame index label).
df = df.dropna(subset=['categories']).reset_index(drop=True)

# Apply the mapping to the dataset
df['broad_categories'] = df['categories'].apply(map_categories)

# Preprocess the data: treat missing numeric values as 0
df['review_count'] = df['review_count'].fillna(0)
df['stars'] = df['stars'].fillna(0)

# Create a TF-IDF vectorizer for broad categories.
# The pattern matches runs of word characters and whitespace only, so a label
# containing '&' (e.g. 'Beauty & Wellness') is tokenised as 'beauty' and
# 'wellness' rather than as a single token.
tfidf = TfidfVectorizer(token_pattern=r'\b[\w\s]+\b')
tfidf_matrix = tfidf.fit_transform(df['broad_categories'])


def _min_max_scale(series):
    """Rescale a numeric Series to [0, 1]; a constant (or empty) Series maps to all zeros."""
    low, high = series.min(), series.max()
    span = high - low
    if not span > 0:
        # Avoid 0/0 -> NaN, which cosine_similarity rejects.
        return pd.Series(0.0, index=series.index)
    return (series - low) / span


# Normalize stars and review_count
df['stars_normalized'] = _min_max_scale(df['stars'])
df['review_count_normalized'] = _min_max_scale(df['review_count'])

# Combine features: TF-IDF columns first, then the two normalized numeric columns
features = np.hstack((
    tfidf_matrix.toarray(),
    df[['stars_normalized', 'review_count_normalized']].values
))

# Calculate item similarity (dense n_items x n_items matrix)
item_similarity = cosine_similarity(features)

# In-memory per-user state, both keyed by user id:
#   user_preferences[user_id] -> set of preferred category strings
#   user_ratings[user_id]     -> dict of item name -> rating
user_preferences = dict()
user_ratings = dict()

def add_user_preference(user_id, category):
    """Record a user's interest in a category.

    Args:
        user_id: Key into the module-level `user_preferences` dict.
        category: Category string; it is passed through map_categories() and
            each resulting broad category is stored individually.

    Side effects:
        Creates the user's preference set on first use, adds the broad
        categories to it, and prints the user's current preferences.
    """
    preferred = user_preferences.setdefault(user_id, set())
    mapped_category = map_categories(category)
    # Store each broad category on its own. The joined "A, B" string is not
    # stored: it is a formatting artefact, not a category, and would only add a
    # junk entry that can never match a single vocabulary token.
    preferred.update(part for part in mapped_category.split(', ') if part)
    print(f"User {user_id} preferences: {user_preferences[user_id]}")

def rate_item(user_id, item_name, rating):
    """Store `rating` for `item_name` under `user_id` in `user_ratings`.

    The user's rating dict is created on first use; rating the same item
    again replaces the earlier value.
    """
    user_ratings.setdefault(user_id, {})[item_name] = rating

def get_user_profile(user_id):
    """Build a unit-length profile vector for a user in the item feature space.

    The vector has one entry per column of `features`. Each stored preference
    sets the matching TF-IDF column(s) to a fixed weight, and each rated item
    adds that item's feature row scaled by rating / 5.

    Args:
        user_id: Key into `user_preferences` and `user_ratings`.

    Returns:
        1-D numpy array, L2-normalised; all zeros when the user has no
        matching preferences and no rated items found in `df`.

    NOTE(review): a preference-only profile is zero in the trailing
    stars/review-count columns, so cosine similarity against `features`
    favours items whose normalised stars and review counts are smallest.
    Confirm that ranking is intended.
    """
    preference_weight = 5  # weight given to an explicitly preferred category
    rating_scale = 5       # ratings are divided by this to land on a 0-1 scale

    user_vector = np.zeros(features.shape[1])

    # Tokenise each preference with the fitted vectoriser's own analyzer so the
    # lookup uses exactly the tokens in the vocabulary (a label the token
    # pattern splits into several tokens would otherwise never match).
    analyzer = tfidf.build_analyzer()
    for category in user_preferences.get(user_id, ()):
        print(f"Checking category: {category}")
        matched = False
        for token in analyzer(category):
            column = tfidf.vocabulary_.get(token)
            if column is not None:
                user_vector[column] = preference_weight
                matched = True
        if matched:
            print(f"Added preference for {category} to user vector")

    # Incorporate user ratings. Users with ratings but no preferences are
    # handled too, instead of getting an all-zero profile.
    for item_name, rating in user_ratings.get(user_id, {}).items():
        # Use the row *position*, not the index label: `features` is a plain
        # array, and df's labels need not be 0..n-1.
        positions = np.flatnonzero((df['name'] == item_name).to_numpy())
        if positions.size == 0:
            continue  # rated item is not in the catalogue; nothing to add
        # When several rows share the name, the first one is used.
        user_vector += features[positions[0]] * (rating / rating_scale)

    norm = np.linalg.norm(user_vector)
    if norm == 0:
        return user_vector
    return user_vector / norm

def recommend_items(user_id, n=5):
    """Return the `n` best-scoring items the user has not rated yet.

    Every row of `features` is scored by cosine similarity to the user's
    profile vector. Rows whose name the user has already rated are left out,
    and ties keep their original row order.

    Returns:
        DataFrame slice with columns name, stars, review_count and
        broad_categories for the selected rows, best first.
    """
    profile = get_user_profile(user_id)
    scores = cosine_similarity([profile], features)[0]

    # Row positions of items the user has not rated (matched by name)
    already_rated = user_ratings.get(user_id, {})
    candidates = []
    for position, business_name in enumerate(df['name']):
        if business_name in already_rated:
            continue
        candidates.append(position)

    # Stable descending sort, then keep the first n positions
    candidates.sort(key=lambda position: scores[position], reverse=True)
    best = candidates[:n]

    shown_columns = ['name', 'stars', 'review_count', 'broad_categories']
    return df.iloc[best][shown_columns]

# Example usage: give one user three category preferences, then compare the
# recommendations before and after that user rates a single business.
user_id = 1
for preferred_category in ('Lawyers', 'Accountants', 'Insurance'):
    add_user_preference(user_id, preferred_category)

# Profile built from preferences only
user_profile = get_user_profile(user_id)
print(f"User profile for user {user_id}: {user_profile}")


print("Initial recommendations:")
print(recommend_items(user_id))

# The user rates one item; the profile and recommendations are recomputed
rate_item(user_id, 'Bar One', 4.5)

user_profile = get_user_profile(user_id)
print(f"User profile for user {user_id}: {user_profile}")

print("\nRecommendations after rating:")
print(recommend_items(user_id))


User 1 preferences: {'Professional Services'}
User 1 preferences: {'Professional Services'}
User 1 preferences: {'Professional Services'}
Checking category: Professional Services
Added preference for Professional Services to user vector
User profile for user 1: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Initial recommendations:
Checking category: Professional Services
Added preference for Professional Services to user vector
                                    name  stars  review_count  \
3291                               Cigna    1.0             5   
11456                     Liberty Mutual    1.0             5   
4190   The Philadelphia Contributionship    1.0             7   
652                  Woodward Properties    1.0             5   
1621                  AA Auto Protection    1.0             5   

                   broad_categories  
3291          Professional Services  
11456         Professional Services  
4190          Professional Services  
