In [2]:
import pandas as pd, json, numpy as np, random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime


In [6]:
spots = json.load(open('../datasets/data.json'))

spots = spots['businesses']

spots[0]
#keys to be kept in the recommendation system dataset:
#id, categories, review_count, rating, price, coordinates
#The rest of the keys are to be stored in a separate dataset for the user's knowledge
#keep id in both datasets to link them together
initial_weights = []
spot_details = []
for spot in spots:
    rec={}
    use={}
    for key in spot:
        if key == 'id':
            rec[key]=spot[key]
            use[key]=spot[key]
        else:
            if key in ['categories','review_count','rating','price','coordinates']:
                rec[key]=spot[key]
            else:
                use[key]=spot[key]
        
    initial_weights.append(rec)
    spot_details.append(use)

#Adding keys with made up random values to test drive integrating features into the recommender system.
#New Keys: 
# time_viewing [How much time the user spent looking at the place] (float)
# pressed_details [If the user pressed the details button] (bool)
# pressed_share [If they pressed share button] (bool)
# pressed_save [If they saved the place] (bool)

#user_preferences will be a two layer nesteddictionary with the user id as the key and a dictionary of the user's preferences as the value, 
# the user preferences is a nested dict where keys are spot ids, and values are spot specific preferences below is a sample of what the 
#user_preferences dictionary will look like
user_preferences = {
    1:{#uid
        'location_specific':{
            28384:{#spot_id
                'time_viewing':5.0,'pressed_share':False,'pressed_save':True,
                'rating':5.0
            },
            28385:{#spot_id
                'swiped_left':False,'swiped_right':True,'time_viewing':3.0,'pressed_share':False,'pressed_save':True,
                'rating':4.0
            },
        },
        'general_preferences':{
            'price':'$$',
            'categories':['restaurant','bar'],
            'coordinates':(39.9526,75.1652)
        }
    }
}
print("Spot Details:")
display(spot_details[0])
print("Initial Weights:")
display(initial_weights[0])
print("User Preferences:")
display(user_preferences)

#unique categories
# categories = set()
# for spot in initial_weights:
#     for category in spot['categories']:
#         for key,value in category.items():
#             if key == 'title':
#                 categories.add(value)
# list(categories)
# initial_weights[0]
# temp={}
# for entry in initial_weights:
#     temp[entry['id']]=entry
#     del temp[entry['id']]['id']
# initial_weights=temp
# temp={}
# for entry in spot_details:
#     temp[entry['id']]=entry
#     del temp[entry['id']]['id']
# spot_details=temp
# json.dump(initial_weights,open('Updating_datasets/initial_weights.json','w'))
# json.dump(spot_details,open('Updating_datasets/spot_details.json','w'))

Spot Details:


{'id': 'zj8Lq1T8KIC5zwFief15jg',
 'alias': 'prince-street-pizza-new-york-2',
 'name': 'Prince Street Pizza',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/Jo9jBP5y6G_bG_g3H31fiw/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/prince-street-pizza-new-york-2?adjust_creative=g2Uocg3Kx8gT4IQM5axLiQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=g2Uocg3Kx8gT4IQM5axLiQ',
 'transactions': ['delivery', 'pickup'],
 'location': {'address1': '27 Prince St',
  'address2': None,
  'address3': '',
  'city': 'New York',
  'zip_code': '10012',
  'country': 'US',
  'state': 'NY',
  'display_address': ['27 Prince St', 'New York, NY 10012']},
 'phone': '+12129664100',
 'display_phone': '(212) 966-4100',
 'distance': 1950.7368390479296,
 'attributes': {'business_temp_closed': None,
  'menu_url': 'https://princestreetpizzanyc.com/menu',
  'open24_hours': None,
  'waitlist_reservation': None}}

Initial Weights:


{'id': 'zj8Lq1T8KIC5zwFief15jg',
 'review_count': 5291,
 'categories': [{'alias': 'pizza', 'title': 'Pizza'},
  {'alias': 'italian', 'title': 'Italian'}],
 'rating': 4.3,
 'coordinates': {'latitude': 40.72308755605564,
  'longitude': -73.99453001177575},
 'price': '$'}

User Preferences:


{1: {'location_specific': {28384: {'time_viewing': 5.0,
    'pressed_share': False,
    'pressed_save': True,
    'rating': 5.0},
   28385: {'swiped_left': False,
    'swiped_right': True,
    'time_viewing': 3.0,
    'pressed_share': False,
    'pressed_save': True,
    'rating': 4.0}},
  'general_preferences': {'price': '$$',
   'categories': ['restaurant', 'bar'],
   'coordinates': (39.9526, 75.1652)}}}

In [9]:
category_mapping = {
    'American': ['American', 'New American', 'Southern', 'Soul Food', 'Cajun/Creole', 'Tex-Mex'],
    'Asian': ['Chinese', 'Japanese', 'Korean', 'Thai', 'Vietnamese', 'Indian', 'Pakistani', 'Bangladeshi', 'Taiwanese', 'Filipino', 'Malaysian', 'Indonesian', 'Singaporean', 'Burmese', 'Cambodian', 'Laotian', 'Mongolian', 'Nepalese', 'Sri Lankan', 'Asian Fusion'],
    'European': ['Italian', 'French', 'Spanish', 'German', 'Greek', 'British', 'Irish', 'Scottish', 'Polish', 'Russian', 'Ukrainian', 'Hungarian', 'Czech', 'Austrian', 'Belgian', 'Dutch', 'Swiss', 'Scandinavian', 'Portuguese'],
    'Latin American': ['Mexican', 'Brazilian', 'Peruvian', 'Argentine', 'Colombian', 'Venezuelan', 'Cuban', 'Puerto Rican', 'Dominican', 'Salvadoran', 'Honduran', 'Nicaraguan', 'Guatemalan', 'Ecuadorian', 'Bolivian', 'Chilean'],
    'Middle Eastern': ['Lebanese', 'Turkish', 'Persian/Iranian', 'Israeli', 'Moroccan', 'Egyptian', 'Syrian', 'Armenian', 'Afghan', 'Iraqi', 'Uzbek', 'Georgian'],
    'African': ['Ethiopian', 'Nigerian', 'Ghanaian', 'Senegalese', 'South African', 'Eritrean', 'Somali', 'Kenyan', 'Tanzanian', 'Ugandan'],
    'Seafood': ['Seafood', 'Sushi Bars', 'Fish & Chips', 'Poke'],
    'Fast Food': ['Fast Food', 'Burgers', 'Pizza', 'Sandwiches', 'Hot Dogs', 'Chicken Wings'],
    'Vegetarian and Vegan': ['Vegetarian', 'Vegan', 'Raw Food'],
    'Breakfast and Brunch': ['Breakfast & Brunch', 'Pancakes', 'Waffles', 'Bagels', 'Donuts'],
    'Bakeries and Desserts': ['Bakeries', 'Desserts', 'Ice Cream & Frozen Yogurt', 'Cupcakes', 'Patisserie/Cake Shop', 'Gelato'],
    'Cafes and Coffee Shops': ['Cafes', 'Coffee & Tea', 'Bubble Tea'],
    'Bars and Pubs': ['Bars', 'Pubs', 'Sports Bars', 'Wine Bars', 'Beer Gardens', 'Cocktail Bars', 'Dive Bars', 'Hookah Bars'],
    'Specialty Food': ['Cheese Shops', 'Butcher', 'Farmers Market', 'Specialty Food', 'Organic Stores', 'Health Markets'],
    'Food Trucks and Stands': ['Food Trucks', 'Food Stands', 'Street Vendors'],
    'Grocery': ['Grocery', 'International Grocery', 'Convenience Stores'],
    'Nightlife': ['Nightlife', 'Dance Clubs', 'Karaoke', 'Comedy Clubs', 'Jazz & Blues'],
    'Arts and Entertainment': ['Museums', 'Art Galleries', 'Performing Arts', 'Music Venues', 'Theaters', 'Cinema'],
    'Outdoor Activities': ['Parks', 'Beaches', 'Hiking', 'Botanical Gardens', 'Playgrounds', 'Dog Parks'],
    'Fitness and Sports': ['Gyms', 'Yoga', 'Martial Arts', 'Swimming Pools', 'Tennis', 'Basketball Courts', 'Soccer'],
    'Shopping': ['Shopping Centers', 'Clothing', 'Shoes', 'Jewelry', 'Books', 'Electronics', 'Home & Garden'],
    'Beauty and Spas': ['Hair Salons', 'Nail Salons', 'Day Spas', 'Massage'],
    'Hotels and Accommodation': ['Hotels', 'Hostels', 'Bed & Breakfast'],
    'Event Planning and Services': ['Wedding Planning', 'Party & Event Planning', 'Caterers', 'Photographers'],
    'Automotive': ['Car Dealers', 'Auto Repair', 'Car Wash', 'Gas Stations'],
    'Professional Services': ['Lawyers', 'Accountants', 'Real Estate', 'Insurance'],
    'Education': ['Schools', 'Colleges', 'Tutoring', 'Cooking Classes', 'Art Schools'],
    'Pets': ['Pet Stores', 'Veterinarians', 'Pet Groomers', 'Dog Walkers'],
    'Religious Organizations': ['Churches', 'Mosques', 'Synagogues', 'Temples'],
    'Other': []  # Catch-all for categories that don't fit elsewhere
}

# Create a reverse mapping for easy lookup
reverse_category_mapping = {sub_cat: main_cat for main_cat, sub_cats in category_mapping.items() for sub_cat in sub_cats}


# Load the data
with open('../datasets/initial_weights.json', 'r') as file:
    data = json.load(file)

index=[]
vals = []
for key,value in data.items():
    index.append(key)
    vals.append([val for val in value.values()])
    
initial_weights = pd.DataFrame(vals,columns=[key for key in data[index[0]].keys()],index=index)

with open('../datasets/spot_details.json', 'r') as file:
    spot_details = json.load(file)
index,vals = [],[]
# print(spot_details)
for key,value in spot_details.items():
    index.append(key)
    vals.append([val for val in value.values()])
# print(len(vals),len(index))

spot_details = pd.DataFrame(vals,columns=[key for key in spot_details[index[0]].keys()],index=index)
spot_details = spot_details.copy().reset_index()
spot_details.rename(columns={'index': 'id'}, inplace=True)
user_preferences = {}

def preprocess_data(initial_weights):
    df = initial_weights.copy().reset_index()
    df.rename(columns={'index': 'id'}, inplace=True)
    
    # Extract categories and map to general categories
    df['categories'] = df['categories'].apply(lambda x: [reverse_category_mapping.get(cat['title'], 'Other') for cat in x])
    df['categories'] = df['categories'].apply(lambda x: ', '.join(set(x)) if x else 'Other')
    
    # Handle missing values
    df['review_count'] = df['review_count'].fillna(0)
    df['rating'] = df['rating'].fillna(0)
    df['price'] = df['price'].fillna('$')
    df['latitude'] = df['coordinates'].apply(lambda x: x['latitude'] if x and 'latitude' in x else 0)
    df['longitude'] = df['coordinates'].apply(lambda x: x['longitude'] if x and 'longitude' in x else 0)
    df = df.drop(columns=['coordinates'])
    
    return df



df = preprocess_data(initial_weights)
print(df)


                           id  review_count  \
0      zj8Lq1T8KIC5zwFief15jg          5291   
1      j1S3NUrkB3BVT49n_e76NQ          4805   
2      lynQoI3w_pzYfHGeuUU-Qg          1443   
3      vk7W3_sQwr7eZbRFsXv6rw          3374   
4      X8ZS-dgiMIJvhwf9SaDnjw          2159   
...                       ...           ...   
12160  UGBwmvJ47rDhDGxdUSBZdg            32   
12161  FxswnvW4Fat6dFYcQNaE7Q            75   
12162  --V7PYYPxSgXU3EmFkVDXQ            28   
12163  _BsM67A1_JqAmgXA3SznKg           317   
12164  QERbo2vqq4T9GyIFtb2Ccg           392   

                                         categories  rating price   latitude  \
0                               European, Fast Food     4.3     $  40.723088   
1      Cafes and Coffee Shops, Breakfast and Brunch     4.5     $  40.752268   
2                                             Asian     4.5     $  40.717411   
3                      Asian, Bakeries and Desserts     4.3     $  40.717890   
4                                  

In [20]:
df.shape

(12165, 7)

In [12]:
#one time thing, should be saved into the database as read only, unless we expand with more datasets
def create_feature_matrix(df):
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['categories'])
    
    scaler = MinMaxScaler()
    numerical_features = scaler.fit_transform(df[['review_count', 'rating', 'latitude', 'longitude']].fillna(0))
    
    price_dummies = pd.get_dummies(df['price'], prefix='price').fillna(0)
    
    features = np.hstack((tfidf_matrix.toarray(), numerical_features, price_dummies.values))
    
    return features, tfidf, scaler, tfidf_matrix

features, tfidf, coordinate_scaler,tfidf_matrix = create_feature_matrix(df)
# item_similarity = cosine_similarity(features)


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess each row to replace commas with a unique separator (like '||') to treat the entire category as one token
df['processed_categories'] = df['categories'].apply(lambda x: x.replace(', ', '||'))

# Initialize TfidfVectorizer with a custom tokenizer that splits on '||' instead of spaces
t = TfidfVectorizer(tokenizer=lambda x: x.split('||'))

# Fit and transform the processed categories
tfidf_matrix = t.fit_transform(df['processed_categories'])

# You can now access the vocabulary and use it as needed
print(list(t.vocabulary_.keys())[:5])

len(t.vocabulary_)

['european', 'fast food', 'cafes and coffee shops', 'breakfast and brunch', 'asian']




29

In [16]:
####
#This just reads from the database, no need for an api for this
###
# Function to get user profile
def get_user_profile(user_id, tfidf, coordinate_scaler):
    if user_id not in user_preferences:
        return np.zeros(features.shape[1])
    
    user_vector = np.zeros(features.shape[1])
    user_data = user_preferences[user_id]
    
    general_prefs = user_data['general_preferences']
    for category in general_prefs['categories']:
        mapped_category = reverse_category_mapping.get(category, category)
        if mapped_category in tfidf.vocabulary_:
            user_vector[tfidf.vocabulary_[mapped_category]] = 1
    
    price_index = features.shape[1] - 4 + len(general_prefs['price'])
    user_vector[price_index] = 1
    
    user_coords = np.array(general_prefs['coordinates']).reshape(1, -1)
    # Create a dummy array with 4 features to match the scaler's expected input
    dummy_coords = np.zeros((1, 4))
    dummy_coords[0, 2:] = user_coords  # Assuming latitude and longitude are the last two features
    scaled_coords = coordinate_scaler.transform(dummy_coords)
    user_vector[-6:-4] = scaled_coords[0, 2:] 

    for spot_id, spot_data in user_data['location_specific'].items():
        spot_index = df[df['id'] == spot_id].index
        if len(spot_index) > 0:
            spot_index = spot_index[0]
            
            if not spot_data.get('time_viewing', True):
                # User pressed details, incorporate positive interactions
                interaction_weight = 1.0
                
                if spot_data.get('pressed_share', False):
                    interaction_weight += 0.3
                if spot_data.get('pressed_save', False):
                    interaction_weight += 0.3
                
                # Incorporate viewing time
                max_viewing_time = 60
                viewing_time = min(spot_data.get('time_viewing', 0), max_viewing_time)
                time_factor = viewing_time / max_viewing_time
                interaction_weight *= (1 + time_factor)
                
                # Incorporate rating with penalty for low ratings
                rating = spot_data.get('rating', 2.5)
                if rating < 3:
                    # Apply penalty that increases as rating approaches 1
                    penalty = 1 - (rating - 1) / 2  # This will be 1 at rating 1, and 0 at rating 3
                    interaction_weight *= (1 - penalty * 0.5)  # Adjust the 0.5 to control penalty strength
                
                # Add to user vector
                user_vector += features[spot_index] * interaction_weight
            
            else:
                # User didn't press details, subtract a fraction of the feature vector
                user_vector -= features[spot_index] * 0.2
    
    norm = np.linalg.norm(user_vector)
    if norm > 0:
        user_vector /= norm
    
    return user_vector

####
#create an api for this, read from user and write to database
###
def add_new_user(user_id, general_preferences,name,pword,age):
    if user_id in user_preferences:
        raise ValueError("User ID already exists")
    
    user_preferences[user_id] = {
        'general_preferences': general_preferences,
        'location_specific': {},
        'last_active': datetime.now()
    }
    user_preferences[user_id]['friends'] = []
    user_preferences[user_id]['name'] = name
    user_preferences[user_id]['password'] = pword
    user_preferences[user_id]['age'] = age
    print(f"New user {user_id} added successfully")

####
#create an api for this, read from user and write to database
###
def update_user_preferences(user_id, new_preferences):
    if user_id not in user_preferences:
        raise ValueError("User ID does not exist")
    
    user_preferences[user_id]['general_preferences'].update(new_preferences)
    user_preferences[user_id]['last_active'] = datetime.now()
    print(f"Preferences updated for user {user_id}")

####
#create an api for this, read from user and write to database
###
def record_spot_interaction(user_id, spot_id, interaction):
    if user_id not in user_preferences:
        raise ValueError("User ID does not exist")
    
    if spot_id not in user_preferences[user_id]['location_specific']:
        user_preferences[user_id]['location_specific'][spot_id] = {}
    
    user_preferences[user_id]['location_specific'][spot_id].update(interaction)
    user_preferences[user_id]['last_active'] = datetime.now()
    print(f"Interaction recorded for user {user_id} with spot {spot_id}")

# def get_user_stats(user_id):
#     if user_id not in user_preferences:
#         raise ValueError("User ID does not exist")
    
#     user_data = user_preferences[user_id]
#     total_interactions = len(user_data['location_specific'])
#     likes = sum(1 for spot in user_data['location_specific'].values() if spot.get('pressed_details', False))
    
#     return {
#         'total_interactions': total_interactions,
#         'likes': likes,
#         'last_active': user_data['last_active']
#     }


####
#create an api for this, read from user and write to database
###
def update_user_coordinates(user_id, new_coordinates):
    if user_id not in user_preferences:
        raise ValueError("User ID does not exist")
    
    user_preferences[user_id]['general_preferences']['coordinates'] = new_coordinates
    user_preferences[user_id]['last_active'] = datetime.now()
    print(f"Coordinates updated for user {user_id}")

####
#internal function, no api needed, just reads from database
###
def get_next_items(user_id, n=10):
    # Ensure n is even
    n = n if n % 2 == 0 else n + 1
    
    # Get n/2 recommendations based on user profile
    user_based_recommendations = user_based_recommend(user_id, n // 2)
    
    # Get n/2 recommendations based on item similarity to the user-based recommendations
    item_based_recommendations = item_based_recommend(user_based_recommendations, n // 2)
    
    # Combine and shuffle the recommendations
    all_recommendations = pd.concat([user_based_recommendations, item_based_recommendations])
    all_recommendations.sample(n=len(all_recommendations))
    return all_recommendations

####
#internal function, no api needed, just reads from database
### 
def user_based_recommend(user_id, n):
    if user_id not in user_preferences:
        # New user: use a fallback method (e.g., popular items)
        return popular_items_recommend(n)
    
    user_profile = get_user_profile(user_id, tfidf, coordinate_scaler)
    scores = cosine_similarity([user_profile], features)[0]
    
    top_indices = scores.argsort()[-n:][::-1]
    recommended_ids = df.iloc[top_indices]['id'].tolist()
    
    return spot_details[spot_details['id'].isin(recommended_ids)][['id', 'name', 'image_url', 'phone']]

####
#internal function, no api needed, just reads from database
###
def item_based_recommend(base_items, n):
    base_indices = df[df['id'].isin(base_items['id'])].index
    
    similar_items = set()
    for idx in base_indices:
        # Get top similar items for each base item
        similar_indices = item_similarity[idx].argsort()[-n:][::-1]
        similar_items.update(df.iloc[similar_indices]['id'].tolist())
    
    # Remove base items from similar items
    similar_items = list(similar_items - set(base_items['id']))
    
    # If we don't have enough similar items, pad with popular items
    if len(similar_items) < n:
        popular = popular_items_recommend(n - len(similar_items))
        similar_items.extend(popular['id'].tolist())
    
    return spot_details[spot_details['id'].isin(similar_items[:n])][['id', 'name', 'image_url', 'phone']]

####
#internal function, no api needed, just reads from database
###
def popular_items_recommend(n):
    # Recommend based on a combination of rating and review count
    scores = df['review_count'] * df['rating'].fillna(0)
    top_indices = scores.argsort()[-n:][::-1]
    recommended_ids = df.iloc[top_indices]['id'].tolist()
    return spot_details[spot_details['id'].isin(recommended_ids)][['id', 'name', 'image_url', 'phone']]

####
#create an api for this, read from database and display in ui (put to front end)
###
# Update the get_next_spot function to use get_next_items
def get_next_spot(user_id):
    recommendations = get_next_items(user_id, n=10)  # Get 10 recommendations
    seen_spots = set(user_preferences[user_id]['location_specific'].keys())

    #  [] [0] = location     {id:[data]}
    
    for _, spot in recommendations.iterrows():
        if spot['id'] not in seen_spots:
            return spot
    
    return None  # Return None if all recommended spots have been seen

####
#internal function, no api needed, just reads from database
###
def get_group_profile(user_ids, tfidf, coordinate_scaler):
    group_vector = np.zeros(features.shape[1])
    for user_id in user_ids:
        user_vector = get_user_profile(user_id, tfidf, coordinate_scaler)
        group_vector += user_vector
    
    # Normalize the group vector
    norm = np.linalg.norm(group_vector)
    if norm > 0:
        group_vector /= norm
    
    return group_vector

####
#internal function, no api needed, just reads from database
###
def group_based_recommend(user_ids, n=10):
    group_profile = get_group_profile(user_ids, tfidf, coordinate_scaler)
    scores = cosine_similarity([group_profile], features)[0]
    
    top_indices = scores.argsort()[-n:][::-1]
    recommended_ids = df.iloc[top_indices]['id'].tolist()
    
    return spot_details[spot_details['id'].isin(recommended_ids)][['id', 'name', 'image_url', 'phone']]

####
#create an api for this, read from database, and from user and display in ui (put to front end)
###
def get_next_group_spot(user_ids):
    recommendations = group_based_recommend(user_ids, n=20)  # Get more recommendations for groups
    
    # Get all seen spots for the group
    seen_spots = set()
    for user_id in user_ids:
        seen_spots.update(user_preferences[user_id]['location_specific'].keys())
    
    for _, spot in recommendations.iterrows():
        if spot['id'] not in seen_spots:
            return spot

####
#internal function, no api needed, just reads from database
###
def least_misery_group_recommend(user_ids, n=10):
    individual_scores = []
    for user_id in user_ids:
        user_profile = get_user_profile(user_id, tfidf, coordinate_scaler)
        scores = cosine_similarity([user_profile], features)[0]
        individual_scores.append(scores)
    
    # Take the minimum score for each item across all users
    group_scores = np.min(individual_scores, axis=0)
    
    top_indices = group_scores.argsort()[-n:][::-1]
    recommended_ids = df.iloc[top_indices]['id'].tolist()
    
    return spot_details[spot_details['id'].isin(recommended_ids)][['id', 'name', 'image_url', 'phone']]

    return None  # Return None if all recommended spots have been seen

####
#internal function, no api needed, just reads from database
###
def get_group_recommendation(user_ids):
    # You could alternate between different group recommendation strategies
    strategies = [group_based_recommend, least_misery_group_recommend]
    strategy = random.choice(strategies)
    
    recommendations = strategy(user_ids, n=1)
    if not recommendations.empty:
        return recommendations.iloc[0]
    else:
        return None

#merely for testing purposes
def main():
    # add_new_user(1, {'price': '$$', 'categories': ['Italian', 'Bars'], 'coordinates': (39.9526, 75.1652)})
    # update_user_preferences(1, {'categories': ['Italian', 'Bars', 'Seafood']})
    # record_spot_interaction(1, 'j1S3NUrkB3BVT49n_e76NQ', {'time_viewing': 5.0})
    # record_spot_interaction(1, 'zj8Lq1T8KIC5zwFief15jg', {'time_viewing': 2.0})

    # next_spot = get_next_spot(1)
    # print("Next spot to show:", next_spot)

    #stats = get_user_stats(1)
    #print("User stats:", stats)
    #Add more test users
    add_new_user(2, {'price': '$', 'categories': ['New American', 'Bars'], 'coordinates': (40.7128, 74.0060)}, 'Jack Ingof', 'password', 23)
    add_new_user(3, {'price': '$$', 'categories': ['Mexican', 'Bars'], 'coordinates': (34.0522, 118.2437)},'Don T. Beakunt', 'peep', 11)
    add_new_user(4, {'price': '$$$', 'categories': ['Japanese', 'Cafes'], 'coordinates': (35.6895, 139.6917)},'Ben Dover', 'bread6', 76)
    add_new_user(5, {'price': '$', 'categories': ['Fish & Chips', 'Food Stands'], 'coordinates': (51.5074, 0.1278)},'Mike Hawk', 'mandilj', 999)


    

if __name__ == "__main__":
    main()

New user 2 added successfully
New user 3 added successfully
New user 4 added successfully
New user 5 added successfully


In [13]:
from pymongo import MongoClient

client = MongoClient('mongodb+srv://loko:melike2004@lovelores.h1nkog2.mongodb.net/?retryWrites=true&w=majority&appName=LoveLores')
db = client.GoSpot
collection = db['User']
#list every user in the database
# list(collection.find({}))
#collection.insert_one(user_preferences[1])
# for key in user_preferences:
#     user_preferences[key]['_id'] = key
#     collection.insert_one(user_preferences[key])
# list(collection.find({'_id':3}))


In [5]:
list(collection.find_one({'_id': 1}).get('location_specific',{}).keys()  )

['j1S3NUrkB3BVT49n_e76NQ', 'zj8Lq1T8KIC5zwFief15jg']

In [12]:
import pandas as pd
example_df = pd.DataFrame([{'id': 1, 'name': 'Spot 1', 'image_url': 'url1', 'phone': '123-456-7890'},{ 'id': 2, 'name': 'Spot 2', 'image_url': 'url2', 'phone': '123-456-7890'}])
#display each entry in the dataframe as a series
for index, row in example_df.iterrows():
    print(row['id'])


1
2


In [17]:
collection = db['User']
#create a new field 'groups' for each user, and set it to an empty list
collection.update_many({},{'$set':{'groups':[]}})

UpdateResult({'n': 9, 'electionId': ObjectId('7fffffff0000000000000022'), 'opTime': {'ts': Timestamp(1722787185, 11), 't': 34}, 'nModified': 9, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1722787185, 11), 'signature': {'hash': b'\x04\xc5\x06\x0c\xa8\xee\r\x81\x02)\xf2\x81\x12A\xb9\x80L\x04u\x1b', 'keyId': 7351819825506680836}}, 'operationTime': Timestamp(1722787185, 11), 'updatedExisting': True}, acknowledged=True)

In [15]:
import numpy as np
np.random.rand() - 0.5

-0.042782440211265205

In [20]:
import pandas as pd, json, numpy as np, random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from annoy import AnnoyIndex

category_mapping = {
    'American': ['American', 'New American', 'Southern', 'Soul Food', 'Cajun/Creole', 'Tex-Mex'],
    'Asian': ['Chinese', 'Japanese', 'Korean', 'Thai', 'Vietnamese', 'Indian', 'Pakistani', 'Bangladeshi', 'Taiwanese', 'Filipino', 'Malaysian', 'Indonesian', 'Singaporean', 'Burmese', 'Cambodian', 'Laotian', 'Mongolian', 'Nepalese', 'Sri Lankan', 'Asian Fusion'],
    'European': ['Italian', 'French', 'Spanish', 'German', 'Greek', 'British', 'Irish', 'Scottish', 'Polish', 'Russian', 'Ukrainian', 'Hungarian', 'Czech', 'Austrian', 'Belgian', 'Dutch', 'Swiss', 'Scandinavian', 'Portuguese'],
    'Latin American': ['Mexican', 'Brazilian', 'Peruvian', 'Argentine', 'Colombian', 'Venezuelan', 'Cuban', 'Puerto Rican', 'Dominican', 'Salvadoran', 'Honduran', 'Nicaraguan', 'Guatemalan', 'Ecuadorian', 'Bolivian', 'Chilean'],
    'Middle Eastern': ['Lebanese', 'Turkish', 'Persian/Iranian', 'Israeli', 'Moroccan', 'Egyptian', 'Syrian', 'Armenian', 'Afghan', 'Iraqi', 'Uzbek', 'Georgian'],
    'African': ['Ethiopian', 'Nigerian', 'Ghanaian', 'Senegalese', 'South African', 'Eritrean', 'Somali', 'Kenyan', 'Tanzanian', 'Ugandan'],
    'Seafood': ['Seafood', 'Sushi Bars', 'Fish & Chips', 'Poke'],
    'Fast Food': ['Fast Food', 'Burgers', 'Pizza', 'Sandwiches', 'Hot Dogs', 'Chicken Wings'],
    'Vegetarian and Vegan': ['Vegetarian', 'Vegan', 'Raw Food'],
    'Breakfast and Brunch': ['Breakfast & Brunch', 'Pancakes', 'Waffles', 'Bagels', 'Donuts'],
    'Bakeries and Desserts': ['Bakeries', 'Desserts', 'Ice Cream & Frozen Yogurt', 'Cupcakes', 'Patisserie/Cake Shop', 'Gelato'],
    'Cafes and Coffee Shops': ['Cafes', 'Coffee & Tea', 'Bubble Tea'],
    'Bars and Pubs': ['Bars', 'Pubs', 'Sports Bars', 'Wine Bars', 'Beer Gardens', 'Cocktail Bars', 'Dive Bars', 'Hookah Bars'],
    'Specialty Food': ['Cheese Shops', 'Butcher', 'Farmers Market', 'Specialty Food', 'Organic Stores', 'Health Markets'],
    'Food Trucks and Stands': ['Food Trucks', 'Food Stands', 'Street Vendors'],
    'Grocery': ['Grocery', 'International Grocery', 'Convenience Stores'],
    'Nightlife': ['Nightlife', 'Dance Clubs', 'Karaoke', 'Comedy Clubs', 'Jazz & Blues'],
    'Arts and Entertainment': ['Museums', 'Art Galleries', 'Performing Arts', 'Music Venues', 'Theaters', 'Cinema'],
    'Outdoor Activities': ['Parks', 'Beaches', 'Hiking', 'Botanical Gardens', 'Playgrounds', 'Dog Parks'],
    'Fitness and Sports': ['Gyms', 'Yoga', 'Martial Arts', 'Swimming Pools', 'Tennis', 'Basketball Courts', 'Soccer'],
    'Shopping': ['Shopping Centers', 'Clothing', 'Shoes', 'Jewelry', 'Books', 'Electronics', 'Home & Garden'],
    'Beauty and Spas': ['Hair Salons', 'Nail Salons', 'Day Spas', 'Massage'],
    'Hotels and Accommodation': ['Hotels', 'Hostels', 'Bed & Breakfast'],
    'Event Planning and Services': ['Wedding Planning', 'Party & Event Planning', 'Caterers', 'Photographers'],
    'Automotive': ['Car Dealers', 'Auto Repair', 'Car Wash', 'Gas Stations'],
    'Professional Services': ['Lawyers', 'Accountants', 'Real Estate', 'Insurance'],
    'Education': ['Schools', 'Colleges', 'Tutoring', 'Cooking Classes', 'Art Schools'],
    'Pets': ['Pet Stores', 'Veterinarians', 'Pet Groomers', 'Dog Walkers'],
    'Religious Organizations': ['Churches', 'Mosques', 'Synagogues', 'Temples'],
    'Other': []  # Catch-all for categories that don't fit elsewhere
}

# Create a reverse mapping for easy lookup
reverse_category_mapping = {sub_cat: main_cat for main_cat, sub_cats in category_mapping.items() for sub_cat in sub_cats}

with open('Updating_datasets/initial_weights.json', 'r') as file:
    data = json.load(file)

index=[]
vals = []
for key,value in data.items():
    index.append(key)
    vals.append([val for val in value.values()])
    
initial_weights = pd.DataFrame(vals,columns=[key for key in data[index[0]].keys()],index=index)

def preprocess_data(initial_weights):
    df = initial_weights.copy().reset_index()
    df.rename(columns={'index': 'id'}, inplace=True)
    
    # Extract categories and map to general categories
    df['categories'] = df['categories'].apply(lambda x: [reverse_category_mapping.get(cat['title'], 'Other') for cat in x])
    df['categories'] = df['categories'].apply(lambda x: ', '.join(set(x)) if x else 'Other')
    
    # Handle missing values
    df['review_count'] = df['review_count'].fillna(0)
    df['rating'] = df['rating'].fillna(0)
    df['price'] = df['price'].fillna('$')
    df['latitude'] = df['coordinates'].apply(lambda x: x['latitude'] if x and 'latitude' in x else 0)
    df['longitude'] = df['coordinates'].apply(lambda x: x['longitude'] if x and 'longitude' in x else 0)
    df = df.drop(columns=['coordinates'])
    
    return df



df = preprocess_data(initial_weights)
#print(df)

#one time thing, should be saved into the database as read only, unless we expand with more datasets
def create_feature_matrix(df):
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['categories'])
    
    scaler = MinMaxScaler()
    numerical_features = scaler.fit_transform(df[['review_count', 'rating', 'latitude', 'longitude']].fillna(0))
    
    price_dummies = pd.get_dummies(df['price'], prefix='price').fillna(0)
    
    features = np.hstack((tfidf_matrix.toarray(), numerical_features, price_dummies.values))
    
    return features, tfidf, scaler

def build_annoy_index(features, n_trees=150):
    print(f"Building Annoy index with {features.shape[1]} dimensions...")
    f = features.shape[1]
    t = AnnoyIndex(f, 'angular')
    try:
        for i in range(features.shape[0]):
            print(f"Adding item {i} to the index")
            t.add_item(i, features[i])
        print(f"Added {features.shape[0]} items to the index")
        
        print(f"Starting to build index with {n_trees} trees...")
        t.build(n_trees)
        print(f"Built index with {n_trees} trees")
    except Exception as e:
        print(f"Error in build_annoy_index: {str(e)}")
        import traceback
        traceback.print_exc()
        raise
    return t

def create_and_save_annoy_index(df):
    # print("Starting create_feature_matrix...")
    features, tfidf, scaler = create_feature_matrix(df)
    # print(f"Feature matrix created. Shape: {features.shape}")
    
    # print("Building Annoy index...")
    annoy_index = build_annoy_index(features)
    # print("Annoy index built successfully")
    
    print("Saving Annoy index...")
    try:
        annoy_index.save('item_similarity.ann')
        print("Annoy index saved successfully")
    except Exception as e:
        print(f"Error saving Annoy index: {str(e)}")
        # Try saving to a different location
        try:
            annoy_index.save('C:/temp/item_similarity.ann')
            print("Annoy index saved successfully to C:/temp/")
        except Exception as e:
            print(f"Error saving Annoy index to C:/temp/: {str(e)}")
    
    return features, tfidf, scaler


'\ndef build_annoy_index(features, n_trees=30):\n    print(f"Building Annoy index with {features.shape[1]} dimensions...")\n    f = features.shape[1]\n    t = AnnoyIndex(f, \'angular\')\n    try:\n        for i in range(features.shape[0]):\n            print(f"Adding item {i} to the index")\n            t.add_item(i, features[i])\n        print(f"Added {features.shape[0]} items to the index")\n        \n        print(f"Starting to build index with {n_trees} trees...")\n        t.build(n_trees)\n        print(f"Built index with {n_trees} trees")\n    except Exception as e:\n        print(f"Error in build_annoy_index: {str(e)}")\n        import traceback\n        traceback.print_exc()\n        raise\n    return t\n\ndef create_and_save_annoy_index(df):\n    # print("Starting create_feature_matrix...")\n    features, tfidf, scaler = create_feature_matrix(df)\n    # print(f"Feature matrix created. Shape: {features.shape}")\n    \n    # print("Building Annoy index...")\n    annoy_index = bu

In [21]:
features,_,_ = create_feature_matrix(df)

In [57]:
from pymongo import MongoClient
client = MongoClient('mongodb+srv://loko:melike2004@lovelores.h1nkog2.mongodb.net/?retryWrites=true&w=majority&appName=LoveLores')
db = client.GoSpot
collection = db['Spot']
import json
spots = json.load(open('../datasets/data.json'))

In [64]:
collection.insert_one(spots['businesses'][0])

InsertOneResult('zj8Lq1T8KIC5zwFief15jg', acknowledged=True)