# Recommender System Pipeline
## Synthetic Data Generation

how tags can be used

In [2]:
import csv
import random
from collections import defaultdict
import pandas as pd

# EVENTS
# Define possible values for each structured tag
structured_tags = {
    'Free/Paid': ['Free', 'Paid'],
    'Individual/Group': ['Individual', 'Group'],
    'Indoor/Outdoor': ['Indoor', 'Outdoor'],
    # Add other structured tags here
}

# List of unstructured tags
unstructured_tags = ['Well-Being', 'CAPS', 'Stress-Relief', 'Healthy-Cooking-Demos', 'Food',
                     'Forum', 'Identity', 'BIPOC', 'RISE-CCR', 'Recovery', 'Mindfulness',
                     'Virtual', 'RISE', 'International', 'Academics', 'Yoga', 'Stress',
                     'Healthy-Cooking-Demos', 'Friendship', 'Hybrid', 'grad-prof', 'LGBTQ',
                     'Meditation', 'Loss']

def generate_data_point():
    data_point = {}
    # Generate structured tags
    for tag, values in structured_tags.items():
        data_point[tag] = random.choice(values)
    # Generate random unstructured tag
    data_point['Unstructured Tag'] = random.choice(unstructured_tags)
    # data_point['Rating'] = random.randint(1, 5)
    return data_point

def generate_data_set(num_points):
    return [generate_data_point() for _ in range(num_points)]

def write_to_csv(data, filename):
    keys = data[0].keys()
    with open(filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

data_set = generate_data_set(1000)
write_to_csv(data_set, 'synthetic_data.csv')

# USERS

# Define possible values for each user attribute
user_attributes = {
    'Age': ['18-25', '26-35', '36-45', '46-55', '56+'],
    'Engagement': ['Low', 'Medium', 'High'],
    # Add other user attributes here
}

def generate_user():
    user = {}
    # Generate user attributes
    for attribute, values in user_attributes.items():
        user[attribute] = random.choice(values)
    return user

def generate_user_matrix(num_users):
    return [generate_user() for _ in range(num_users)]

def write_users_to_csv(users, filename):
    keys = users[0].keys()
    with open(filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(users)

user_matrix = generate_user_matrix(500)
write_users_to_csv(user_matrix, 'user_matrix.csv')

# --------

data = pd.read_csv('synthetic_data.csv')
# data.index.name = 'Events'

users = pd.read_csv('user_matrix.csv')
# users.index.name = 'User_ID'

# --------
# Extract unique tags from both structured and unstructured tags
all_tags = []
for values in structured_tags.values():
    all_tags.extend(values)
all_tags.extend(unstructured_tags)
all_tags = list(set(all_tags))

# Create user-item matrix
user_item_matrix = defaultdict(dict)

# Create one-hot encoded vectors for each user and item
for user_id, user in users.iterrows():
    user_item_matrix[user_id] = {tag: 0 for tag in all_tags}

for event_id, event in data.iterrows():
    for tag, value in event.items():
        if tag != 'Unstructured Tag':
            user_item_matrix[event_id][value] = 1
    user_item_matrix[event_id][event['Unstructured Tag']] = 1

user_item_df = pd.DataFrame.from_dict(user_item_matrix, orient='index')
user_item_df.fillna(0, inplace=True)  # Fill NaN values with 0 (for tags not associated with events)
user_item_df.index.name = 'User_ID' 
user_item_df.to_csv('user_item_matrix.csv')

In [4]:
user_item_df.head(50)

Unnamed: 0_level_0,BIPOC,Stress,Mindfulness,Yoga,Forum,Healthy-Cooking-Demos,International,Paid,Group,LGBTQ,...,Hybrid,Free,Friendship,Virtual,Recovery,Loss,grad-prof,Meditation,Academics,Identity
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
data

Unnamed: 0_level_0,Free/Paid,Individual/Group,Indoor/Outdoor,Unstructured Tag,All_Tags
Events,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Paid,Group,Indoor,Loss,Paid Group Indoor Loss
1,Free,Group,Indoor,Food,Free Group Indoor Food
2,Free,Individual,Indoor,Healthy-Cooking-Demos,Free Individual Indoor Healthy-Cooking-Demos
3,Free,Individual,Indoor,CAPS,Free Individual Indoor CAPS
4,Paid,Group,Outdoor,Stress-Relief,Paid Group Outdoor Stress-Relief
...,...,...,...,...,...
995,Paid,Individual,Outdoor,grad-prof,Paid Individual Outdoor grad-prof
996,Paid,Individual,Indoor,Stress,Paid Individual Indoor Stress
997,Paid,Group,Outdoor,Stress,Paid Group Outdoor Stress
998,Free,Individual,Indoor,International,Free Individual Indoor International


 #### Next Steps:
 
 - Finalize data sources for event/services
     - SHWB - Tockify: RISE/CAPS?
     - other websites/student orgs
     - ex. REC? FusionGo API data
 - scraping data - APIs?
 - data from Tockify
 - score data
 - beyond Tockify (diff formats data - compatibility)
 - services - as a combination of corr event tags
 - Onboarding/Triage - ask for tags
 - Adding Incentives to the RecSys
 
 Evaluation performance metrics?

automatically update every morning

In [53]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

user_item_df = pd.read_csv('user_item_matrix.csv', index_col='User_ID')

def recommend_events_for_user(user_id, top_n=5):
    if user_id not in user_item_df.index:
        return "User not found in the database."
    
    # Get user preferences (tags)
    user_preferences = user_item_df.loc[user_id].values.reshape(1, -1)

    # Compute cosine similarity between user preferences and all events
    similarities = cosine_similarity(user_preferences, user_item_df.values)
    sim_scores = list(enumerate(similarities[0]))

    # Sort events based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N similar events
    top_event_indices = [x[0] for x in sim_scores[1:top_n+1]]  # Exclude user's own preferences
    top_events = user_item_df.iloc[top_event_indices]
    
    return top_events

user_id = random.randint(0, 999)
recommended_events = recommend_events_for_user(user_id)
recommended_events.to_csv('recommended_events_for_user.csv')

In [71]:
recommended_events

Unnamed: 0_level_0,Academics,Stress-Relief,Loss,Well-Being,Yoga,grad-prof,RISE,Outdoor,BIPOC,Mindfulness,...,International,CAPS,Individual,Paid,Friendship,Stress,Group,Healthy-Cooking-Demos,RISE-CCR,LGBTQ
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [54]:
users['Age']

User_ID
0      36-45
1      26-35
2        56+
3      26-35
4      46-55
       ...  
495    46-55
496    26-35
497    18-25
498    36-45
499      56+
Name: Age, Length: 500, dtype: object

In [57]:
users.index.values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [51]:
for _, user in users.iterrows():
    user_id = user['User_ID']

KeyError: 'User_ID'

In [42]:
user_item = pd.read_csv('user_item_matrix.csv')
user_item

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.990,0.991,0.992,0.993,0.994,0.995,0.996,0.997,0.998,0.999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Synthetic Data
### Item Matrix (Events)

In [34]:
data

Unnamed: 0_level_0,Free/Paid,Individual/Group,Indoor/Outdoor,Unstructured Tag,All_Tags
Events,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Free,Group,Outdoor,Yoga,Free Group Outdoor Yoga
1,Paid,Individual,Indoor,Food,Paid Individual Indoor Food
2,Free,Individual,Outdoor,International,Free Individual Outdoor International
3,Free,Group,Outdoor,Academics,Free Group Outdoor Academics
4,Paid,Group,Outdoor,Stress,Paid Group Outdoor Stress
...,...,...,...,...,...
995,Paid,Group,Outdoor,Yoga,Paid Group Outdoor Yoga
996,Free,Individual,Outdoor,Food,Free Individual Outdoor Food
997,Paid,Group,Outdoor,Friendship,Paid Group Outdoor Friendship
998,Paid,Group,Outdoor,Healthy-Cooking-Demos,Paid Group Outdoor Healthy-Cooking-Demos


(1, 31)

### User Matrix

array([[0.24538254],
       [0.48765551],
       [0.18201463],
       [0.24931057],
       [0.        ],
       [0.22770978],
       [0.        ],
       [0.        ],
       [0.2475157 ],
       [0.48765551],
       [0.22247371],
       [0.23583537],
       [0.        ],
       [0.51593905],
       [0.23507107],
       [0.47036137],
       [0.23281725],
       [0.        ],
       [0.24903088],
       [0.40128233],
       [0.27635504],
       [0.19749092],
       [0.23507107],
       [0.25272434],
       [0.24842094],
       [0.2589548 ],
       [0.        ],
       [0.39772063],
       [0.22591478],
       [0.25976451],
       [0.24817259],
       [0.        ],
       [0.47621887],
       [0.47583623],
       [0.        ],
       [0.24822248],
       [0.25671183],
       [0.23434878],
       [0.24817259],
       [0.22770978],
       [0.51593905],
       [0.2589548 ],
       [0.25511865],
       [0.        ],
       [0.22960618],
       [0.25593026],
       [0.48913527],
       [0.   

In [98]:
import numpy as np
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Recommendations based on tags similarity

In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Read the generated synthetic data
data = pd.read_csv('synthetic_data.csv')
data.index.name = 'Events'

# Combine structured and unstructured tags into a single feature
data['All_Tags'] = data.apply(lambda x: ' '.join(x[structured_tags.keys()]) + ' ' + x['Unstructured Tag'], axis=1)

# Use TF-IDF to vectorize the combined tags
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['All_Tags'])

# Similarity scores using cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Recommendations based on item similarity
def get_recommendations(index, cosine_sim=cosine_sim):
    sim_scores = list(enumerate(cosine_sim[index])) # pairwise similarity scores of indexed item   
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # Sort items based on similarity scores
    sim_scores = sim_scores[1:6] # Top 5 most similar items (excluding itself)
    indices = [i[0] for i in sim_scores] # indices of recommended items
    return data.iloc[indices]

# Recommendations based on item similarity
def get_recommendations_new(index, cosine_sim=cosine_sim):
    sim_scores = list(enumerate(cosine_sim[index])) # pairwise similarity scores of indexed item   
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # Sort items based on similarity scores
    sim_scores = sim_scores[1:6] # Top 5 most similar items (excluding itself)
    indices = [i[0] for i in sim_scores] # indices of recommended items
    return data.iloc[indices]


idx = random.randint(0, 999)
recommendations = get_recommendations(idx)
# recommendations.to_csv(f"recommendations_for_item_{idx}.csv", index=False)
    
print('5 most similar events to event',idx, 'are:')
recommendations

5 most similar events to event 169 are:


Unnamed: 0_level_0,Free/Paid,Individual/Group,Indoor/Outdoor,Unstructured Tag,All_Tags
Events,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
396,Free,Group,Outdoor,Loss,Free Group Outdoor Loss
489,Free,Group,Outdoor,Loss,Free Group Outdoor Loss
588,Free,Group,Outdoor,Loss,Free Group Outdoor Loss
596,Free,Group,Outdoor,Loss,Free Group Outdoor Loss
863,Free,Group,Outdoor,Loss,Free Group Outdoor Loss


In [73]:
# data.iloc[idx]
# get_recommendations(idx)

(1000, 1000)

In [28]:
print(tfidf_matrix)

  (0, 30)	0.8090022972083648
  (0, 22)	0.34131493637972427
  (0, 11)	0.3333565926936534
  (0, 8)	0.34335517969497875
  (1, 6)	0.8182079270978875
  (1, 16)	0.32991941755320636
  (1, 15)	0.33779570430182215
  (1, 23)	0.3279985184581891
  (2, 17)	0.8176068176182771
  (2, 15)	0.3317626125593982
  (2, 22)	0.3317626125593982
  (2, 8)	0.33374575592752975
  (3, 0)	0.8061030761885406
  (3, 22)	0.3436199644379856
  (3, 11)	0.3356078750656337
  (3, 8)	0.3456739862832312
  (4, 28)	0.7686571967226775
  (4, 23)	0.36493243707936973
  (4, 22)	0.37583282444466287
  (4, 11)	0.3670696369405867
  (5, 29)	0.8212780859420851
  (5, 23)	0.3254915153735665
  (5, 22)	0.3352138180279521
  (5, 11)	0.32739773239020403
  (6, 14)	0.8272593461520313
  :	:
  (994, 23)	0.33275881525051126
  (994, 22)	0.34269818927403145
  (994, 11)	0.33470759267206296
  (995, 23)	0.33275881525051126
  (995, 30)	0.8122809546881231
  (995, 22)	0.34269818927403145
  (995, 11)	0.33470759267206296
  (996, 6)	0.8128805877484173
  (996, 15)	0

In [2]:
import csv
import random
from collections import defaultdict

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import numpy as np

# EVENTS
# Define possible values for each structured tag
structured_tags = {
    'Free/Paid': ['Free', 'Paid'],
    'Individual/Group': ['Individual', 'Group'],
    'Indoor/Outdoor': ['Indoor', 'Outdoor'],
    # Add other structured tags here
}

# List of unstructured tags
unstructured_tags = ['Well-Being', 'CAPS', 'Stress-Relief', 'Healthy-Cooking-Demos', 'Food',
                     'Forum', 'Identity', 'BIPOC', 'RISE-CCR', 'Recovery', 'Mindfulness',
                     'Virtual', 'RISE', 'International', 'Academics', 'Yoga', 'Stress',
                     'Healthy-Cooking-Demos', 'Friendship', 'Hybrid', 'grad-prof', 'LGBTQ',
                     'Meditation', 'Loss']

def generate_data_point():
    data_point = {}
    # Generate structured tags
    for tag, values in structured_tags.items():
        data_point[tag] = random.choice(values)
    # Generate random unstructured tag
    data_point['Unstructured Tag'] = random.choice(unstructured_tags)
    # data_point['Rating'] = random.randint(1, 5)
    return data_point

def generate_data_set(num_points):
    return [generate_data_point() for _ in range(num_points)]

def write_to_csv(data, filename):
    keys = data[0].keys()
    with open(filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

data_set = generate_data_set(1000)
write_to_csv(data_set, 'synthetic_data.csv')
data = pd.read_csv('synthetic_data.csv')



In [3]:
data

Unnamed: 0,Free/Paid,Individual/Group,Indoor/Outdoor,Unstructured Tag
0,Paid,Group,Indoor,Loss
1,Free,Group,Indoor,Food
2,Free,Individual,Indoor,Healthy-Cooking-Demos
3,Free,Individual,Indoor,CAPS
4,Paid,Group,Outdoor,Stress-Relief
...,...,...,...,...
995,Paid,Individual,Outdoor,grad-prof
996,Paid,Individual,Indoor,Stress
997,Paid,Group,Outdoor,Stress
998,Free,Individual,Indoor,International


In [14]:
# Read the generated synthetic data
data = pd.read_csv('synthetic_data.csv')
data.index.name = 'Events'

# Combine structured and unstructured tags into a single feature
data['All_Tags'] = data.apply(lambda x: ' '.join(x[structured_tags.keys()]) + ' ' + x['Unstructured Tag'], axis=1)

# Use TF-IDF to vectorize the combined tags
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf.fit_transform(data['All_Tags'])

In [42]:
def find_candidates(interests=['free'],k=5):
    user_vector = tfidf.transform(interests)
    similarity = cosine_similarity(tfidf_matrix, user_vector)
    out = np.argsort(similarity[:,0])[::-1][:k]
    return out

In [49]:
new_user_tag = 'free group '
recs = find_candidates([new_user_tag])

In [50]:
recs

array([816, 114, 949, 647, 656], dtype=int64)

# Tockify

In [39]:
!pip install selenium



In [2]:
!pip install cloudscraper

Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
     ---------------------------------------- 99.7/99.7 kB 5.6 MB/s eta 0:00:00
Collecting requests-toolbelt>=0.9.1
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
     ---------------------------------------- 54.5/54.5 kB ? eta 0:00:00
Installing collected packages: requests-toolbelt, cloudscraper
Successfully installed cloudscraper-1.2.71 requests-toolbelt-1.0.0


In [3]:
import json
import cloudscraper
# from bs4 import BeautifulSoup  
import time

class TockifyEventScraper:
	"""
	Scraper for all Tockify based calendars within UCSD
	
	TODO:
	Create base class on which this class with extend.
	This will be a template for all future calendars
	
	"""

	def __init__(self):
		# Scraper object to bypass AJAX shields
		self.scraper = cloudscraper.create_scraper()

		# Potentially these fields could be inherited from base class
		self.raw_text = ""
		self.events_extracted = {}
	
	def get_list_of_events(self, 
						   calendar_name: str = "ucenevents", 
						   calendar_type: str = "monthly", 
						   search_query: str = "",
						   time_ms: int = -1):
		"""
		Function to extract events from specific calendar
		
		Inputs:
		-- calendar_name: Tockify's calendar name. E.g. University Centers is called 'ucenevents'
		-- calendar_type: Can be 'pinboard', 'agenda' or 'monthly'
			Always choose monthly to get full view of events
		-- search_query: Search query if requested, else left as "" will show all events
		-- time_ms: Start time for query in milliseconds.
			If set to (-1) it will take current time and find all events after that
			
		Output:
		-- events: JSON array of all events, along with tags
		(TODO: clean up array into events_extracted that converts info in standardized format)
		"""
		
		if time_ms == -1:
			time_ms = int(round(time.time() * 1000))
		
		url = f"https://tockify.com/{calendar_name}/{calendar_type}/?start_ms={time_ms}&search={search_query}"
		self.raw_text = self.scraper.get(url).text

		# Find query output section in code
		# NOTE: these were found experimentally by examing HTML dump
		# These can change in the future and might need timely readjustment
		
		start_idx = self.raw_text.find("query")+28
		end_idx = self.raw_text.find("metaData")-2

		# Convert segment into json => creates list of events
		# print(self.raw_text[start_idx: end_idx])
		
		events = json.loads(self.raw_text[start_idx: end_idx])
		
		# Generate Tockify event page URL from eid
		"""
		Example URL: https://tockify.com/ucenevents/detail/4627/1708470000000
		breaks down to ../{calendar_name}/detail/{eid.uid}/{eid.tid}
		"""
		for event in events:
			event['event_url'] = r"https://tockify.com/" + str(calendar_name) + \
								"/" + str(event['eid']['uid']) + "/" + str(event['eid']['tid'])

		return events

if __name__ == '__main__':
	scr = TockifyEventScraper()
	events = scr.get_list_of_events(calendar_name="ucenevents", search_query="")
	print(events)

[{'calid': '5e8cb682df82fe3367f55892', 'eid': {'uid': '4626', 'seq': 0, 'tid': 1707881400000, 'rid': 0}, 'when': {'start': {'millis': 1707881400000, 'tzid': 'America/Los_Angeles', 'ltz': 'PST', 'offset': -28800000}, 'end': {'millis': 1711936800000, 'tzid': 'America/Los_Angeles', 'ltz': 'PDT', 'offset': -25200000}, 'allDay': False}, 'content': {'summary': {'text': 'Redwood at La Jolla Playhouse'}, 'description': {'text': 'I hear my great escape calling Broadway icon Idina Menzel makes her La Jolla Playhouse debut in the world premiere of Redwood, a one-of-a-kind theatrical event about one woman’s journey into the preci'}, 'tagset': {'tags': {'default': ['Art', 'Fun', 'Theater']}}, 'place': 'Potiker Theatre, La Jolla Playhouse', 'address': '2910 La Jolla Village Drive, La Jolla, CA 92037, USA', 'location': {}, 'vlocation': {}, 'imageId': 'BWC_6595a8fd1e206623ed4f8740', 'attachments': [], 'noDetail': False, 'imageIdNg': '6595a8fd1e206623ed4f8740', 'imageSets': [{'ownerId': '5e8cb604df82fe

In [7]:
for i in len(events):
    events[i]

57

In [23]:
events[0]

{'calid': '5e8cb682df82fe3367f55892',
 'eid': {'uid': '4626', 'seq': 0, 'tid': 1707881400000, 'rid': 0},
 'when': {'start': {'millis': 1707881400000,
   'tzid': 'America/Los_Angeles',
   'ltz': 'PST',
   'offset': -28800000},
  'end': {'millis': 1711936800000,
   'tzid': 'America/Los_Angeles',
   'ltz': 'PDT',
   'offset': -25200000},
  'allDay': False},
 'content': {'summary': {'text': 'Redwood at La Jolla Playhouse'},
  'description': {'text': 'I hear my great escape calling Broadway icon Idina Menzel makes her La Jolla Playhouse debut in the world premiere of Redwood, a one-of-a-kind theatrical event about one woman’s journey into the preci'},
  'tagset': {'tags': {'default': ['Art', 'Fun', 'Theater']}},
  'place': 'Potiker Theatre, La Jolla Playhouse',
  'address': '2910 La Jolla Village Drive, La Jolla, CA 92037, USA',
  'location': {},
  'vlocation': {},
  'imageId': 'BWC_6595a8fd1e206623ed4f8740',
  'attachments': [],
  'noDetail': False,
  'imageIdNg': '6595a8fd1e206623ed4f8740

In [None]:
events[0].keys()

In [15]:
# Import libraries
import re
import requests
from bs4 import BeautifulSoup

# Define the URL of the webpage
url = "https://studentwellbeing.ucsd.edu/calendar/index.html"

# Send a GET request and get the response
response = requests.get(url)

patt = re.compile("\$\.get\('/leaderboards/(\w+)/")
js = "http://www.lolking.net/leaderboards/{}/eune/1.json"
soup = BeautifulSoup(requests.get("http://www.lolking.net/leaderboards#/eune/1").content)
script = soup.find("script", text=re.compile("\$\.get\('/leaderboards/"))

val = patt.search(script.text).group(1)
data = requests.get(js.format(val)).json()



# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the calendar element by its ID
calendar = soup.find(id="WebPartWPQ2")

# Check if the calendar element was found
if calendar is not None:
    # Find all the links within the calendar element
    links = calendar.find_all("a")
    # Continue with the rest of your code...
else:
    print("Calendar element not found")

# Find all the links within the calendar element
links = calendar.find_all("a")

# Loop through the links and extract the event details
events = []
for link in links:
    # Get the event title from the link text
    title = link.text.strip()
    # Get the event URL from the link href attribute
    url = link["href"]
    # Get the event date and time from the URL query parameters
    date = url.split("date=")[-1].split("&")[0]
    time = url.split("time=")[-1].split("&")[0]
    # Create a dictionary with the event details
    event = {
        "title": title,
        "url": url,
        "date": date,
        "time": time
    }
    # Append the event to the events list
    events.append(event)

# Print the events list
print(events)

Calendar element not found


AttributeError: 'NoneType' object has no attribute 'find_all'

In [18]:
from selenium import webdriver
import time


# Define the URL of the webpage
url = "https://studentwellbeing.ucsd.edu/calendar/index.html"

from selenium.webdriver.chrome.service import Service

service = Service(executable_path="C:/Users/pbedm/OneDrive/Desktop/chromedriver_win32/chromedriver.exe")
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)

# Sleep to allow time for the page to load (you can use WebDriverWait for better handling)
time.sleep(5)  # Adjust as needed

# Find the calendar element using the CSS selector
calendar = driver.find_element_by_css_selector("#tkf-body > div > div > div.viewerArea__main.ng-scope > ui-view > div > div.pinboardView > div > div")

# Check if the calendar element was found
if calendar is not None:
    # Find all the links within the calendar element
    links = calendar.find_elements_by_tag_name("a")
    # Continue with the rest of your code...
    events = []
    for link in links:
        # Get the event title from the link text
        title = link.text.strip()
        # Get the event URL from the link href attribute
        url = link.get_attribute("href")
        # Get the event date and time from the URL query parameters
        date = url.split("date=")[-1].split("&")[0]
        time = url.split("time=")[-1].split("&")[0]
        # Create a dictionary with the event details
        event = {
            "title": title,
            "url": url,
            "date": date,
            "time": time
        }
        # Append the event to the events list
        events.append(event)
    # Print the events list
    print(events)
else:
    print("Calendar element not found")

# Close the webdriver
driver.quit()

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 114
Current browser version is 119.0.6045.200 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe
Stacktrace:
Backtrace:
	GetHandleVerifier [0x003FA813+48355]
	(No symbol) [0x0038C4B1]
	(No symbol) [0x00295358]
	(No symbol) [0x002B61AC]
	(No symbol) [0x002B1EF3]
	(No symbol) [0x002B0579]
	(No symbol) [0x002E0C55]
	(No symbol) [0x002E093C]
	(No symbol) [0x002DA536]
	(No symbol) [0x002B82DC]
	(No symbol) [0x002B93DD]
	GetHandleVerifier [0x0065AABD+2539405]
	GetHandleVerifier [0x0069A78F+2800735]
	GetHandleVerifier [0x0069456C+2775612]
	GetHandleVerifier [0x004851E0+616112]
	(No symbol) [0x00395F8C]
	(No symbol) [0x00392328]
	(No symbol) [0x0039240B]
	(No symbol) [0x00384FF7]
	BaseThreadInitThunk [0x76407BA9+25]
	RtlInitializeExceptionChain [0x7726BD2B+107]
	RtlClearBits [0x7726BCAF+191]


In [47]:
from selenium.webdriver.common.by import By
url = "https://studentwellbeing.ucsd.edu/calendar/index.html"

options = webdriver.EdgeOptions()
driver = webdriver.Edge(options=options)
driver.get(url)
# response = requests.get(url)
# soup1 = BeautifulSoup(response.content, "html.parser")

elem = driver.find_element(By.CLASS_NAME, "main-section")

# tkf-client-window-wrap-1

In [48]:
elem

<selenium.webdriver.remote.webelement.WebElement (session="8b37e11910f6e190d00d693c1ccdb92a", element="141E29DAF649326C392BC7637079F616_element_13")>

In [44]:
soup1

<!DOCTYPE html>

<html class="no-js" lang="en-US" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="initial-scale=1.0" name="viewport"/>
<meta content="7MiYjX2ZghMOWvtKRWlar--c6mD2ose5EcbQFQ6szFA" name="google-site-verification"/>
<title>Calendar</title>
<meta content="University of California, San Diego" name="ORGANIZATION"/>
<meta content="index,follow,noarchive" name="robots"/>
<meta content="Well-Being" name="SITE"/>
<meta content="Calendar" name="PAGETITLE"/>
<meta content="Calendar" name="DESCRIPTION"/>
<meta content="a86129a00aaf69f57b25c953d0427c34" name="id"/>
<link href="//cdn.ucsd.edu/cms/decorator-5/styles/bootstrap.min.css" rel="stylesheet"/>
<link href="//cdn.ucsd.edu/cms/decorator-5/styles/base.min.css" rel="stylesheet"/>
<!-- Site-specific CSS files -->
<!-- Other CSS files -->
<!--[if lt IE 9]>
<link rel="stylesheet" href="https://cdn.ucsd.edu/cms/decorator-5/styles/ie-support.css"

In [32]:
import requests
from bs4 import BeautifulSoup

url = 'https://studentwellbeing.ucsd.edu/calendar/index.html'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    tag_elements = soup.find_all('span', class_='eventTags__text')
    
    tag_names = [tag.text.strip() for tag in tag_elements]
    print(tag_names)
else:
    print('Failed to fetch the page')

[]


In [41]:
import json
import urllib.request as urllib2
# new url      
url = 'https://studentwellbeing.ucsd.edu/calendar/index.html'

# read all data
page = urllib2.urlopen(url).read()

# convert json text to python dictionary
data = json.loads(page)
data
# print(data['principal_activities'])

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# Scrapy

In [1]:
!pip install scrapy



In [2]:
!pip install crochet



In [7]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerRunner
import re # text cleaning
from crochet import setup, wait_for # Reactor restart
setup()

class QuotesToCsv(scrapy.Spider):
    """scrape first line of  quotes from `wikiquote` by Maynard James Keenan and save to json file"""
    name = "MJKQuotesToCsv"
    start_urls = ['https://en.wikiquote.org/wiki/Maynard_James_Keenan']
    custom_settings = {'ITEM_PIPELINES': {'__main__.ExtractFirstLine': 1},
                       'FEEDS': {'quotes.csv': {'format': 'csv','overwrite': True}}}

    def parse(self, response):
        """parse data from urls"""
        for quote in response.css('div.mw-parser-output > ul > li'):
            yield {'quote': quote.extract()}


class ExtractFirstLine(object):
    def process_item(self, item, spider):
        """text processing"""
        lines = dict(item)["quote"].splitlines()
        first_line = self.__remove_html_tags__(lines[0])

        return {'quote': first_line}

    def __remove_html_tags__(self, text):
        """remove html tags from string"""
        html_tags = re.compile('<.*?>')
        return re.sub(html_tags, '', text)

@wait_for(10)
def run_spider():
    """run spider with MJKQuotesToCsv"""
    crawler = CrawlerRunner()
    d = crawler.crawl(QuotesToCsv)
    return d

run_spider()
df = pd.read_csv('quotes.csv')
df

Unnamed: 0,quote
0,Tool is not Slayer. I went to art school. I sp...
1,I think there’s a reason why wine figures into...
2,You can grow grapes in almost any part of the ...
3,"For the music, it’s not about the individual —..."
4,It’s in my blood. My great-grandfather made wi...
5,"It's the role of us to run our government, the..."
6,"Every now and then, you get people who tend to..."
7,The army influences everything I do. Certainly...
8,The process that we go through in recording wi...
9,One of my biggest heroes in music has been Dav...


In [54]:
response.xpath("//*[contains(@class,'class_B')]").getall()

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

class ExtractDivData(scrapy.Spider):
    name = 'DivData'
    start_urls = ['https://studentwellbeing.ucsd.edu/calendar/index.html']

    def parse(self, response):
        div_xpath = '/html/body/div/div[4]/div'
        #'/html/body/div/div[4]/div/div/div[3]/ui-view/div/div[2]/div/div/div[2]/div[3]/div/div'
        div_content = response.xpath(div_xpath).get()
        if div_content:
            print(div_content)
        else:
            print("Div content not found.")

process = CrawlerProcess()
process.crawl(ExtractDivData)
process.start()

2023-12-06 21:16:08 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2023-12-06 21:16:08 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.6 | packaged by conda-forge | (main, Oct  7 2022, 20:14:50) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1q  5 Jul 2022), cryptography 37.0.1, Platform Windows-10-10.0.22621-SP0
2023-12-06 21:16:08 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-12-06 21:16:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-12-06 21:16:08 [scrapy.extensions.telnet] INFO: Telnet Password: 50c4b73f0e5ebc60
2023-12-06 21:16:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.

Div content not found.


In [None]:
//*[@id="eid_1280_1701712800000_0"]

In [None]:
/html/body/div/div[4]/div/div/div[3]/ui-view/div/div[2]/div/div/div[2]/div[2]/div/div