In [7]:
import instaloader
import pandas as pd
import datetime 
import time
import logging
from requests.exceptions import RequestException

def scrape_handle(L, handle, cutoffdate):
    max_retries = 3
    retry_count = 0
    posts_data = []

    while retry_count < max_retries:
        try:
            profile = instaloader.Profile.from_username(L.context, handle)
            for post in profile.get_posts():
                if post.date > cutoffdate:
                    photo_caption = post.accessibility_caption if post.accessibility_caption is not None else ""
                    caption = post.caption if post.caption is not None else ""
                    posts_data.append({
                        'url': post.shortcode,
                        'likes': post.likes,
                        'display_photo': post.url,
                        'account': handle.replace('\"','\\\"'),
                        'date': post.date,
                        'caption': caption.replace("\n",""),
                        'accessibility_caption': photo_caption.replace("\n",""),
                    })
                else:
                    break
            return posts_data
        except (instaloader.exceptions.InstaloaderException, RequestException) as e:
            retry_count += 1
            logging.error(f"Error scraping {handle} (attempt {retry_count}/{max_retries}): {str(e)}")
            if retry_count < max_retries:
                logging.info(f"Retrying {handle}...")
                time.sleep(10)  # Wait for 5 seconds before retrying
            else:
                logging.error(f"Max retries reached for {handle}. Moving to next handle.")
    return []

def scrape_instagram():
    cutoffdate = datetime.datetime.today() - datetime.timedelta(days=10)
    handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups','electriummobility','uwhiphop','uwaterloo_ksa','uw_aviation','uwaterloopm','uwmcc','uwmsa','gdscwaterloo','waterloo_ultimate','uwcheeseclub','uwstreetdance','uwmidsun','watolink_uw','uwaterlooeng','uwpokerclub','uwaterloocycling','uwaterloobsa','uw_phys_club','uw.gsa','uwcsclub','uwfintech','uwaterloosc','uwactsciclub','uwstatsclub']
    postsDf = pd.DataFrame()

    L = instaloader.Instaloader()
    
    for handle in handles:
        logging.info(f"Scraping {handle}")
        handle_data = scrape_handle(L, handle, cutoffdate)
        if handle_data:
            postsDf = pd.concat([postsDf, pd.DataFrame(handle_data)], ignore_index=True)
        time.sleep(1)  # Delay between handles to avoid rate limiting

    return postsDf

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        result = scrape_instagram()
        print(result)
        logging.info("Scraping completed successfully.")
        return result
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    postsDf = main()

2024-08-27 15:07:52,646 - INFO - Scraping uwengsoc
2024-08-27 15:08:00,298 - INFO - Scraping uwcsa
2024-08-27 15:08:04,605 - INFO - Scraping uw_ux
2024-08-27 15:08:09,971 - INFO - Scraping uwblueprint
2024-08-27 15:08:14,245 - INFO - Scraping uwaterlooeng
2024-08-27 15:08:19,279 - INFO - Scraping uwaterloottc
2024-08-27 15:08:24,447 - INFO - Scraping uwaterloodsc
2024-08-27 15:08:29,079 - INFO - Scraping uwaterloopm
2024-08-27 15:08:33,689 - INFO - Scraping uwmcc
2024-08-27 15:08:36,461 - INFO - Scraping gdscwaterloo
2024-08-27 15:08:38,725 - INFO - Scraping uwsmileclub
2024-08-27 15:08:43,191 - INFO - Scraping socratica.info
2024-08-27 15:08:51,444 - INFO - Scraping wataiteam
2024-08-27 15:08:54,376 - INFO - Scraping uwawscloud
2024-08-27 15:08:58,572 - INFO - Scraping techplusuw
2024-08-27 15:09:01,873 - INFO - Scraping itshera.co
2024-08-27 15:09:04,896 - INFO - Scraping uwstartups
2024-08-27 15:09:08,686 - INFO - Scraping electriummobility
2024-08-27 15:09:13,998 - INFO - Scraping 

            url  likes                                      display_photo  \
0   C-5xqSrA81K     84  https://instagram.fyyc6-1.fna.fbcdn.net/v/t39....   
1   C_Le_jEAIR4     14  https://instagram.fyyc6-1.fna.fbcdn.net/v/t51....   
2   C_GwM9nSn_X     23  https://instagram.fyyc6-1.fna.fbcdn.net/v/t51....   
3   C-3fNYjPJAE     63  https://instagram.fyyc6-1.fna.fbcdn.net/v/t51....   
4   C-5xqSrA81K     84  https://instagram.fyyc6-1.fna.fbcdn.net/v/t39....   
5   C_CFHGdNhdc     68  https://instagram.fyyc6-1.fna.fbcdn.net/v/t39....   
6   C-_S--MNpit     87  https://instagram.fyyc6-1.fna.fbcdn.net/v/t39....   
7   C--mXDTScac    104  https://instagram.fyyc6-1.fna.fbcdn.net/v/t51....   
8   C-6FveoSVkt    147  https://instagram.fyyc6-1.fna.fbcdn.net/v/t51....   
9   C-3glYmypqo    198  https://instagram.fyyc6-1.fna.fbcdn.net/v/t39....   
10  C-3JeD3y5Ck     92  https://instagram.fyyc6-1.fna.fbcdn.net/v/t39....   
11  C-ytzX3NdJM    185  https://instagram.fyyc6-1.fna.fbcdn.net/v/t51....   

In [8]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("instagram_raw.csv")

In [9]:
#Preliminary Screening to determine if posts contains event or not

from together import Together
import os
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

postsDf = pd.read_csv("instagram_raw.csv").replace('"','', regex=True)
postsDf["is_event"] = pd.NA
postsDf["processed_json"]=pd.NA

def check_string(input_string):
    if any(word in input_string for word in ['yes', 'Yes', 'True', 'true']):
        return True
    elif any(word in input_string for word in ['no', 'No', 'False', 'false']):
        return False
    else:
        return False  # This handles cases where none of the words are found


cnt=0
for index, row in postsDf.iterrows():
    currentjson = f"'account': '{row['account']}'; 'caption': '{row['caption']}'; 'photo_caption': '{row['accessibility_caption']}'"
    postsDf.at[index, "processed_json"] = currentjson
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": f'Does the following instagram post contain a club event with a specified time. RETURN Yes or No: {currentjson}'}],
        max_tokens=2
    )
    print(response.choices[0].message.content)
    is_event = check_string(response.choices[0].message.content)
    postsDf.at[index, "is_event"] = is_event

No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
Yes
No
No
No
Yes
No
No
No
No
No
No
No
No
No
Yes
No
No
No
Yes
No
No
No
Yes
No
Yes
No
Yes
No
Yes
Yes
Yes
No
No
No
No
Yes
No
Yes
No
No
No
No
Yes
No
Yes
Yes
Yes
No
Yes
Yes
Yes
No
Yes
No
No
No
Yes.
Yes
No
No
No
No
No
No
Yes
No
Yes
Yes
No
No
No
No
Yes
Yes
Yes
No
No
Yes
Yes
Yes
No
Yes
Yes
No
Yes
No
Yes
No
Yes
Yes
Yes
Yes.
Yes
No
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
No
No
No
Yes
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
Yes
Yes
No
No
Yes
Yes
Yes
Yes
No
No
No
Yes
No
No
No
No
Yes
Yes
No
No
No
No
No
Yes
No
No
Yes
No
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
No
No
No
Yes
No
No
No
No
No
No
Yes
No
Yes
No
No
Yes
No
No
No
No
No
No
No
No
No
Yes
No
No
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
No
No
No
No
Yes
No
Yes
No
No
No
No
No
No
Yes
Yes
No
No
Yes
Yes
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No


In [11]:
postsDf.to_csv("preliminaryProcessedInformation.csv")

In [14]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("C:/Users/david/Desktop/uw-upnext/WebScraper/basePrompt.in","r", encoding = "utf-8").read()

postsDf = pd.read_csv("C:/Users/david/Desktop/uw-upnext/WebScraper/preliminaryProcessedInformation.csv")

basePrompt = open("basePrompt.in","r", encoding = "utf-8").read()

def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    SOCIAL = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    NETWORKING = auto()
    GAMING = auto()

class Event(BaseModel):
    is_event: bool = Field(description="Whether the post contains an event")
    event_name: str = Field(description="The Name of the Event")
    event_description: str = Field(description='Concise 20 word summary of the event without time or location')
    event_categories: list[str] = Field(description='Categorize the Event into at least one or more of the following: TECH, DESIGN, SOCIAL, MUSIC, CULTURE, SPORTS, NETWORK, GAMING')
    start_time: str = Field(description="The Start time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    end_time: str = Field(description="The End time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    location: str = Field(description= "The location of event")

def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
    messages=[
        {
            "role": "system",
            "content": basePrompt,
        },
        {
            "role": "user",
            "content": "\n\n Input " + remove_emojis(inputJson) + "\n\n" + "Output",
        },
    ])
    
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as err:
            print(str(err))
            return {'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

In [15]:
postsDf = pd.read_csv("preliminaryProcessedInformation.csv")

postsDf["event_details"] = pd.NA


for index, row in postsDf.iterrows():
    if postsDf.at[index, "is_event"]== True:
        event_details = return_event_details(str(postsDf.at[index, "processed_json"]))
        print(index, event_details)
        if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
            postsDf.at[index, "is_event"] = False
            postsDf.at[index, "event_details"] = None
        else:
            postsDf.at[index, "event_details"] = event_details
    else: print(index, "no event detected")

0 no event detected
1 no event detected
2 no event detected
3 no event detected
4 no event detected
5 no event detected
6 no event detected
7 no event detected
8 no event detected
9 no event detected
10 no event detected
11 no event detected
12 no event detected
13 no event detected
14 no event detected
15 no event detected
16 no event detected
17 no event detected
18 no event detected
19 no event detected
20 no event detected
21 no event detected
22 no event detected
23 no event detected
24 no event detected
25 no event detected
26 no event detected
27 {'return_id': 'null', 'is_event': True, 'event_name': 'PEO-SC 2024', 'event_description': 'PEO-SC 2024 conference for engineering students at Waterloo', 'event_categories': ['ENGINEERING', 'TECH'], 'start_time': '2024-10-04T00:00:00+00:00', 'end_time': '2024-10-06T00:00:00+00:00', 'location': 'null'}
28 no event detected
29 no event detected
30 {'return_id': 'null', 'is_event': True, 'event_name': 'PEO-SC 2024', 'event_description': 'Co

In [16]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)

     Unnamed: 0.1  Unnamed: 0   account                 date  \
0               0           0  uwengsoc  2024-05-26 19:29:29   
1               1           1  uwengsoc  2024-08-26 17:33:54   
2               2           2  uwengsoc  2024-08-26 17:31:50   
3               3           3  uwengsoc  2024-08-26 17:28:35   
4               4           4  uwengsoc  2024-08-26 17:22:48   
..            ...         ...       ...                  ...   
478           478         478  uwmidsun  2024-07-26 00:53:57   
479           479         479  uwmidsun  2024-07-17 18:11:26   
480           480         480  uwmidsun  2024-07-17 18:09:17   
481           481         481  uwmidsun  2024-07-17 15:08:59   
482           482         482  uwmidsun  2024-07-17 13:42:11   

                                               caption  accessibility_caption  \
0    EngSoc Student Services Hours\r\n*Subject to c...                    NaN   
1    Introducing our Fall 2024 Operations Commissio...               