In [1]:
import instaloader
import pandas as pd
import datetime 
import time
import logging
import random
from requests.exceptions import RequestException

def scrape_handle(L, handle, cutoffdate):
    max_retries = 3
    retry_count = 0
    posts_data = []

    while retry_count < max_retries:
        try:
            profile = instaloader.Profile.from_username(L.context, handle)
            for post in profile.get_posts():
                if post.date > cutoffdate:
                    photo_caption = post.accessibility_caption if post.accessibility_caption is not None else ""
                    caption = post.caption if post.caption is not None else ""
                    posts_data.append({
                        'url': post.shortcode,
                        'likes': post.likes,
                        'display_photo': post.url,
                        'account': handle.replace('\"','\\\"'),
                        'date': post.date,
                        'caption': caption.replace("\n",""),
                        'accessibility_caption': photo_caption.replace("\n",""),
                    })
                else:
                    break
            return posts_data
        except (instaloader.exceptions.InstaloaderException, RequestException) as e:
            retry_count += 1
            logging.error(f"Error scraping {handle} (attempt {retry_count}/{max_retries}): {str(e)}")
            if retry_count < max_retries:
                logging.info(f"Retrying {handle}...")
                time.sleep(10)  # Wait for 5 seconds before retrying
            else:
                logging.error(f"Max retries reached for {handle}. Moving to next handle.")
    return []

def scrape_instagram():
    cutoffdate = datetime.datetime.today() - datetime.timedelta(days=1)
    handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','yourwusa','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups','electriummobility','uwhiphop','uwaterloo_ksa','uw_aviation','uwaterloopm','uwmcc','uwmsa','gdscwaterloo','waterloo_ultimate','uwcheeseclub','uwstreetdance','uwmidsun','watolink_uw','uwaterlooeng','uwpokerclub','uwaterloocycling','uwaterloobsa','uw_phys_club','uw.gsa','uwcsclub','uwfintech','uwaterloosc','uwactsciclub','uwstatsclub','waterloo.frosh','wat.street','waterlooblockchain','waterloo.ai','uw_watsam','uwrealitylabs','uwafow','uwmuaythai','uw.farmsa','uw_bmsa','uwtsa','uwmariokart','uwhiphop','uw.movie.watchers','uwbeautyclub','uwteaclub','uw_urc','uw.dhamaka']
    random.shuffle(handles)
    postsDf = pd.DataFrame()

    L = instaloader.Instaloader()
    
    for handle in handles:
        logging.info(f"Scraping {handle}")
        handle_data = scrape_handle(L, handle, cutoffdate)
        if handle_data:
            postsDf = pd.concat([postsDf, pd.DataFrame(handle_data)], ignore_index=True)
        time.sleep(random.randrange(4,5))  # Delay between handles to avoid rate limiting

    return postsDf

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        result = scrape_instagram()
        print(result)
        logging.info("Scraping completed successfully.")
        return result
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    postsDf = main()

  readline_hook.enable(use_pyreadline=use_pyreadline)
2024-09-02 14:34:27,671 - INFO - Scraping uwmuaythai
2024-09-02 14:34:36,548 - INFO - Scraping techplusuw
2024-09-02 14:34:44,476 - INFO - Scraping uwafow
2024-09-02 14:34:50,731 - INFO - Scraping yourwusa
2024-09-02 14:34:58,255 - INFO - Scraping uwaterloosc
2024-09-02 14:35:04,747 - INFO - Scraping uwmidsun
2024-09-02 14:35:12,050 - INFO - Scraping uwaterloodsc
2024-09-02 14:35:20,926 - INFO - Scraping uwmcc
2024-09-02 14:35:28,493 - INFO - Scraping watolink_uw
2024-09-02 14:35:35,079 - INFO - Scraping uwhiphop
2024-09-02 14:35:40,722 - INFO - Scraping uwaterloopm
2024-09-02 14:35:47,211 - INFO - Scraping uwaterloocycling
2024-09-02 14:35:53,566 - INFO - Scraping waterloo.frosh
2024-09-02 14:36:00,173 - INFO - Scraping uwactsciclub
2024-09-02 14:36:06,457 - INFO - Scraping uwcheeseclub
2024-09-02 14:36:11,813 - INFO - Scraping socratica.info
2024-09-02 14:36:18,050 - INFO - Scraping uwaterlooeng
2024-09-02 14:36:24,763 - INFO - Sc


HTTP redirect from https://i.instagram.com/api/v1/users/web_profile_info/?username=uw_watsam to https://i.instagram.com/accounts/login/?next=/api/v1/users/web_profile_info/


2024-09-02 14:38:17,467 - ERROR - Error scraping uw_watsam (attempt 2/3): Redirected to login page. Use --login or --load-cookies.
2024-09-02 14:38:17,469 - INFO - Retrying uw_watsam...



HTTP redirect from https://i.instagram.com/api/v1/users/web_profile_info/?username=uw_watsam to https://i.instagram.com/accounts/login/?next=/api/v1/users/web_profile_info/


2024-09-02 14:38:35,034 - INFO - Scraping uw.gsa
2024-09-02 14:38:40,772 - INFO - Scraping uwmsa
2024-09-02 14:38:47,727 - INFO - Scraping uwaterloopm
2024-09-02 14:38:55,535 - INFO - Scraping uwhiphop
2024-09-02 14:39:02,523 - INFO - Scraping uwcsclub
2024-09-02 14:39:14,550 - INFO - Scraping uwstartups
2024-09-02 14:39:20,864 - INFO - Scraping uwaterloottc
2024-09-02 14:39:26,674 - INFO - Scraping itshera.co
2024-09-02 14:39:32,038 - INFO - Scraping uwblueprint
2024-09-02 14:39:40,131 - INFO - Scraping uwstreetdance
2024-09-02 14:39:45,947 - INFO - Scraping uw.farmsa
2024-09-02 14:39:51,279 - INFO - Scraping gdscwaterloo
2024-09-02 14:39:56,654 - INFO - Scraping uwtsa
2024-09-02 14:40:08,124 - INFO - Scraping uw_aviation
2024-09-02 14:40:15,585 - INFO - Scraping uwawscloud
2024-09-02 14:40:21,000 - INFO - Scraping uwbeautyclub
2024-09-02 14:40:28,178 - INFO - Scraping wat.street
2024-09-02 14:40:33,772 - INFO - Scraping uwteaclub
2024-09-02 14:40:39,410 - INFO - Scraping waterloo.ai


: 

In [None]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("instagram_raw.csv")

In [None]:
#Preliminary Screening to determine if posts contains event or not

from together import Together
import os
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

postsDf = pd.read_csv("instagram_raw.csv").replace('"','', regex=True)
postsDf["is_event"] = pd.NA
postsDf["processed_json"]=pd.NA

def check_string(input_string):
    if any(word in input_string for word in ['yes', 'Yes', 'True', 'true']):
        return True
    elif any(word in input_string for word in ['no', 'No', 'False', 'false']):
        return False
    else:
        return False  # This handles cases where none of the words are found


cnt=0
for index, row in postsDf.iterrows():
    currentjson = f"'account': '{row['account']}'; 'caption': '{row['caption']}'; 'photo_caption': '{row['accessibility_caption']}'"
    postsDf.at[index, "processed_json"] = currentjson
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": f'Does the following instagram post contain a club event with a specified time. RETURN Yes or No: {currentjson}'}],
        max_tokens=2
    )
    print(response.choices[0].message.content)
    is_event = check_string(response.choices[0].message.content)
    postsDf.at[index, "is_event"] = is_event

In [None]:
postsDf.to_csv("preliminaryProcessedInformation.csv")

In [None]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("C:/Users/david/Desktop/uw-upnext/WebScraper/basePrompt.in","r", encoding = "utf-8").read()

postsDf = pd.read_csv("C:/Users/david/Desktop/uw-upnext/WebScraper/preliminaryProcessedInformation.csv")

basePrompt = open("basePrompt.in","r", encoding = "utf-8").read()

def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    SOCIAL = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    NETWORKING = auto()
    GAMING = auto()

class Event(BaseModel):
    is_event: bool = Field(description="Whether the post contains an event")
    event_name: str = Field(description="The Name of the Event")
    event_description: str = Field(description='Concise 20 word summary of the event without time or location')
    event_categories: list[str] = Field(description='Categorize the Event into at least one or more of the following: TECH, DESIGN, SOCIAL, MUSIC, CULTURE, SPORTS, NETWORK, GAMING')
    start_time: str = Field(description="The Start time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    end_time: str = Field(description="The End time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    location: str = Field(description= "The location of event")

def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
    messages=[
        {
            "role": "system",
            "content": basePrompt,
        },
        {
            "role": "user",
            "content": "\n\n Input " + remove_emojis(inputJson) + "\n\n" + "Output",
        },
    ])
    
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as err:
            print(str(err))
            return {'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

In [None]:
postsDf = pd.read_csv("preliminaryProcessedInformation.csv")

postsDf["event_details"] = pd.NA


for index, row in postsDf.iterrows():
    if postsDf.at[index, "is_event"]== True:
        event_details = return_event_details(str(postsDf.at[index, "processed_json"]))
        print(index, event_details)
        if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
            postsDf.at[index, "is_event"] = False
            postsDf.at[index, "event_details"] = None
        else:
            postsDf.at[index, "event_details"] = event_details
    else: print(index, "no event detected")

In [None]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)