In [2]:
import instaloader
import pandas as pd
import datetime 
import time
import logging
import random
from requests.exceptions import RequestException

def scrape_handle(L, handle, cutoffdate):
    max_retries = 3
    retry_count = 0
    posts_data = []

    while retry_count < max_retries:
        try:
            profile = instaloader.Profile.from_username(L.context, handle)
            for post in profile.get_posts():
                if post.date > cutoffdate:
                    photo_caption = post.accessibility_caption if post.accessibility_caption is not None else ""
                    caption = post.caption if post.caption is not None else ""
                    posts_data.append({
                        'url': post.shortcode,
                        'likes': post.likes,
                        'display_photo': post.url,
                        'account': handle.replace('\"','\\\"'),
                        'date': post.date,
                        'caption': caption.replace("\n",""),
                        'accessibility_caption': photo_caption.replace("\n",""),
                    })
                else:
                    break
            return posts_data
        except (instaloader.exceptions.InstaloaderException, RequestException) as e:
            retry_count += 1
            logging.error(f"Error scraping {handle} (attempt {retry_count}/{max_retries}): {str(e)}")
            if retry_count < max_retries:
                logging.info(f"Retrying {handle}...")
                time.sleep(random.randrange(300, 400))  # Wait for a few minutes before retrying
            else:
                logging.error(f"Max retries reached for {handle}. Moving to next handle.")
    return []

def scrape_instagram():
    cutoffdate = datetime.datetime.today() - datetime.timedelta(days=3)
    handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','yourwusa','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups','electriummobility','uwhiphop','uwaterloo_ksa','uw_aviation','uwaterloopm','uwmcc','uwmsa','gdscwaterloo','waterloo_ultimate','uwcheeseclub','uwstreetdance','uwmidsun','watolink_uw','uwaterlooeng','uwpokerclub','uwaterloocycling','uwaterloobsa','uw_phys_club','uw.gsa','uwcsclub','uwfintech','uwaterloosc','uwactsciclub','uwstatsclub','waterloo.frosh','wat.street','waterlooblockchain','waterloo.ai','uw_watsam','uwrealitylabs','uwafow','uwmuaythai','uw.farmsa','uw_bmsa','uwtsa','uwmariokart','uwhiphop','uw.movie.watchers','uwbeautyclub','uwteaclub','uw_urc','uw.dhamaka']
    random.shuffle(handles)
    postsDf = pd.DataFrame()

    L = instaloader.Instaloader()
    
    for handle in handles:
        logging.info(f"Scraping {handle}")
        handle_data = scrape_handle(L, handle, cutoffdate)
        if handle_data:
            postsDf = pd.concat([postsDf, pd.DataFrame(handle_data)], ignore_index=True)
        time.sleep(4)  # Delay between handles to avoid rate limiting
    return postsDf

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        result = scrape_instagram()
        print(result)
        logging.info("Scraping completed successfully.")
        return result
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    postsDf = main()

2024-09-05 12:13:14,580 - INFO - Scraping gdscwaterloo
2024-09-05 12:13:20,984 - INFO - Scraping uwmsa
2024-09-05 12:13:28,411 - INFO - Scraping electriummobility
2024-09-05 12:13:36,918 - INFO - Scraping uwteaclub
2024-09-05 12:13:42,782 - INFO - Scraping wataiteam
2024-09-05 12:13:50,327 - INFO - Scraping uw.gsa
2024-09-05 12:13:57,090 - INFO - Scraping gdscwaterloo
2024-09-05 12:14:05,545 - INFO - Scraping uwawscloud
2024-09-05 12:14:12,371 - INFO - Scraping uwactsciclub
2024-09-05 12:14:20,667 - INFO - Scraping wat.street
2024-09-05 12:14:26,028 - INFO - Scraping uw_urc
2024-09-05 12:14:37,525 - INFO - Scraping uwafow
2024-09-05 12:14:45,403 - INFO - Scraping uw.farmsa
2024-09-05 12:14:50,823 - INFO - Scraping uwaterloopm
2024-09-05 12:14:56,624 - INFO - Scraping uw_phys_club
2024-09-05 12:15:02,800 - INFO - Scraping uwaterloottc
2024-09-05 12:15:09,821 - INFO - Scraping uwcsa
2024-09-05 12:15:16,282 - INFO - Scraping uw_bmsa
2024-09-05 12:15:22,571 - INFO - Scraping uwhiphop
2024-

           url  likes                                      display_photo  \
0  C_igPQpREfY    221  https://scontent-dfw5-1.cdninstagram.com/v/t39...   
1  C_hFj0CtTmS     69  https://scontent-dfw5-1.cdninstagram.com/v/t39...   
2  C_eYGJnNPZS    203  https://scontent-dfw5-1.cdninstagram.com/v/t39...   
3  C_isJ7WAT9F     15  https://scontent-dfw5-2.cdninstagram.com/v/t51...   
4  C_eR7HLg3vb     37  https://scontent-dfw5-2.cdninstagram.com/v/t51...   
5  C_d2BJKy26q     71  https://scontent-dfw5-2.cdninstagram.com/v/t39...   
6  C_hWH4eA3AX     19  https://scontent-dfw5-2.cdninstagram.com/v/t39...   
7  C_eNTU_AHXC     13  https://scontent-dfw5-2.cdninstagram.com/v/t51...   
8  C_hLFl4ACbt     48  https://scontent-dfw5-2.cdninstagram.com/v/t51...   
9  C_b5ATAgNVY     59  https://scontent-dfw5-1.cdninstagram.com/v/t51...   

            account                date  \
0             uwmsa 2024-09-05 14:39:11   
1             uwmsa 2024-09-05 01:26:49   
2             uwmsa 2024-09-04 00:

In [3]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("./Data/instagram_raw.csv")

In [4]:
#Preliminary Screening to determine if posts contains event or not

from together import Together
import os
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

postsDf = pd.read_csv("instagram_raw.csv").replace('"','', regex=True)
postsDf["is_event"] = pd.NA
postsDf["processed_json"]=pd.NA

def check_string(input_string):
    if any(word in input_string for word in ['yes', 'Yes', 'True', 'true']):
        return True
    elif any(word in input_string for word in ['no', 'No', 'False', 'false']):
        return False
    else:
        return False  # This handles cases where none of the words are found


cnt=0
for index, row in postsDf.iterrows():
    currentjson = f"'account': '{row['account']}'; 'caption': '{row['caption']}'; 'photo_caption': '{row['accessibility_caption']}'"
    postsDf.at[index, "processed_json"] = currentjson
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": f'Does the following instagram post contain a club event with a specified time. RETURN Yes or No: {currentjson}'}],
        max_tokens=2
    )
    print(response.choices[0].message.content)
    is_event = check_string(response.choices[0].message.content)
    postsDf.at[index, "is_event"] = is_event

No
No
No
Yes
Yes
Yes
Yes
No
No
No


In [5]:
postsDf.to_csv("preliminaryProcessedInformation.csv")

In [7]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("C:/Users/david/Desktop/uw-upnext/WebScraper/basePrompt.in","r", encoding = "utf-8").read()

postsDf = pd.read_csv("C:/Users/david/Desktop/uw-upnext/WebScraper/preliminaryProcessedInformation.csv")


def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    SOCIAL = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    NETWORKING = auto()
    GAMING = auto()

class Event(BaseModel):
    is_event: bool = Field(description="Whether the post contains an event")
    event_name: str = Field(description="The Name of the Event")
    event_description: str = Field(description='Concise 20 word summary of the event without time or location')
    event_categories: list[str] = Field(description='Categorize the Event into at least one or more of the following: TECH, DESIGN, SOCIAL, MUSIC, CULTURE, SPORTS, NETWORK, GAMING')
    start_time: str = Field(description="The Start time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    end_time: str = Field(description="The End time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    location: str = Field(description= "The location of event")

def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
    messages=[
        {
            "role": "system",
            "content": basePrompt,
        },
        {
            "role": "user",
            "content": "\n\n Input " + remove_emojis(inputJson) + "\n\n" + "Output",
        },
    ])
    
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as err:
            print(str(err))
            return {'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

In [8]:
postsDf = pd.read_csv("preliminaryProcessedInformation.csv")

postsDf["event_details"] = pd.NA

import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd

postsDf = pd.read_csv("instagram_raw.csv").replace('"','', regex=True)


def download_instagram_image(url, folder_path):
    # Send a GET request to the Instagram post URL
    response = requests.get(url)
    
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the image URL (this may change if Instagram updates their HTML structure)
        image_url = soup.find('meta', property='og:image')['content']
        
        if image_url:
            # Download the image
            image_response = requests.get(image_url)
            
            if image_response.status_code == 200:
                # Create the folder if it doesn't exist
                os.makedirs(folder_path, exist_ok=True)
                
                # Generate a filename from the URL
                filename = os.path.basename(urlparse(image_url).path)
                file_path = os.path.join(folder_path, filename)
                
                # Save the image
                with open(file_path, 'wb') as file:
                    file.write(image_response.content)
                
                print(f"Image saved successfully: {file_path}")
            else:
                print("Failed to download the image.")
        else:
            print("Could not find the image URL in the Instagram post.")
    else:
        print("Failed to access the Instagram post.")


for index, row in postsDf.iterrows():
    if postsDf.at[index, "is_event"]== True:
        event_details = return_event_details(str(postsDf.at[index, "processed_json"]))
        print(index, event_details)
        if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
            postsDf.at[index, "is_event"] = False
            postsDf.at[index, "event_details"] = None
        else:
            postsDf.at[index, "event_details"] = event_details
            imageurl = row["display_photo"]
            postID = row["url"]
            filepath = f'./public/InstagramImages/{postID}'
            download_instagram_image(imageurl, postID)
            break
    else: print(index, "no event detected")

0 no event detected
1 no event detected
2 no event detected
3 {'is_event': True, 'event_name': 'University of Waterloo Table Tennis Club Fall 2024', 'event_description': 'Table Tennis Club resumes for Fall 2024 term', 'event_categories': ['SPORTS'], 'start_time': '2024-09-08T14:30:00+00:00', 'end_time': '2024-09-08T17:00:00+00:00', 'location': 'null'}
4 {'is_event': False, 'event_name': '', 'event_description': '', 'event_categories': [], 'start_time': '', 'end_time': '', 'location': ''}
5 {'is_event': True, 'event_name': 'Intro to Verkada', 'event_description': 'Intro to Verkada with free boba and swag', 'event_categories': ['TECH'], 'start_time': '2024-09-09T18:00:00+00:00', 'end_time': '2024-09-09T19:00:00+00:00', 'location': 'E7 5353'}
6 {'is_event': False, 'event_name': '', 'event_description': '', 'event_categories': [], 'start_time': '', 'end_time': '', 'location': ''}
7 no event detected
8 no event detected
9 no event detected


In [9]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)

   Unnamed: 0.1  Unnamed: 0          url  likes  \
0             0           0  C_igPQpREfY    221   
1             1           1  C_hFj0CtTmS     69   
2             2           2  C_eYGJnNPZS    203   
3             3           3  C_isJ7WAT9F     15   
4             4           4  C_eR7HLg3vb     37   
5             5           5  C_d2BJKy26q     71   
6             6           6  C_hWH4eA3AX     19   
7             7           7  C_eNTU_AHXC     13   
8             8           8  C_hLFl4ACbt     48   
9             9           9  C_b5ATAgNVY     59   

                                       display_photo           account  \
0  https://scontent-dfw5-1.cdninstagram.com/v/t39...             uwmsa   
1  https://scontent-dfw5-1.cdninstagram.com/v/t39...             uwmsa   
2  https://scontent-dfw5-1.cdninstagram.com/v/t39...             uwmsa   
3  https://scontent-dfw5-2.cdninstagram.com/v/t51...      uwaterloottc   
4  https://scontent-dfw5-2.cdninstagram.com/v/t51...        uwmuayth