In [1]:
import instaloader
import pandas as pd
import datetime 
import time
import logging
from requests.exceptions import RequestException

def scrape_handle(L, handle, cutoffdate):
    max_retries = 3
    retry_count = 0
    posts_data = []

    while retry_count < max_retries:
        try:
            profile = instaloader.Profile.from_username(L.context, handle)
            for post in profile.get_posts():
                if post.date > cutoffdate:
                    photo_caption = post.accessibility_caption if post.accessibility_caption is not None else ""
                    caption = post.caption if post.caption is not None else ""
                    posts_data.append({
                        'url': post.shortcode,
                        'likes': post.likes,
                        'display_photo': post.url,
                        'account': handle.replace('\"','\\\"'),
                        'date': post.date,
                        'caption': caption.replace("\n",""),
                        'accessibility_caption': photo_caption.replace("\n",""),
                    })
                else:
                    break
            return posts_data
        except (instaloader.exceptions.InstaloaderException, RequestException) as e:
            retry_count += 1
            logging.error(f"Error scraping {handle} (attempt {retry_count}/{max_retries}): {str(e)}")
            if retry_count < max_retries:
                logging.info(f"Retrying {handle}...")
                time.sleep(10)  # Wait for 5 seconds before retrying
            else:
                logging.error(f"Max retries reached for {handle}. Moving to next handle.")
    return []

def scrape_instagram():
    cutoffdate = datetime.datetime.today() - datetime.timedelta(days=10)
    handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','yourwusa','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups','electriummobility','uwhiphop','uwaterloo_ksa','uw_aviation','uwaterloopm','uwmcc','uwmsa','gdscwaterloo','waterloo_ultimate','uwcheeseclub','uwstreetdance','uwmidsun','watolink_uw','uwaterlooeng','uwpokerclub','uwaterloocycling','uwaterloobsa','uw_phys_club','uw.gsa','uwcsclub','uwfintech','uwaterloosc','uwactsciclub','uwstatsclub','waterloo.frosh','wat.street','waterlooblockchain','waterloo.ai','uw_watsam','uwrealitylabs','uwafow','uwmuaythai','uw.farmsa','uw_bmsa','uwtsa','uwmariokart','uw.origins','uwhiphop','uw.movie.watchers','uwactsciclub','uwbeautyclub','uwteaclub','psa.uw','uw_urc','uw.dhamaka']
    postsDf = pd.DataFrame()

    L = instaloader.Instaloader()
    
    for handle in handles:
        logging.info(f"Scraping {handle}")
        handle_data = scrape_handle(L, handle, cutoffdate)
        if handle_data:
            postsDf = pd.concat([postsDf, pd.DataFrame(handle_data)], ignore_index=True)
        time.sleep(1)  # Delay between handles to avoid rate limiting

    return postsDf

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        result = scrape_instagram()
        print(result)
        logging.info("Scraping completed successfully.")
        return result
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    postsDf = main()

  readline_hook.enable(use_pyreadline=use_pyreadline)
2024-08-31 23:23:00,662 - INFO - Scraping uwengsoc
2024-08-31 23:23:07,338 - INFO - Scraping uwcsa
2024-08-31 23:23:10,576 - INFO - Scraping uw_ux
2024-08-31 23:23:15,860 - INFO - Scraping uwblueprint
2024-08-31 23:23:21,332 - INFO - Scraping uwaterlooeng
2024-08-31 23:23:26,400 - INFO - Scraping uwaterloottc
2024-08-31 23:23:28,538 - INFO - Scraping uwaterloodsc
2024-08-31 23:23:31,479 - INFO - Scraping uwaterloopm
2024-08-31 23:23:34,740 - INFO - Scraping uwmcc
2024-08-31 23:23:41,244 - INFO - Scraping gdscwaterloo
2024-08-31 23:23:43,449 - INFO - Scraping uwsmileclub
2024-08-31 23:23:47,275 - INFO - Scraping socratica.info
2024-08-31 23:23:51,074 - INFO - Scraping yourwusa
2024-08-31 23:23:57,326 - INFO - Scraping wataiteam
2024-08-31 23:24:04,608 - INFO - Scraping uwawscloud
2024-08-31 23:24:08,043 - INFO - Scraping techplusuw
2024-08-31 23:24:10,567 - INFO - Scraping itshera.co
2024-08-31 23:24:17,428 - INFO - Scraping uwstartu

            url  likes                                      display_photo  \
0   C_OA6KDApHH   1174  https://scontent-sea1-1.cdninstagram.com/v/t51...   
1   C_V6AADvZOw     93  https://scontent-sea1-1.cdninstagram.com/v/t51...   
2   C_V88v9OXxF     44  https://scontent-sea1-1.cdninstagram.com/v/t51...   
3   C_Vf-BkAdsr    103  https://scontent-sea1-1.cdninstagram.com/v/t51...   
4   C_TFdVgA6Sw    297  https://scontent-sea1-1.cdninstagram.com/v/t51...   
5   C_RMLamRcYQ    199  https://scontent-sea1-1.cdninstagram.com/v/t51...   
6   C_RJR6QPCUb     84  https://scontent-sea1-1.cdninstagram.com/v/t51...   
7   C_MlOd8gFf4    457  https://scontent-sea1-1.cdninstagram.com/v/t51...   
8   C_BA2e8AGK1    406  https://scontent-sea1-1.cdninstagram.com/v/t51...   
9   C_MOF5uAYoB     52  https://scontent-sea1-1.cdninstagram.com/v/t51...   
10  C_Le_jEAIR4     19  https://scontent-sea1-1.cdninstagram.com/v/t51...   
11  C_XGyF7NSvS      4  https://scontent-sea1-1.cdninstagram.com/v/t51...   

In [2]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("instagram_raw.csv")

In [3]:
#Preliminary Screening to determine if posts contains event or not

from together import Together
import os
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

postsDf = pd.read_csv("instagram_raw.csv").replace('"','', regex=True)
postsDf["is_event"] = pd.NA
postsDf["processed_json"]=pd.NA

def check_string(input_string):
    if any(word in input_string for word in ['yes', 'Yes', 'True', 'true']):
        return True
    elif any(word in input_string for word in ['no', 'No', 'False', 'false']):
        return False
    else:
        return False  # This handles cases where none of the words are found


cnt=0
for index, row in postsDf.iterrows():
    currentjson = f"'account': '{row['account']}'; 'caption': '{row['caption']}'; 'photo_caption': '{row['accessibility_caption']}'"
    postsDf.at[index, "processed_json"] = currentjson
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": f'Does the following instagram post contain a club event with a specified time. RETURN Yes or No: {currentjson}'}],
        max_tokens=2
    )
    print(response.choices[0].message.content)
    is_event = check_string(response.choices[0].message.content)
    postsDf.at[index, "is_event"] = is_event

No
Yes
No
Yes
Yes
No
Yes
No
No
No
Yes
Yes
No
No
No
No
Yes
No
No
Yes
No
No
No
Yes
No
No
Yes
Yes
Yes
No
No
No
No
No


In [4]:
postsDf.to_csv("preliminaryProcessedInformation.csv")

In [5]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("C:/Users/david/Desktop/uw-upnext/WebScraper/basePrompt.in","r", encoding = "utf-8").read()

postsDf = pd.read_csv("C:/Users/david/Desktop/uw-upnext/WebScraper/preliminaryProcessedInformation.csv")

basePrompt = open("basePrompt.in","r", encoding = "utf-8").read()

def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    SOCIAL = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    NETWORKING = auto()
    GAMING = auto()

class Event(BaseModel):
    is_event: bool = Field(description="Whether the post contains an event")
    event_name: str = Field(description="The Name of the Event")
    event_description: str = Field(description='Concise 20 word summary of the event without time or location')
    event_categories: list[str] = Field(description='Categorize the Event into at least one or more of the following: TECH, DESIGN, SOCIAL, MUSIC, CULTURE, SPORTS, NETWORK, GAMING')
    start_time: str = Field(description="The Start time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    end_time: str = Field(description="The End time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    location: str = Field(description= "The location of event")

def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
    messages=[
        {
            "role": "system",
            "content": basePrompt,
        },
        {
            "role": "user",
            "content": "\n\n Input " + remove_emojis(inputJson) + "\n\n" + "Output",
        },
    ])
    
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as err:
            print(str(err))
            return {'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

In [6]:
postsDf = pd.read_csv("preliminaryProcessedInformation.csv")

postsDf["event_details"] = pd.NA


for index, row in postsDf.iterrows():
    if postsDf.at[index, "is_event"]== True:
        event_details = return_event_details(str(postsDf.at[index, "processed_json"]))
        print(index, event_details)
        if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
            postsDf.at[index, "is_event"] = False
            postsDf.at[index, "event_details"] = None
        else:
            postsDf.at[index, "event_details"] = event_details
    else: print(index, "no event detected")

0 no event detected
1 {'is_event': True, 'event_name': 'ProjectX 2024', 'event_description': 'The world’s largest undergraduate machine learning research competition', 'event_categories': ['TECH'], 'start_time': '2024-09-01T00:00:00+00:00', 'end_time': '2024-12-31T23:59:59+00:00', 'location': 'null'}
2 no event detected
3 {'is_event': False, 'event_name': '', 'event_description': '', 'event_categories': [], 'start_time': '', 'end_time': '', 'location': ''}
4 {'is_event': False, 'event_name': '', 'event_description': '', 'event_categories': [], 'start_time': '', 'end_time': '', 'location': 'null'}
5 no event detected
6 {'is_event': True, 'event_name': 'Yourwusa Concert', 'event_description': 'Concert with Twentyfiveroses.music and Tysloyens', 'event_categories': ['MUSIC'], 'start_time': '2024-09-07T18:15:00+00:00', 'end_time': 'null', 'location': 'North Campus, Field 7'}
7 no event detected
8 no event detected
9 no event detected
10 {'is_event': True, 'event_name': 'AWS Conference', 'ev

In [7]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)

    Unnamed: 0.1  Unnamed: 0          url  likes  \
0              0           0  C_OA6KDApHH   1174   
1              1           1  C_V6AADvZOw     93   
2              2           2  C_V88v9OXxF     44   
3              3           3  C_Vf-BkAdsr    103   
4              4           4  C_TFdVgA6Sw    297   
5              5           5  C_RMLamRcYQ    199   
6              6           6  C_RJR6QPCUb     84   
7              7           7  C_MlOd8gFf4    457   
8              8           8  C_BA2e8AGK1    406   
9              9           9  C_MOF5uAYoB     52   
10            10          10  C_Le_jEAIR4     19   
11            11          11  C_XGyF7NSvS      4   
12            12          12  C_TsSiRRQ0j     37   
13            13          13  C_GwM9nSn_X     25   
14            14          14  C_WwLKjN4JB     35   
15            15          15  C_UGelxtbbl     91   
16            16          16  C_RcDz-SZ1X    142   
17            17          17  C_Qn8buStJI    104   
18          