In [7]:
import instaloader
import pandas as pd
import datetime 
import time
import logging
from requests.exceptions import RequestException

def scrape_handle(L, handle, cutoffdate):
    max_retries = 3
    retry_count = 0
    posts_data = []

    while retry_count < max_retries:
        try:
            profile = instaloader.Profile.from_username(L.context, handle)
            for post in profile.get_posts():
                if post.date > cutoffdate:
                    photo_caption = post.accessibility_caption if post.accessibility_caption is not None else ""
                    caption = post.caption if post.caption is not None else ""
                    posts_data.append({
                        'url': post.shortcode,
                        'likes': post.likes,
                        'display_photo': post.url,
                        'account': handle.replace('\"','\\\"'),
                        'date': post.date,
                        'caption': caption.replace("\n",""),
                        'accessibility_caption': photo_caption.replace("\n",""),
                    })
                else:
                    break
            return posts_data
        except (instaloader.exceptions.InstaloaderException, RequestException) as e:
            retry_count += 1
            logging.error(f"Error scraping {handle} (attempt {retry_count}/{max_retries}): {str(e)}")
            if retry_count < max_retries:
                logging.info(f"Retrying {handle}...")
                time.sleep(10)  # Wait for 5 seconds before retrying
            else:
                logging.error(f"Max retries reached for {handle}. Moving to next handle.")
    return []

def scrape_instagram():
    cutoffdate = datetime.datetime.today() - datetime.timedelta(days=10)
    handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups','electriummobility','uwhiphop','uwaterloo_ksa','uw_aviation','uwaterloopm','uwmcc','uwmsa','gdscwaterloo','waterloo_ultimate','uwcheeseclub','uwstreetdance','uwmidsun','watolink_uw','uwaterlooeng','uwpokerclub','uwaterloocycling','uwaterloobsa','uw_phys_club','uw.gsa','uwcsclub','uwfintech','uwaterloosc','uwactsciclub','uwstatsclub']
    postsDf = pd.DataFrame()

    L = instaloader.Instaloader()
    
    for handle in handles:
        logging.info(f"Scraping {handle}")
        handle_data = scrape_handle(L, handle, cutoffdate)
        if handle_data:
            postsDf = pd.concat([postsDf, pd.DataFrame(handle_data)], ignore_index=True)
        time.sleep(1)  # Delay between handles to avoid rate limiting

    return postsDf

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    try:
        result = scrape_instagram()
        print(result)
        logging.info("Scraping completed successfully.")
        return result
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    postsDf = main()

2024-08-27 15:07:52,646 - INFO - Scraping uwengsoc
2024-08-27 15:08:00,298 - INFO - Scraping uwcsa
2024-08-27 15:08:04,605 - INFO - Scraping uw_ux
2024-08-27 15:08:09,971 - INFO - Scraping uwblueprint
2024-08-27 15:08:14,245 - INFO - Scraping uwaterlooeng
2024-08-27 15:08:19,279 - INFO - Scraping uwaterloottc
2024-08-27 15:08:24,447 - INFO - Scraping uwaterloodsc
2024-08-27 15:08:29,079 - INFO - Scraping uwaterloopm
2024-08-27 15:08:33,689 - INFO - Scraping uwmcc
2024-08-27 15:08:36,461 - INFO - Scraping gdscwaterloo
2024-08-27 15:08:38,725 - INFO - Scraping uwsmileclub
2024-08-27 15:08:43,191 - INFO - Scraping socratica.info


In [6]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("instagram_raw.csv")

NameError: name 'postsDf' is not defined

In [3]:
#Preliminary Screening to determine if posts contains event or not

from together import Together
import os
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

postsDf = pd.read_csv("instagram_raw.csv").replace('"','', regex=True)
postsDf["is_event"] = pd.NA
postsDf["processed_json"]=pd.NA

def check_string(input_string):
    if any(word in input_string for word in ['yes', 'Yes', 'True', 'true']):
        return True
    elif any(word in input_string for word in ['no', 'No', 'False', 'false']):
        return False
    else:
        return False  # This handles cases where none of the words are found


cnt=0
for index, row in postsDf.iterrows():
    currentjson = f"'account': '{row['account']}'; 'caption': '{row['caption']}'; 'photo_caption': '{row['accessibility_caption']}'"
    postsDf.at[index, "processed_json"] = currentjson
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": f'Does the following instagram post contain a club event with start time and end time. RETURN Yes or No: {currentjson}'}],
        max_tokens=2
    )
    print(response.choices[0].message.content)
    is_event = check_string(response.choices[0].message.content)
    postsDf.at[index, "is_event"] = is_event

KeyboardInterrupt: 

In [49]:
postsDf.to_csv("preliminaryProcessedInformation.csv")

In [50]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("C:/Users/david/Desktop/uw-upnext/WebScraper/basePrompt.in","r", encoding = "utf-8").read()

postsDf = pd.read_csv("C:/Users/david/Desktop/uw-upnext/WebScraper/preliminaryProcessedInformation.csv")

basePrompt = open("basePrompt.in","r", encoding = "utf-8").read()

def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    SOCIAL = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    NETWORKING = auto()
    GAMING = auto()

class Event(BaseModel):
    return_id: str = Field(description="Return id of event")
    is_event: bool = Field(description="Whether the post contains an event")
    event_name: str = Field(description="The Name of the Event")
    event_description: str = Field(description='Concise 20 word summary of the event without time or location')
    event_categories: list[str] = Field(description='Categorize the Event into one of the following: TECH, DESIGN, SOCIAL, MUSIC, CULTURE, SPORTS, NETWORK, GAMING')
    start_time: str = Field(description="The Start time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    end_time: str = Field(description="The End time of Event in the format: yyyy-mm-ddTHH:MM:SS+00:00")
    location: str = Field(description= "The location of event")

def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
    messages=[
        {
            "role": "system",
            "content": basePrompt,
        },
        {
            "role": "user",
            "content": "\n\n Input " + remove_emojis(inputJson) + "\n\n" + "Output",
        },
    ])
    
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as err:
            print(str(err))
            return {'return_id': index, 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

In [54]:
import time
postsDf = pd.read_csv("preliminaryProcessedInformation.csv")

postsDf["event_details"] = pd.NA


for index, row in postsDf.iterrows():
    if postsDf.at[index, "is_event"]== True:
        event_details = return_event_details(str(postsDf.at[index, "processed_json"]))
        print(event_details)
        if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
            postsDf.at[index, "is_event"] = False
            postsDf.at[index, "event_details"] = None
        else:
            postsDf.at[index, "event_details"] = event_details
    else: print("no event detected")

no event detected
no event detected
no event detected
no event detected
{'return_id': 'uwblueprint', 'is_event': True, 'event_name': 'Celebrating Term Projects and Collaborations', 'event_description': 'Celebrating projects and collaborations', 'event_categories': ['ENTERTAINMENT', 'CULTURE'], 'start_time': 'July 29', 'end_time': '8pm', 'location': 'black and gold room in SLCD'}


In [55]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)

   Unnamed: 0.1  Unnamed: 0      account                 date  \
0             0           0     uwengsoc  2024-05-26 19:29:29   
1             1           1     uwengsoc  2024-08-26 17:33:54   
2             2           2     uwengsoc  2024-08-26 17:31:50   
3             3           3        uw_ux  2024-08-08 03:40:09   
4             4           4  uwblueprint  2024-07-28 15:28:07   

                                             caption  \
0    EngSoc Student Services Hours*Subject to change   
1  Introducing our Fall 2024 Operations Commissio...   
2  Introducing our Fall 2024 Swag Commissioner, A...   
3  🚨FALL ‘24 EXEC APPLICATIONS ARE EXTENDED TO AU...   
4  Join us in celebrating an amazing term of proj...   

                               accessibility_caption          url  likes  \
0  Photo by Waterloo Engineering Society on May 2...  C7cYYPkJEXd   40.0   
1  Introducing our Fall 2024 Operations Commissio...  C_JESKsuAcf   30.0   
2  Introducing our Fall 2024 Swag Commission