In [None]:
import instaloader
import pandas as pd
import datetime 

L = instaloader.Instaloader()
cutoffdate = datetime.datetime.today() - datetime.timedelta(days=14)

handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','yourwusa','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups']

postscolumns = ['account','date','caption','accessibility_caption','hashtags']
postsDf = pd.DataFrame(columns = postscolumns)

cnt = 0

for handle in handles:
    profile = instaloader.Profile.from_username(L.context, handle)
    for post in profile.get_posts():
        if post.date > cutoffdate:
            if (post.accessibility_caption) == None:
                photo_caption = post.accessibility_caption
            else:
                photo_caption = post.accessibility_caption.replace('"','\"')
            new_row = pd.DataFrame({
                'id': cnt,
                'url': post.shortcode,
                'likes': post.likes,
                'display_photo': post.url,
                'account': [handle.replace('"','\"')],
                'date': [post.date],
                'caption': [post.caption.replace('"','\"')],
                'accessibility_caption': [photo_caption],
            })
            cnt += 1
            postsDf = pd.concat([postsDf, new_row], ignore_index=True)
        else:
            break  

print(postsDf)


In [None]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("instagram_raw.csv")

In [2]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("C:/Users/david/Desktop/uw-upnext/WebScraper/basePrompt.in","r", encoding = "utf-8").read()

postsDf = pd.read_csv("C:/Users/david/Desktop/uw-upnext/WebScraper/instagram_raw.csv")

processedjson = []

for index, row in postsDf.iterrows():
    currentjson = f'"id": "{int(row["id"])}"|* "account": "{row["account"]}"|* "date": "{row["date"]}"|* "caption": "{row["caption"]}"|* "photo_caption": "{row["accessibility_caption"]}"'
    post = "{" + ',\n'.join(x for x in currentjson.replace('\n','\\n').split('|*')) + '}' 
    processedjson.append(post)

print("info loaded")


def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    MUSIC = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    WELLNESS = auto()
    GAMING = auto()

class Event(BaseModel):
    return_id: str = Field(description="return id of event")
    is_event: bool = Field(description="if the post contains an event")
    event_name: str = Field(description="name of event")
    event_description: str = Field(description='concise 20 word summary of what the event is, skip time or location')
    event_categories: str = Field(description='Categorize into one of the following: TECH, DESIGN, MUSIC, ENTERTAINMENT, CULTURE, SPORTS, WELLNESS, GAMING')
    start_time: str = Field(description="start time of event")
    end_time: str = Field(description="end time of event")
    location: str = Field(description= "location of event")



def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
        messages=[
            {
                "role": "system",
                "content": basePrompt,
            },
            {
                "role": "user",
                "content": "\n\n Input " + remove_emojis(inputJson) + "\n\n" + "Output",
            },
        ],
    )
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as err:
            print(str(err))
            return {'return_id': index, 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

info loaded


In [3]:
import time
postsDf = pd.read_csv("instagram_raw.csv")

postsDf["is_event"] = pd.NA
postsDf["event_details"] = pd.NA

for index, row in postsDf.iterrows():
    time.sleep(5)
    event_details = extract_details_with_error_handling(processedjson[index],index)
    print(event_details)
    if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
        postsDf.at[index, "is_event"] = False
        postsDf.at[index, "event_details"] = None
    else:
        postsDf.at[index, "is_event"] = True
        postsDf.at[index, "event_details"] = event_details

{'return_id': '0', 'is_event': True, 'event_name': 'FRAME Designathon', 'event_description': 'A design challenge with teams making amazing work and creative talent and ideas', 'categories': 'DESIGN', 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '1', 'is_event': True, 'event_name': 'FRAME Designathon', 'event_description': 'Meet the judges and mentors for the designathon', 'categories': 'DESIGN', 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '2', 'is_event': True, 'event_name': 'UW Blueprint Celebration', 'event_description': 'Celebrating an amazing term of projects and collaborations', 'categories': 'DESIGN', 'start_time': '2024-07-29T18:00:00+00:00', 'end_time': '2024-07-29T20:00:00+00:00', 'location': 'SLC, black and gold room'}
{'return_id': '3', 'is_event': False, 'event_name': None, 'event_description': None, 'categories': None, 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '4', 'is_event': False, 'event_name

In [5]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)

    Unnamed: 0.1  Unnamed: 0         account                 date  \
0              0           0           uw_ux  2024-07-21 14:24:54   
1              1           1           uw_ux  2024-07-20 03:02:24   
2              2           2     uwblueprint  2024-07-28 15:28:07   
3              3           3     uwblueprint  2024-07-26 23:21:02   
4              4           4     uwblueprint  2024-07-21 01:07:58   
5              5           5     uwblueprint  2024-07-18 14:44:10   
6              6           6     uwblueprint  2024-07-18 14:42:39   
7              7           7     uwblueprint  2024-07-18 14:38:51   
8              8           8     uwblueprint  2024-07-18 14:37:07   
9              9           9     uwblueprint  2024-07-18 14:29:24   
10            10          10    uwaterloottc  2024-07-26 14:53:39   
11            11          11    uwaterloottc  2024-07-25 02:44:06   
12            12          12    uwaterloodsc  2024-07-27 14:04:14   
13            13          13    uw