In [1]:
import instaloader
import pandas as pd
import datetime 

L = instaloader.Instaloader()
cutoffdate = datetime.datetime.today() - datetime.timedelta(days=14)

handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','yourwusa','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups']

postscolumns = ['account','date','caption','accessibility_caption','hashtags']
postsDf = pd.DataFrame(columns = postscolumns)

cnt = 0

for handle in handles:
    profile = instaloader.Profile.from_username(L.context, handle)
    for post in profile.get_posts():
        if post.date > cutoffdate:
            if (post.accessibility_caption) == None:
                photo_caption = post.accessibility_caption
            else:
                photo_caption = post.accessibility_caption.replace('"','\"')
            new_row = pd.DataFrame({
                'id': cnt,
                'url': post.shortcode,
                'display_photo': post.url,
                'account': [handle.replace('"','\"')],
                'date': [post.date],
                'caption': [post.caption.replace('"','\"')],
                'accessibility_caption': [photo_caption],
            })
            cnt += 1
            postsDf = pd.concat([postsDf, new_row], ignore_index=True)
            
        else:
            break  

# print(postsDf)

  readline_hook.enable(use_pyreadline=use_pyreadline)
  postsDf = pd.concat([postsDf, new_row], ignore_index=True)


In [2]:

processedjson = []

for index, row in postsDf.iterrows():
    json = f'"id": "{int(row["id"])}"|* "account": "{row["account"]}"|* "date": "{row["date"]}"|* "caption": "{row["caption"]}"|* "photo_caption": "{row["accessibility_caption"]}"'
    post = "{" + ',\n'.join(x for x in json.replace('\n','\\n').split('|*')) + '}' 
    processedjson.append(post)

# with open("Output.txt", "w", encoding="utf-8") as textfile:
#     textfile.write(',\n\n'.join(x for x in processedjson))
#     textfile.close()

In [3]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("basePrompt.txt","r", encoding = "utf-8").read()

class Event(BaseModel):
    return_id: str = Field(description="return id of event")
    is_event: bool = Field(description="if the post contains an event")
    event_name: str = Field(description="name of event")
    start_time: str = Field(description="start time of event")
    end_time: str = Field(description="end time of event")
    location: str = Field(description= "location of event") 

def return_event_details(inputJson : str):
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
        messages=[
            {
                "role": "system",
                "content": basePrompt,
            },
            {
                "role": "user",
                "content": "\n\n Input " + inputJson + "\n\n" + "Output",
            },
        ],
    )
    created_event = json.loads(chat_completion.choices[0].message.content)
    return created_event

def extract_details_with_error_handling(inputJson, index):
    attempts = 0
    max_attempts = 2
    while attempts < max_attempts:
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as e:
            attempts += 1
            print(f'Attempt {attempts} failed, {str(e)}')
    else:
        return {'return_id': index, 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[7],7))

In [6]:
postsDf["is_event"] = pd.NA
postsDf["event_details"] = pd.NA

def simplify_dictionary(original_dict, keys_to_remove):
    simplified_dict = original_dict.copy()
    for key in keys_to_remove:
        simplified_dict.pop(key, None)
    
    return simplified_dict
for index, row in postsDf.iterrows():
    event_details = extract_details_with_error_handling(processedjson[index],index)
    print(event_details)
    if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
        postsDf.at[index, "is_event"] = False
        postsDf.at[index, "event_details"] = None
    else:
        postsDf.at[index, "is_event"] = True
        simplify_dictionary(event_details, ['is_event','return_id'])
        postsDf.at[index, "event_details"] = event_details



{'return_id': '0', 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '1', 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '2', 'is_event': True, 'event_name': 'FRAME Designathon', 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '3', 'is_event': True, 'event_name': 'FRAME Designathon', 'start_time': '2024-07-20T08:30:00+00:00', 'end_time': '2024-07-20T20:00:00+00:00', 'location': 'Communitech'}
{'return_id': '4', 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
{'return_id': '5', 'is_event': True, 'event_name': 'Design Night', 'start_time': '2024-07-11T18:00:00+00:00', 'end_time': '2024-07-11T20:00:00+00:00', 'location': 'E7 4053'}
{'return_id': '6', 'is_event': True, 'event_name': 'Engineering Night', 'start_time': '2024-07-10T19:00:00+00:00', 'end_time': '2024-07-10T21:00:00+00:00', 'location': 'E7-4

In [8]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")


print(postsDf)

           account                date  \
0         uwengsoc 2024-07-15 23:32:03   
1            uw_ux 2024-07-21 14:24:54   
2            uw_ux 2024-07-20 03:02:24   
3            uw_ux 2024-07-12 22:35:12   
4      uwblueprint 2024-07-21 01:07:58   
5      uwblueprint 2024-07-18 14:44:10   
6      uwblueprint 2024-07-18 14:42:39   
7      uwblueprint 2024-07-18 14:38:51   
8      uwblueprint 2024-07-18 14:37:07   
9      uwblueprint 2024-07-18 14:29:24   
10    uwaterloottc 2024-07-25 02:44:06   
11    uwaterloodsc 2024-07-23 15:07:42   
12    uwaterloodsc 2024-07-19 00:38:35   
13    uwaterloodsc 2024-07-18 00:52:28   
14    uwaterloodsc 2024-07-15 02:53:23   
15    uwaterloodsc 2024-07-12 13:58:11   
16     uwaterloopm 2024-07-22 22:42:25   
17     uwsmileclub 2024-07-17 14:32:29   
18  socratica.info 2024-07-21 03:31:22   
19  socratica.info 2024-07-14 21:04:46   
20  socratica.info 2024-07-14 03:59:56   
21  socratica.info 2024-07-13 22:00:17   
22        yourwusa 2024-07-17 20:2