In [4]:
import instaloader
import pandas as pd
import datetime 

L = instaloader.Instaloader()
cutoffdate = datetime.datetime.today() - datetime.timedelta(days=14)

handles = ['uwengsoc','uwcsa','uw_ux','uwblueprint','uwaterlooeng','uwaterloottc','uwaterloodsc','uwaterloopm','uwmcc','gdscwaterloo','uwsmileclub','socratica.info','yourwusa','wataiteam','uwawscloud','techplusuw','itshera.co','uwstartups']

postscolumns = ['account','date','caption','accessibility_caption','hashtags']
postsDf = pd.DataFrame(columns = postscolumns)

cnt = 0

for handle in handles:
    profile = instaloader.Profile.from_username(L.context, handle)
    for post in profile.get_posts():
        if post.date > cutoffdate:
            if (post.accessibility_caption) == None:
                photo_caption = post.accessibility_caption
            else:
                photo_caption = post.accessibility_caption.replace('"','\"')
            new_row = pd.DataFrame({
                'id': cnt,
                'url': post.shortcode,
                'likes': post.likes,
                'display_photo': post.url,
                'account': [handle.replace('"','\"')],
                'date': [post.date],
                'caption': [post.caption.replace('"','\"')],
                'accessibility_caption': [photo_caption],
            })
            cnt += 1
            postsDf = pd.concat([postsDf, new_row], ignore_index=True)
        else:
            break  

print(postsDf)


  postsDf = pd.concat([postsDf, new_row], ignore_index=True)


KeyboardInterrupt: 

In [2]:

processedjson = []

for index, row in postsDf.iterrows():
    currentjson = f'"id": "{int(row["id"])}"|* "account": "{row["account"]}"|* "date": "{row["date"]}"|* "caption": "{row["caption"]}"|* "photo_caption": "{row["accessibility_caption"]}"'
    post = "{" + ',\n'.join(x for x in currentjson.replace('\n','\\n').split('|*')) + '}' 
    processedjson.append(post)

# with open("Output.txt", "w", encoding="utf-8") as textfile:
#     textfile.write(',\n\n'.join(x for x in processedjson))
#     textfile.close()

In [None]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("instagram_raw.csv")

In [None]:
from together import Together
import os
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import json
import pandas as pd
from enum import Enum, auto
import re

load_dotenv()
togetherAPI = os.getenv('TOGETHER_API')
client = Together(api_key=togetherAPI)

basePrompt = open("basePrompt.txt","r", encoding = "utf-8").read()

postsDf = pd.read_csv("instagram_raw.csv")

processedjson = []

for index, row in postsDf.iterrows():
    currentjson = f'"id": "{int(row["id"])}"|* "account": "{row["account"]}"|* "date": "{row["date"]}"|* "caption": "{row["caption"]}"|* "photo_caption": "{row["accessibility_caption"]}"'
    post = "{" + ',\n'.join(x for x in currentjson.replace('\n','\\n').split('|*')) + '}' 
    processedjson.append(post)


def remove_emojis(text):
    # Unicode ranges for emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

class Category(Enum):
    TECH = auto()
    DESIGN = auto()
    MUSIC = auto()
    ENTERTAINMENT = auto()
    CULTURE = auto()
    SPORTS = auto()
    WELLNESS = auto()
    GAMING = auto()
    Null = None


class Event(BaseModel):
    return_id: str = Field(description="return id of event")
    is_event: bool = Field(description="if the post contains an event")
    event_name: str = Field(description="name of event")
    event_description: str = Field(description='concise 10 word description of what the event is, do not include time or location')
    event_categories: str = Field(description='categorize the event into one or more of the allowed categories')
    start_time: str = Field(description="start time of event")
    end_time: str = Field(description="end time of event")
    location: str = Field(description= "location of event")

    class Config:
        use_enum_values = True 


def return_event_details(inputJson : str):
    inputJson = remove_emojis(inputJson)
    chat_completion = client.chat.completions.create(
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    response_format={"type": "json_object", "schema": Event.model_json_schema()},
        messages=[
            {
                "role": "system",
                "content": basePrompt,
            },
            {
                "role": "user",
                "content": "\n\n Input " + inputJson + "\n\n" + "Output",
            },
        ],
    )
    try: 
        created_event = json.loads(chat_completion.choices[0].message.content)
    except:
        created_event = {'return_id': index, 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    return created_event

def extract_details_with_error_handling(inputJson, index):
    attempts = 0
    max_attempts = 2
    while attempts < max_attempts:
        try: 
            created_event = return_event_details(inputJson)
            return created_event
        except Exception as e:
            attempts += 1
            print(f'Attempt {attempts} failed, {str(e)}')
    else:
        return {'return_id': index, 'is_event': False, 'event_name': None, 'start_time': None, 'end_time': None, 'location': None}
    
# print(extract_details_with_error_handling(processedjson[0], 0))

In [2]:



postsDf = pd.read_csv("instagram_raw.csv")

postsDf["is_event"] = pd.NA
postsDf["event_details"] = pd.NA

def simplify_dictionary(original_dict, keys_to_remove):
    simplified_dict = original_dict.copy()
    for key in keys_to_remove:
        simplified_dict.pop(key, None)
    return simplified_dict

for index, row in postsDf.iterrows():
    event_details = extract_details_with_error_handling(processedjson[index],index)
    print(event_details)
    if event_details["event_name"] == None or event_details["start_time"] == None or event_details["end_time"] == None:
        postsDf.at[index, "is_event"] = False
        postsDf.at[index, "event_details"] = None
    else:
        postsDf.at[index, "is_event"] = True
        simplify_dictionary(event_details, ['is_event','return_id'])
        postsDf.at[index, "event_details"] = event_details
    



In [None]:
postsDf.reset_index(drop=True) 
postsDf.to_csv("information_for_mongo.csv")

print(postsDf)