# Import

In [28]:
import requests
import json
import pandas as pd
import os

# Constants & Parameters

In [2]:
# Environment variables the cells below cannot work without. They are checked
# up front so a missing one is reported by name, instead of surfacing as an
# opaque TypeError (None + None) here or as a failed HTTP request later.
_REQUIRED_ENV_VARS = (
    "PHAVER_GRAPHQL_ENDPOINT",
    "FIREBASE_API_URL",
    "FIREBASE_API_KEY",
    "FIREBASE_REFRESH_TOKEN",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# GraphQL endpoint that the comment requests are POSTed to
PHAVER_GRAPHQL_ENDPOINT = os.getenv("PHAVER_GRAPHQL_ENDPOINT")
# Optional: not read by any cell visible in this notebook, so it is not validated
PHAVER_PROFILE_ID = os.getenv("PHAVER_PROFILE_ID")

# Token endpoint = base URL immediately followed by the API key
# (presumably FIREBASE_API_URL already ends with the `key=` query prefix -- TODO confirm)
FIREBASE_TOKEN_URL = os.getenv("FIREBASE_API_URL") + os.getenv("FIREBASE_API_KEY")
FIREBASE_REFRESH_TOKEN = os.getenv("FIREBASE_REFRESH_TOKEN")

# Page size sent as the GraphQL `limit` variable
LIMIT_PER_REQUEST = 1000
# Upper bound (exclusive) on the offsets requested by the pagination loop
MAX_COMMENTS_REQUESTED = 5000

# Load GraphQL Query

In [36]:
# Utility function to load a GraphQL query or fragment from a file
def load_graphql_file(file_path):
    """Return the full text of a GraphQL document stored on disk.

    Args:
        file_path: Path to the ``.gql`` file to read.

    Returns:
        The file's entire content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()
    
# Read the comments query document from its .gql file
COMMENTS_QUERY = load_graphql_file(file_path='graphql/queries/CommentsQuery.gql')

# Access Token Functions

In [4]:
def request_access_token(timeout=30):
    """Exchange the configured refresh token for an access token.

    POSTs a ``refresh_token`` grant as JSON to ``FIREBASE_TOKEN_URL`` and
    returns the ``access_token`` field of the JSON response.

    Args:
        timeout: Seconds to wait for the token endpoint, passed through to
            ``requests.post``. Without a timeout, requests waits indefinitely
            on an unresponsive server.

    Returns:
        The value stored under ``access_token`` in the response body.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response JSON has no ``access_token`` field.
    """
    payload = {
        "grantType": "refresh_token",
        "refreshToken": FIREBASE_REFRESH_TOKEN
    }

    response = requests.post(FIREBASE_TOKEN_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    access_data = response.json()
    return access_data['access_token']

# Request Token

In [5]:
# Obtain the access token now; the request headers built further down embed it
access_token = request_access_token()

# Comments Request

In [39]:
# GraphQL request headers and query variables

# Filter values, named here so they can be changed without digging through the
# nested `variables` dict.
# Value each comment's `recommendationId` must equal
RECOMMENDATION_ID = "fcb35573-ed36-4282-b8eb-d4f16d84f289"
# Bound passed to the `publishedAt` `_lte` filter
# (presumably "published at or before this instant" -- confirm against the API)
PUBLISHED_AT_CUTOFF = "2024-09-18T08:23:31.063Z"

headers = {
    "Authorization": "Bearer " + access_token,
    "Content-Type": "application/json"
}

query = COMMENTS_QUERY
variables = {
    "topLevelCommentsWhere": {
        "publishedAt": {
            "_lte": PUBLISHED_AT_CUTOFF
        },
        "recommendationId": {
            "_eq": RECOMMENDATION_ID
        },
        # Only comments whose parentId is null are requested
        "parentId": {
            "_is_null": True
        }
    },
    "offset": 0,  # Start offset
    "limit": LIMIT_PER_REQUEST  # Limit per request
}



In [40]:
# Accumulator for the comments gathered across requests
all_comments = list()

In [41]:
# Function to send GraphQL request
def fetch_comments(offset, timeout=30):
    """Fetch one page of comments starting at ``offset``.

    Sends ``query`` to ``PHAVER_GRAPHQL_ENDPOINT`` with the module-level
    ``variables`` and ``headers``, overriding only the ``offset`` variable.

    Args:
        offset: Value sent as the ``offset`` query variable.
        timeout: Seconds to wait for the server, passed through to
            ``requests.post``. Without a timeout, requests waits indefinitely
            on an unresponsive server.

    Returns:
        The value found at ``data.comments`` in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        Exception: If the response body contains an ``errors`` entry; the
            exception carries that entry.
    """
    # Build a per-request copy so the shared `variables` template is never
    # mutated ('offset' keeps its original position in the dict).
    request_variables = {**variables, 'offset': offset}
    response = requests.post(
        PHAVER_GRAPHQL_ENDPOINT,
        json={'query': query, 'variables': request_variables},
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    # raise_for_status() only covers HTTP-level failures; errors reported in
    # the response body are checked separately.
    if 'errors' in data:
        raise Exception(data['errors'])
    return data['data']['comments']

# Fetch comments in pages of LIMIT_PER_REQUEST, for offsets below
# MAX_COMMENTS_REQUESTED, stopping early once a page comes back empty.
all_comments = []  # start fresh so re-running this cell does not append duplicates
for offset in range(0, MAX_COMMENTS_REQUESTED, LIMIT_PER_REQUEST):
    comments_batch = fetch_comments(offset)
    if not comments_batch:
        # Nothing at this offset: skip the remaining requests
        break
    all_comments.extend(comments_batch)

# Save comments to a JSON file
comments_json_path = 'data/phaver_x_cyber_comments_list.json'
with open(comments_json_path, 'w') as f:
    json.dump(all_comments, f, indent=4)

# Report the path actually written (the message previously named a different file)
print(f"Fetched and saved {len(all_comments)} comments to {comments_json_path}")


Fetched and saved 10 comments to comments.json


In [23]:
# Build the comments table in one chain: load the records, derive a flat
# 'username' column from each nested 'profile' value (None when the profile
# is empty or missing), then discard the nested 'profile' column.
all_comments_df = (
    pd.DataFrame(all_comments)
    .assign(
        username=lambda frame: frame['profile'].map(
            lambda profile: profile['username'] if profile else None
        )
    )
    .drop(columns='profile')
)

In [24]:
# Drop repeated comments that share the same "id"
filtered_comments_df = all_comments_df.drop_duplicates(subset=['id'])

# Boolean mask: True where the comment text contains 'cyber' (case-insensitive).
# na=False makes rows with missing content count as "no match"; without it the
# mask holds missing values and the row selections below fail.
mentions_cyber = filtered_comments_df['content'].str.contains('cyber', case=False, na=False)

# Comments that do NOT mention 'cyber', kept separately for inspection
no_cyber_comments_df = filtered_comments_df[~mentions_cyber]

# Keep only the comments that mention 'cyber'
filtered_comments_df = filtered_comments_df[mentions_cyber]

In [25]:
# Order the rows by ascending 'createdAt' value
filtered_comments_df = filtered_comments_df.sort_values('createdAt', ascending=True)

In [26]:
# Throw away the existing row labels and renumber the rows 0..n-1 in their
# current order
filtered_comments_df = filtered_comments_df.reset_index(level=None, drop=True)

In [29]:
# Export the table, including its row index, to an Excel workbook
filtered_comments_df.to_excel(
    'data/phaver_x_cyber_comments_list.xlsx',
    engine='xlsxwriter',
    index=True,
)

# Search Someone

In [30]:
# Show every row whose 'username' is exactly 'maximecharriere'
filtered_comments_df.loc[filtered_comments_df['username'].eq('maximecharriere')]

Unnamed: 0,id,content,createdAt,username
