In [13]:
import google.generativeai as genai

from llama_index.llms.gemini import Gemini
from llama_index.core import PromptTemplate
import tiktoken

import pickle
import os
import json

In [18]:
# Read the API key from the environment instead of embedding it in the
# notebook, so the secret never ends up in version control or shared copies.
# NOTE(review): a key was previously hardcoded in this cell; treat that key
# as compromised and rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Fail early with an actionable message rather than on the first API call.
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=GOOGLE_API_KEY)

In [34]:
# Fetch the metadata record for the Gemini model used in this notebook.
model_info = genai.get_model("models/gemini-1.5-flash")

# Print the model's input and output token limits; together they bound
# the model's "context window".
print(f"{model_info.input_token_limit=}")
print(f"{model_info.output_token_limit=}")
# The recorded run of this cell printed input_token_limit=1000000 and
# output_token_limit=8192 for this model.

model_info.input_token_limit=1000000
model_info.output_token_limit=8192


In [4]:
# Load the JSON file
def load_reddit_data(json_file):
    """Read a UTF-8 encoded JSON file and return its parsed contents.

    Args:
        json_file: Path of the JSON file to open.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(json_file, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

In [10]:
# Extract relevant data from the JSON
def extract_reddit_text(data):
    """Render each Reddit post as one plain-text block.

    Args:
        data: Iterable of post dicts. Each post must provide ``title`` and
            ``selftext``. ``comments`` is optional and may be missing,
            ``None`` or empty. A comment is either a dict with
            ``comment_body`` and ``comment_score`` keys, or any other value,
            which is rendered as-is via string formatting.

    Returns:
        list[str]: One string per post, in input order, of the form
        ``"Title: ...\\nBody: ...\\nComments:\\n- ...\\n"``.

    Raises:
        KeyError: If a post lacks ``title``/``selftext`` or a dict comment
            lacks ``comment_body``/``comment_score``.
    """
    extracted_data = []
    for post in data:
        lines = [
            f"Title: {post['title']}",
            f"Body: {post['selftext']}",
            "Comments:",
        ]
        # `or []` also covers an explicit null: `.get("comments", [])` would
        # return None when the key is present with a null value, and
        # iterating None raises TypeError.
        for comment in post.get("comments") or []:
            if isinstance(comment, dict):  # Comment carries a body and a score
                lines.append(
                    f"- {comment['comment_body']} (Score: {comment['comment_score']})"
                )
            else:
                lines.append(f"- {comment}")
        # Every line, including the last, is newline-terminated.
        extracted_data.append("\n".join(lines) + "\n")
    return extracted_data

In [14]:
# Calculate tokens using the tokenizer
def calculate_tokens(text_list, model_name="gemini-1.5-flash", fallback_encoding="cl100k_base"):
    """Count the total number of tokens across a list of texts using tiktoken.

    tiktoken only maps OpenAI model names to encodings, so
    ``encoding_for_model`` raises ``KeyError`` for other names, including
    the default Gemini model. In that case the count falls back to
    ``fallback_encoding`` and is only an approximation of what the model
    itself would count.

    Args:
        text_list: Iterable of strings to tokenize.
        model_name: Model whose tokenizer should be used, if tiktoken knows it.
        fallback_encoding: tiktoken encoding name used when ``model_name``
            is unknown to tiktoken.

    Returns:
        int: Sum of token counts over all texts; 0 for an empty list.
    """
    import warnings  # local import: only needed on the fallback path

    try:
        tokenizer = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Warn so an approximate count is never mistaken for an exact one.
        warnings.warn(
            f"tiktoken has no tokenizer for model {model_name!r}; "
            f"using {fallback_encoding!r}, so token counts are approximate.",
            stacklevel=2,
        )
        tokenizer = tiktoken.get_encoding(fallback_encoding)

    return sum(len(tokenizer.encode(text)) for text in text_list)

In [53]:
# Generate a summary using an LLM
def generate_summary(extracted_data, context_question, model_name="models/gemini-1.5-flash"):
    """Ask a Gemini model to summarize Reddit posts with respect to a question.

    All posts are concatenated into a single prompt and sent in one
    ``generate_content`` call; no check is made against the model's input
    token limit.

    Args:
        extracted_data: Iterable of per-post text blocks to include.
        context_question: Question the summary and sentiment conclusion
            should address.
        model_name: Name of the generative model to call. Defaults to the
            model this notebook was written against.

    Returns:
        The response object returned by ``generate_content`` (not a plain
        string); read its ``.text`` attribute for the generated text.
    """
    model = genai.GenerativeModel(model_name)
    header = (
        f"The following is data extracted from Reddit. Based on the question below, "
        f"generate a concise and accurate summary and final conclusion on the sentiment of the question based on whether it positive, negative or neutral:\n\n"
        f"Question: {context_question}\n\n"
    )
    # Build the post section in one pass instead of repeated string appends.
    posts = "".join(f"Post:\n{data}\n\n" for data in extracted_data)
    prompt = header + posts + "Provide a detailed summary:"
    return model.generate_content(prompt)

In [39]:
# Main function to run the pipeline
def summarize_reddit_data(json_file, question):
    """Run the whole pipeline: read the JSON file, flatten posts, summarize.

    Args:
        json_file: Path to the Reddit JSON export.
        question: Question passed through to the summary step.

    Returns:
        Whatever ``generate_summary`` returns for the extracted posts.
    """
    posts = load_reddit_data(json_file)
    post_texts = extract_reddit_text(posts)
    return generate_summary(post_texts, question)

In [54]:
%%time
# Inputs for this run: the Reddit JSON export (relative path) and the
# question the summary should answer.
reddit_json_file = 'reddit_data_with_all_comments.json'
question = 'What people think on Trump winning the election'

# Runs load -> extract -> summarize; `summary` is the model's response object.
summary = summarize_reddit_data(reddit_json_file, question)
# Printing the response object shows its full repr; use `summary.text`
# to get only the generated text.
print(summary)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "The Reddit posts analyzed express overwhelmingly negative sentiment towards a potential Trump victory in the 2024 election.  The first post focuses on the anticipated impact on women's rights, with commenters predicting a significant rollback of reproductive rights, including potential bans on abortion and various forms of birth control, as well as restrictions on divorce.  The overall tone is one of fear and outrage, with many expressing anxieties about losing hard-won freedoms and a return to a more patriarchal society.  The second post explores the possibility of civil unrest or even civil war following the election, with a more divided opinion but a strong current of concern regarding the potential for violence and instability, particularly from the righ

In [55]:
# Show only the generated text rather than the full response object.
print(summary.text)

The Reddit posts analyzed express overwhelmingly negative sentiment towards a potential Trump victory in the 2024 election.  The first post focuses on the anticipated impact on women's rights, with commenters predicting a significant rollback of reproductive rights, including potential bans on abortion and various forms of birth control, as well as restrictions on divorce.  The overall tone is one of fear and outrage, with many expressing anxieties about losing hard-won freedoms and a return to a more patriarchal society.  The second post explores the possibility of civil unrest or even civil war following the election, with a more divided opinion but a strong current of concern regarding the potential for violence and instability, particularly from the right if a Democrat wins. The third post discusses the perceived closeness of the election despite demographic trends seemingly favoring the Democrats. Commenters attribute this closeness to low youth voter turnout, the impact of the ec