### Packages

In [44]:
import datetime
import json
import os
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import openai
import pandas as pd
import praw
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
# Silence Python warnings so the cell outputs stay readable.
# NOTE(review): this hides *every* warning category, including deprecation
# notices from pandas/openai — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Fetch the NLTK resources this notebook relies on. `quiet=True` keeps the
# download log out of the notebook; a failed fetch is still reported, but it is
# not fatal because a previously downloaded local copy may already be present.
for nltk_resource in ('punkt', 'stopwords', 'vader_lexicon'):
    if not nltk.download(nltk_resource, quiet=True):
        print(f"Could not download NLTK resource '{nltk_resource}'; relying on any local copy.")

[nltk_data] Downloading package punkt to /Users/yy5914/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yy5914/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yy5914/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### LLM API

In [47]:
# Read the ChatGPT API key from a local file so the secret never appears in the notebook.
with open("secret.txt", "r") as key_file:
    openai.api_key = key_file.read().strip()  # drop the trailing newline / surrounding whitespace

# Fail here, next to the cause, instead of with an authentication error on the
# first API request much later in the notebook.
if not openai.api_key:
    raise ValueError("secret.txt is empty; expected it to contain the OpenAI API key.")

### Profiling

In [67]:
### Define Utility Functions

def clean_text(text):
    """
    Normalize free text into a space-separated string of content tokens.

    Steps, in order:
    - convert to lowercase,
    - collapse every whitespace run (including newlines) into a single space,
    - delete every character that is neither a word character nor whitespace
      (underscores are word characters and therefore survive),
    - tokenize with NLTK's `word_tokenize`,
    - drop tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    text : str or None
        Text to clean. `None`, the empty string, any other falsy value, and a
        float NaN (how pandas represents a missing cell) are treated as "no text".

    Returns
    -------
    str
        The remaining tokens joined by single spaces; "" when there is no text.
    """
    # A NaN read from a DataFrame is truthy, so it would slip past the falsy
    # check below and crash on `.lower()`. NaN is the only float that is not
    # equal to itself, which lets us detect it without extra imports.
    if isinstance(text, float) and text != text:
        return ""
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    # NOTE(review): punctuation is stripped *before* stop-word filtering, so a
    # contraction such as "don't" reaches the filter as "dont"; whether that
    # form is still recognised depends on the stop-word list — verify.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

def remove_markdown_formatting(text):
    """
    Strip a surrounding markdown code fence from `text`.

    The input is trimmed and split into lines. When the first line begins
    with ``` (optionally followed by a language tag) that line is discarded,
    and if the last remaining line (ignoring its indentation) also begins
    with ``` it is discarded too. Text that does not open with a fence is
    only trimmed and has its line breaks normalized to "\n".

    Returns the resulting text, trimmed of leading/trailing whitespace.
    """
    fence = "```"
    body = text.strip().splitlines()

    # Nothing to unwrap: no content at all, or the text does not open with a fence.
    if not body or not body[0].startswith(fence):
        return "\n".join(body).strip()

    # Drop the opening fence line (this also removes any language tag on it).
    body = body[1:]
    # Drop the closing fence only when one is actually present.
    if body and body[-1].strip().startswith(fence):
        body.pop()
    return "\n".join(body).strip()

# ---------------------- Input / Output Locations ----------------------
# Both CSV files live in the same data directory: the aggregated posts are read
# from it and the generated profiles are written back to it.
data_dir = '02data'
author_posts_csv, user_profiles_csv = (
    os.path.join(data_dir, file_name)
    for file_name in ('author_posts.csv', 'user_profiles_llm.csv')
)

# ---------------------- Load Aggregated Author Posts Data ----------------------
df_authors = pd.read_csv(author_posts_csv)
print(f"Loaded {len(df_authors)} posts from {author_posts_csv}")

# Attach to every row the number of posts its author has in the file, then keep
# only the rows whose author has exactly 100 posts. Rows with a missing author
# get no count (NaN) and are therefore excluded by the comparison.
posts_per_author = df_authors['author'].map(df_authors['author'].value_counts())
has_exactly_100 = posts_per_author == 100
authors_with_100 = df_authors.loc[has_exactly_100, 'author'].unique()

# Restrict df_authors to those authors and renumber the rows from 0.
df_authors = df_authors[has_exactly_100].reset_index(drop=True)
print(f"After filtering, {len(df_authors)} posts remain for authors with exactly 100 posts.")

# ---------------------- Reddit API Initialization ----------------------
# Read Reddit API credentials from '04secret/reddit.txt'. The file is read
# positionally: line 1 = client id, line 2 = client secret, line 3 = user agent.
credentials_path = os.path.join('04secret', 'reddit.txt')
with open(credentials_path, 'r') as f:
    lines = f.read().splitlines()

# Report a malformed credentials file explicitly instead of failing with a bare
# IndexError on one of the positional lookups.
if len(lines) < 3:
    raise ValueError(
        f"{credentials_path} must contain three lines "
        f"(client id, client secret, user agent); found {len(lines)}."
    )
client_id, client_secret, user_agent = (line.strip() for line in lines[:3])

# Initialize the Reddit object using PRAW with the provided credentials.
# NOTE(review): `reddit` is not referenced by the profile-generation code in
# this cell — presumably it is used by other cells; confirm it is still needed.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# ---------------------- Generate User Profiles via LLM Analysis ----------------------
# One OpenAI request is made per author, so this section is slow and costs API
# credit. NOTE: profiles are only kept in memory while the loop runs; they are
# written to disk after the loop, so interrupting the cell loses them.

# Metric keys requested from the LLM; each one becomes its own field in the
# profile record, in this order.
METRIC_KEYS = (
    "interest",
    "attitude",
    "technical_expertise",
    "adoption_readiness",
    "engagement",
    "communication_clarity",
)
# Titles shorter than this many characters (after cleanup) are ignored.
MIN_TITLE_LENGTH = 20
# Author labels that denote a removed account. "[deleted]" is the placeholder
# Reddit itself uses; "deleted" is the label the original check looked for.
DELETED_AUTHOR_LABELS = {"deleted", "[deleted]"}
# Summary stored when the LLM call or the parsing of its answer fails.
PROFILE_ERROR_SUMMARY = "Error generating profile."


def _format_title_line(title, created_utc):
    """Return 'Time: <timestamp> | <title>' for one post, or None if the cleaned title is too short."""
    text = "" if pd.isna(title) else str(title)
    # Remove markdown links first: stripping bare URLs first would eat the
    # "http...)" part of "[text](http...)" and leave a dangling "[text](".
    text = re.sub(r'\[[^\]]+\]\([^\)]+\)', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    if len(text) < MIN_TITLE_LENGTH:
        return None
    # Convert created_utc (UNIX timestamp) to human-readable format. The value is
    # rendered in UTC so the prompt does not depend on the timezone of the
    # machine running the notebook.
    try:
        timestamp = datetime.datetime.fromtimestamp(
            float(created_utc), tz=datetime.timezone.utc
        ).strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        # Best effort: a missing or malformed timestamp must not drop the title.
        timestamp = "N/A"
    return f"Time: {timestamp} | {text}"


def _build_profile_prompt(combined_titles):
    """Return the LLM prompt asking for a JSON profile based on the given headline lines."""
    return (
        "You are an expert researcher analyzing social media behavior related to electric vehicles (EVs). "
        "Given the following list of post headlines with timestamps by a user, generate both a concise user profile summary "
        "and EV-specific quantitative metrics. Please output a valid JSON object with exactly two keys: "
        "'profile_summary' and 'quantitative_metrics'.\n\n"
        "The 'profile_summary' should be a concise summary (approximately 150 words) describing the user's interests, attitudes, "
        "and discussion topics related to EVs, including opinions about EV brands, charging behavior, adoption concerns, "
        "and any notable patterns.\n\n"
        "The 'quantitative_metrics' should be a JSON object with the following keys (with values on a scale from 0 to 10):\n"
        "  - 'interest': Frequency and depth of EV topic discussions.\n"
        "  - 'attitude': Overall sentiment toward EVs (0: extremely negative, 10: extremely positive).\n"
        "  - 'technical_expertise': Technical sophistication in EV discussions.\n"
        "  - 'adoption_readiness': Likelihood or readiness to adopt/have adopted an EV.\n"
        "  - 'engagement': Activity and engagement level in EV-related discussions.\n"
        "  - 'communication_clarity': Clarity and coherence in communication.\n\n"
        "Do not include any additional text outside the JSON object.\n\n"
        "Post Headlines with Timestamps:\n"
        f"{combined_titles}\n\n"
        "JSON Output:"
    )


def _request_profile(author, prompt):
    """Ask the LLM for a profile; return (profile_summary, quantitative_metrics dict), with error defaults on failure."""
    # Bound before the try block so the JSON error handler can always print it,
    # and never prints the answer left over from a previous author.
    raw_output = ""
    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",  # Change to "gpt-4" if available and desired.
            messages=[
                {"role": "system", "content": "You are an expert analyst."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=400,
            temperature=0.7,
        )
        raw_output = response.choices[0].message.content.strip()
        # Remove markdown formatting if present.
        raw_output = remove_markdown_formatting(raw_output)
        json_output = json.loads(raw_output)
        profile_summary = json_output.get("profile_summary", "")
        quantitative_metrics = json_output.get("quantitative_metrics", {})
    except json.JSONDecodeError as e:
        print(f"JSON decode error for user {author}: {e}")
        print(f"Raw output was: {raw_output}")
        return PROFILE_ERROR_SUMMARY, {}
    except Exception as e:
        print(f"Error generating profile for user {author}: {e}")
        return PROFILE_ERROR_SUMMARY, {}

    # The model may answer with valid JSON whose metrics are null, a list or a
    # string. Without this check the per-metric lookups in the caller would
    # raise outside any try block and abort the whole run.
    if not isinstance(quantitative_metrics, dict):
        print(f"Unexpected 'quantitative_metrics' for user {author}: {quantitative_metrics!r}")
        quantitative_metrics = {}
    return profile_summary, quantitative_metrics


grouped = df_authors.groupby('author')
user_profiles_data = []

# Process each author's posts to create a profile.
for author, group in grouped:
    # Skip if author is missing or labeled as deleted.
    if pd.isna(author) or str(author).lower() in DELETED_AUTHOR_LABELS:
        continue

    # One "Time: ... | title" line per post whose cleaned title is long enough.
    # `row.get` keeps a missing 'created_utc' column non-fatal (timestamp "N/A").
    candidate_lines = (
        _format_title_line(row['title'], row.get('created_utc'))
        for _, row in group.iterrows()
    )
    user_titles = [line for line in candidate_lines if line is not None]

    if not user_titles:
        continue

    # The prompt receives the headline lines as-is, one per line.
    combined_titles = "\n".join(user_titles)
    prompt = _build_profile_prompt(combined_titles)
    profile_summary, quantitative_metrics = _request_profile(author, prompt)

    # Construct profile data for this user, including aggregated statistics and
    # the metrics unpacked into separate fields (None when the LLM omitted one).
    profile_data = {
        "author": author,
        "total_posts": len(group),
        "avg_score": group["score"].mean() if "score" in group.columns else None,
        "profile_summary": profile_summary,
        **{metric: quantitative_metrics.get(metric) for metric in METRIC_KEYS},
    }
    user_profiles_data.append(profile_data)
    print(f"Processed profile for user: {author}")

# ---------------------- Save User Profiles to CSV ----------------------
# One row per processed author; the columns follow the keys of the profile records.
df_profiles = pd.DataFrame(user_profiles_data)
df_profiles.to_csv(user_profiles_csv, index=False)
print(f"Saved user profiles with LLM summaries and quantitative metrics to {user_profiles_csv}")

# Preview the first rows. Left as the cell's last expression so the notebook
# renders it as a table instead of the truncated plain text print() produces.
df_profiles.head(5)

KeyboardInterrupt: 