### 01 packages

In [26]:
import os
import praw
import csv
import time
from datetime import datetime, timezone
import pandas as pd
from tqdm import tqdm
import openai
import pandas as pd
import time
import json

### 02 Collect the posts from these users

In [29]:
# ---------------------- Configuration ----------------------
# Everything this step reads or writes lives under one data directory.
output_dir = "02data"
input_file = os.path.join(output_dir, "filtered_user_reviews.csv")

# ---------------------- Reddit API Initialization ----------------------
# '04secret/reddit.txt' is read positionally:
#   line 1 -> client id, line 2 -> client secret, line 3 -> user agent.
credentials_path = os.path.join('04secret', 'reddit.txt')
with open(credentials_path, 'r') as f:
    lines = f.read().splitlines()

# Take the first three lines and trim surrounding whitespace from each.
client_id, client_secret, user_agent = (lines[i].strip() for i in range(3))

# Build the PRAW client from the three credential values.
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

# ---------------------- Data Loading and Filtering ----------------------
# Read the filtered, LLM-analyzed review table into a DataFrame.
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} records from {input_file}")

# A row is kept only when all three conditions hold:
owns_car = df.has_car == "Yes"            # 'has_car' is exactly "Yes"
model_known = df.car_model.notna()        # 'car_model' is not missing (NaN)
fuel_known = df.fuel_type != "Unknown"    # 'fuel_type' is anything but "Unknown"

# Apply the combined mask and renumber the surviving rows from 0.
df = df.loc[owns_car & model_known & fuel_known].reset_index(drop=True)
print(f"After filtering, {len(df)} records remain.")

# Distinct values of the "user_id" column; these are the authors to scrape.
unique_authors = df["user_id"].unique()
print(f"Found {len(unique_authors)} unique authors.")

# ---------------------- Collecting User Posts ----------------------
# Define the output CSV file path for the user posts.
author_posts_csv = os.path.join(output_dir, 'author_posts.csv')

# Define the fieldnames (columns) for the CSV file:
#   - author: the username the submissions were fetched for.
#   - post_id, title, selftext, url, subreddit: submission identity and content.
#   - score, ups, downs, upvote_ratio, num_comments: engagement figures.
#   - created_utc: creation time formatted as 'YYYY-MM-DD HH:MM:SS' in UTC.
fieldnames = [
    'author', 'post_id', 'title', 'selftext',
    'score', 'ups', 'downs', 'upvote_ratio',
    'url', 'num_comments', 'created_utc', 'subreddit'
]

# Open the CSV file for writing (overwriting any previous run) and write the header.
with open(author_posts_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # For each unique author, attempt to scrape up to 100 of their recent posts.
    for author_name in unique_authors:
        try:
            # Create a Redditor object for the given username.
            redditor = reddit.redditor(author_name)
            # Retrieve up to 100 of the user's most recent submissions.
            submissions = redditor.submissions.new(limit=100)

            # Write each submission's details to the CSV.
            for submission in submissions:
                # Convert the UNIX timestamp to a human-readable date in UTC.
                # Passing tz explicitly matters: without it, fromtimestamp()
                # uses the machine's local timezone, which would put local
                # times into the 'created_utc' column.
                created_time = datetime.fromtimestamp(
                    submission.created_utc, tz=timezone.utc
                ).strftime('%Y-%m-%d %H:%M:%S')

                writer.writerow({
                    'author': author_name,
                    'post_id': submission.id,
                    'title': submission.title,
                    # Normalize an empty/missing body to an empty string.
                    'selftext': submission.selftext if submission.selftext else "",
                    'score': submission.score,
                    # These attributes are read defensively; None is written
                    # when the submission object does not expose them.
                    'ups': getattr(submission, 'ups', None),
                    'downs': getattr(submission, 'downs', None),
                    'upvote_ratio': getattr(submission, 'upvote_ratio', None),
                    'url': submission.url,
                    'num_comments': submission.num_comments,
                    'created_utc': created_time,
                    'subreddit': str(submission.subreddit)
                })

            print(f"Saved up to 100 posts for user: {author_name}")

        except Exception as e:
            # Best-effort scrape: report the failure for this author (e.g. banned
            # or suspended users, or user not found) and continue with the next.
            # Rows already written for this author before the error are kept.
            print(f"An error occurred for user '{author_name}': {e}")

print(f"User posts collection complete. Posts saved to {author_posts_csv}")

Loaded 4242 records from 02data/filtered_user_reviews.csv
After filtering, 2030 records remain.
Found 2030 unique authors.
Saved up to 100 posts for user: BroxigarZ
Saved up to 100 posts for user: eaglerulez
Saved up to 100 posts for user: Dannyz
Saved up to 100 posts for user: The_Exia
Saved up to 100 posts for user: LetOk8529
Saved up to 100 posts for user: iamnotcreativeDET
Saved up to 100 posts for user: ChirpyRaven
Saved up to 100 posts for user: sunshinedirt13
Saved up to 100 posts for user: cookingboy
Saved up to 100 posts for user: Zappiticas


KeyboardInterrupt: 