### 01 packages

In [20]:
import os
import praw
import csv
import time
from datetime import datetime, timezone

### 02 Reddit API

In [23]:
# ----------------------------------------------------------------------------------
# Read Reddit API credentials from the '04secret/reddit.txt' file.
# The file must contain at least three lines, in this order:
#   1. client_id
#   2. client_secret
#   3. user_agent
# Any lines after the third are ignored.
credentials_path = os.path.join('04secret', 'reddit.txt')
# Explicit encoding so the read does not depend on the platform's locale default.
with open(credentials_path, 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

# Fail early with a descriptive message instead of a bare IndexError when the
# file is truncated or empty.
if len(lines) < 3:
    raise ValueError(
        f"Expected 3 lines (client_id, client_secret, user_agent) in "
        f"'{credentials_path}', found {len(lines)}."
    )

# Surrounding whitespace is stripped from every value.
client_id = lines[0].strip()
client_secret = lines[1].strip()
user_agent = lines[2].strip()

# Initialize the Reddit object using PRAW with the provided credentials.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

### 03 user collection

In [26]:
# Earliest creation time accepted by the collector, as a UNIX timestamp:
# 2014-01-01 00:00:00 UTC. Anything created before this instant is excluded.
MIN_TIMESTAMP = 1_388_534_400

def is_recent(created_utc):
    """
    Tell whether a creation time falls inside the collection window.

    Args:
        created_utc (int or float): Creation time as a UNIX timestamp in seconds.

    Returns:
        bool: True when the timestamp is at or after MIN_TIMESTAMP (the bound
              is inclusive), False when it is earlier.
    """
    return MIN_TIMESTAMP <= created_utc

def get_submission_description(submission):
    """
    Build a post's text description from its title and selftext.

    Both fields are stripped of surrounding whitespace; a missing or empty
    field counts as absent. The parts that remain are joined with a single
    newline, title first.

    Args:
        submission (praw.models.Submission): A Reddit submission (post); only
            its ``title`` and ``selftext`` attributes are read.

    Returns:
        str: "<title>\\n<selftext>" when both parts are non-empty, the single
             non-empty part when only one is, and "" when neither is.
    """
    raw_parts = (submission.title, submission.selftext)
    cleaned_parts = [part.strip() if part else "" for part in raw_parts]
    # Dropping the empty parts before joining avoids a stray leading or
    # trailing newline when only one of the two fields has content.
    return "\n".join(part for part in cleaned_parts if part)

def search_posts(keywords, subreddits, limit=1000):
    """
    Collect recent posts from several subreddits by running keyword searches.

    Every keyword is searched in every subreddit through the module-level
    ``reddit`` client, with results sorted by 'new'. A post is kept only when
    ``is_recent`` accepts its creation time and ``get_submission_description``
    returns a non-empty text. No de-duplication happens here, so a post that
    matches several keywords is returned once per match.

    If a search raises, the error is printed and the loop moves on to the next
    keyword; records gathered before the error are kept.

    Args:
        keywords (list): Search phrases, e.g. "Tesla Model 3 review", "bought car".
        subreddits (list): Subreddit names to search in, e.g. ["cars", "teslamotors"].
        limit (int): Maximum number of submissions requested per keyword per subreddit.

    Returns:
        list: One dictionary per kept post, with the keys:
              - "user_id": The author's username, or "Unknown" when unavailable.
              - "comment_time": Creation time in UTC, formatted 'YYYY-MM-DD HH:MM:SS'.
              - "description": Combined text from the title and selftext.
    """
    collected = []
    for sub in subreddits:
        print(f"Searching posts in subreddit: {sub}")
        community = reddit.subreddit(sub)
        for keyword in keywords:
            print(f"  Searching posts with keyword: {keyword}")
            try:
                matches = community.search(keyword, sort='new', limit=limit)
                for post in matches:
                    # Skip anything created before the collection window.
                    if not is_recent(post.created_utc):
                        continue

                    # The author can be missing; fall back to a placeholder name.
                    user_id = str(post.author) if post.author else "Unknown"

                    # Timezone-aware conversion keeps the string in UTC
                    # regardless of the machine's local timezone.
                    created_at = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
                    comment_time = created_at.strftime('%Y-%m-%d %H:%M:%S')

                    text = get_submission_description(post)
                    if not text:
                        # Nothing to record for a post without any text.
                        continue

                    collected.append({
                        "user_id": user_id,
                        "comment_time": comment_time,
                        "description": text
                    })
            except Exception as err:
                # Best effort: report the failure and continue with the next keyword.
                print(f"Error searching posts in subreddit '{sub}' with keyword '{keyword}': {err}")
    return collected

def _longest_record_per_user(records):
    """
    Reduce records to one per user_id, keeping the longest description.

    Args:
        records (iterable): Dictionaries with at least the keys "user_id"
            and "description".

    Returns:
        dict: Maps each user_id to its chosen record. Users appear in the
              order they were first seen; on a length tie the earlier record
              is kept.
    """
    best = {}
    for record in records:
        user_id = record["user_id"]
        current = best.get(user_id)
        # Strict '>' means an equally long later record never replaces an earlier one.
        if current is None or len(record["description"]) > len(current["description"]):
            best[user_id] = record
    return best


def _write_records_csv(records, output_file):
    """
    Write records to a UTF-8 CSV file with a header row.

    Args:
        records (iterable): Dictionaries with exactly the keys "user_id",
            "comment_time" and "description".
        output_file (str): Path of the CSV file to create or overwrite.
    """
    fieldnames = ["user_id", "comment_time", "description"]
    # newline='' lets the csv module control line endings itself.
    with open(output_file, "w", newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)


def main():
    """
    Search the target subreddits for car-review posts and save one row per user.

    Runs every keyword against every target subreddit, keeps the longest
    description per user_id, and writes the result to '02data/user_reviews.csv'
    (the directory is created if needed). Progress is printed to stdout.
    """
    # Define an expanded set of keywords for car reviews and purchase experiences.
    # These keywords include both specific models and generic phrases.
    keywords = [
        "Tesla Model 3 review",
        "Tesla Model S review",
        "bought Tesla Model 3",
        "bought Tesla Model S",
        "Nissan Leaf review",
        "bought Nissan Leaf",
        "BMW 3 Series review",
        "bought BMW 3 Series",
        "Audi A4 review",
        "bought Audi A4",
        "electric car review",
        "bought a car",
        "purchased a car",
        "new car review",
        "my car review",
        "car purchase experience",
        "auto review",
        "vehicle review"
    ]

    # Define target subreddits for searching. This list can be expanded to include more active automotive communities.
    target_subreddits = [
        "cars",
        "teslamotors",
        "electricvehicles",
        "CarReviews",
        "AutoDetailing"  # Add additional automotive-related subreddits as needed.
    ]

    print("Starting search for car review posts...")
    posts_results = search_posts(keywords, target_subreddits, limit=1000)
    # This count includes repeats: a post matched by several keywords is counted each time.
    print(f"Total posts found: {len(posts_results)}")

    # Combine records from all searches. For duplicate user_ids, keep the one with the longest description.
    # NOTE(review): search_posts labels every post without an author as "Unknown",
    # so all such posts collapse into a single row here. Confirm this is intended.
    unique_user_records = _longest_record_per_user(posts_results)

    # Ensure the output directory exists.
    output_dir = "02data"
    os.makedirs(output_dir, exist_ok=True)

    # Write the unique user records to a CSV file in the '02data/' directory.
    output_file = os.path.join(output_dir, "user_reviews.csv")
    _write_records_csv(unique_user_records.values(), output_file)

    print(f"Saved {len(unique_user_records)} unique user records to {output_file}")

if __name__ == "__main__":
    main()

Starting search for car review posts...
Searching posts in subreddit: cars
  Searching posts with keyword: Tesla Model 3 review
  Searching posts with keyword: Tesla Model S review
  Searching posts with keyword: bought Tesla Model 3
  Searching posts with keyword: bought Tesla Model S
  Searching posts with keyword: Nissan Leaf review
  Searching posts with keyword: bought Nissan Leaf
  Searching posts with keyword: BMW 3 Series review
  Searching posts with keyword: bought BMW 3 Series
  Searching posts with keyword: Audi A4 review
  Searching posts with keyword: bought Audi A4
  Searching posts with keyword: electric car review
  Searching posts with keyword: bought a car
  Searching posts with keyword: purchased a car
  Searching posts with keyword: new car review
  Searching posts with keyword: my car review
  Searching posts with keyword: car purchase experience
  Searching posts with keyword: auto review
  Searching posts with keyword: vehicle review
Searching posts in subreddit