In [None]:
# Mount Google Drive so that later cells can read and write CSV files
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install openai==0.28



## Data Collection

In [None]:
import os
import time
from datetime import datetime, timezone
from getpass import getpass

import pandas as pd
import praw

# Setup Reddit API access
print("Initializing Reddit connection...")

try:
    # Credentials come from the environment, with an interactive prompt as a
    # fallback, so that no secret is stored in the notebook itself.
    # NOTE: the client id/secret that used to be hardcoded here should be
    # treated as leaked and regenerated in the Reddit app settings.
    reddit = praw.Reddit(
        client_id=os.environ.get("REDDIT_CLIENT_ID") or getpass("Reddit client ID: "),
        client_secret=os.environ.get("REDDIT_CLIENT_SECRET") or getpass("Reddit client secret: "),
        user_agent="script:TeslaSolarScraper:v1.0 by u/Wonderful_Check_2951",
        redirect_uri="http://localhost:8080"
    )

    # Set to read-only mode
    reddit.read_only = True

    print("Connected to Reddit API in read-only mode")

    # Test connection by getting info about the subreddit
    subreddit = reddit.subreddit("TeslaSolar")
    print(f"Accessing r/{subreddit.display_name}")

    # Fetch posts
    print("Retrieving posts...")
    posts = []

    for count, post in enumerate(subreddit.top(limit=100), start=1):
        # created_utc is a Unix timestamp; convert it explicitly in UTC so the
        # formatted string does not depend on the machine's local timezone.
        readable_date = datetime.fromtimestamp(
            post.created_utc, tz=timezone.utc
        ).strftime('%Y-%m-%d %H:%M:%S')

        posts.append({
            'date': readable_date,  # Using the formatted date string
            'unix_timestamp': post.created_utc,  # Keep the original timestamp for reference
            'title': post.title,
            'text': post.selftext,
            'score': post.score,
            'url': post.url
        })

        if count % 10 == 0:
            print(f"Retrieved {count} posts...")
        time.sleep(2)  # Be gentle with the API

    # Create DataFrame and save to CSV
    # NOTE(review): this writes to the current working directory, while the
    # preprocessing cell reads
    # '/content/drive/MyDrive/Project Tesla/tesla_solar_reddit_posts.csv' —
    # confirm the file is copied there before running the later cells.
    if posts:
        df = pd.DataFrame(posts)
        df.to_csv('tesla_solar_reddit_posts.csv', index=False)
        print(f"Successfully saved {len(posts)} posts to CSV file.")
    else:
        print("No posts were retrieved.")

except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook run.
    print(f"Error occurred: {type(e).__name__}: {e}")

## Data Preprocessing

In [None]:
import os
from getpass import getpass

import openai
import pandas as pd

# Set your OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable, with an
# interactive prompt as a fallback, so it is never saved in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
# Read the scraped posts back from Drive
posts_csv = '/content/drive/MyDrive/Project Tesla/tesla_solar_reddit_posts.csv'

df = (
    pd.read_csv(posts_csv)
    .assign(
        # Missing titles/bodies become empty strings so concatenation is safe
        title=lambda frame: frame['title'].fillna(''),
        text=lambda frame: frame['text'].fillna(''),
        # One combined field for analysis: "<title> <text>"
        content=lambda frame: frame['title'] + ' ' + frame['text'],
    )
    # Keep only posts whose combined content is longer than 10 characters
    .loc[lambda frame: frame['content'].str.len() > 10]
    # Renumber rows 0..n-1 after filtering
    .reset_index(drop=True)
)

print(f"Total posts after cleaning: {len(df)}")


Total posts after cleaning: 99


In [None]:
# Summarization helper
def summarize_post(post_text):
    """Ask the chat model for a 1-2 sentence summary of a Reddit post.

    Args:
        post_text: Text of the post; it is interpolated into the user prompt.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.
    """
    instruction = (
        "Summarize the following Reddit post into 1-2 sentences, "
        "highlighting the main issue or event:"
    )
    conversation = [
        {"role": "system", "content": "You are a helpful summarizer."},
        {"role": "user", "content": f"{instruction}\n\n{post_text}"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=0.3,
        max_tokens=60,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

# Summarize every post once; skip the (API-backed) work when the frame
# already carries a 'summary' column.
if 'summary' not in df:
    df['summary'] = df['content'].map(summarize_post)

In [None]:
def create_prompt(existing_labels, post_text, similarity_threshold=70):
    """Build the labeling prompt for a single post.

    Args:
        existing_labels: Mapping of label name -> description. When it is
            empty (or otherwise falsy) the prompt only asks for a new label.
        post_text: Text of the post to categorize.
        similarity_threshold: Percentage quoted in the instruction that tells
            the model when to reuse an existing label.

    Returns:
        The prompt string; it begins and ends with a newline.
    """
    intro = "You are a smart assistant for categorizing Reddit posts related to Tesla Solar products."
    quoted_post = f'"{post_text}"'
    # Rules that appear verbatim in both prompt variants.
    label_rules = [
        "- The label must be specific, informative, and actionable.",
        "- It should clearly summarize the main issue or event in 3 to 7 words.",
    ]
    focus_rule = "- Focus on the real problem, experience, or event being discussed."

    if existing_labels:
        catalogue = '\n'.join(
            f"- {name}: {description}" for name, description in existing_labels.items()
        )
        lines = [
            "",
            intro,
            "",
            "Existing Labels:",
            catalogue,
            "",
            "New Post:",
            quoted_post,
            "",
            "Task:",
            f"- If any existing label matches the post meaning with ≥ {similarity_threshold}% similarity, pick that label.",
            "- If no existing label matches well enough, suggest a new label.",
            *label_rules,
            '- Avoid vague labels like "Feedback" or "Update."',
            focus_rule,
            "",
            "Respond ONLY in the format:",
            "Existing Label: <label name>",
            "or",
            "New Label: <your new label suggestion>",
            "",
        ]
    else:
        # No labels yet: the model can only propose a new one.
        lines = [
            "",
            intro,
            "",
            "Task:",
            "- Suggest a new topic label for the following post.",
            *label_rules,
            '- Avoid generic labels like "Solar Discussion" or "Update."',
            focus_rule,
            "",
            "Post:",
            quoted_post,
            "",
            "Respond ONLY in the format:",
            "New Label: <your label>",
            "",
        ]

    # The leading and trailing "" entries produce the opening and closing newline.
    return "\n".join(lines)

In [None]:
# Send one prompt to GPT and return the cleaned-up answer
def ask_llm(prompt):
    """Send a single user prompt to the chat model and return its reply text.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.
    """
    request = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 50,  # short focused output
    }
    reply = openai.ChatCompletion.create(**request)
    return reply['choices'][0]['message']['content'].strip()

In [None]:
def _parse_label_reply(reply):
    """Extract the label name from a model reply, or return None.

    Only the first non-empty line is inspected, so trailing text (for example
    a second "or / New Label: ..." line) cannot leak into the label. The line
    must start with "Existing Label:" or "New Label:" and must carry a
    non-empty name after that prefix; otherwise None is returned.
    """
    first_line = next(
        (line.strip() for line in reply.splitlines() if line.strip()), ""
    )
    for prefix in ("Existing Label:", "New Label:"):
        if first_line.startswith(prefix):
            # Slice off only the leading prefix (str.replace would also remove
            # the same text if it appeared later in the reply).
            name = first_line[len(prefix):].strip()
            return name or None
    return None


# Initialize
labels = {}  # label name -> description (the first post that received it)
assigned_labels = []  # Store assigned label for each post

# Loop through posts
# NOTE(review): descriptions are full post texts and every known label is
# listed in each prompt, so the prompt grows with each new label — confirm it
# stays within the model's context limit on larger datasets.
for idx, post_text in enumerate(df['content']):
    prompt = create_prompt(labels, post_text)
    response = ask_llm(prompt)

    label_name = _parse_label_reply(response)
    if label_name is None:
        # Reply did not follow the requested format: fall back to a unique label.
        label_name = f"Label_{idx}"

    # setdefault keeps the FIRST example post as the description, and also
    # registers a label the model reported as "existing" but that is not
    # tracked yet, so later prompts can offer it.
    labels.setdefault(label_name, post_text)

    assigned_labels.append(label_name)

# Add labels to DataFrame
df['category'] = assigned_labels

print("Posts have been labeled!")


Posts have been labeled!


In [None]:
# Write the labeled posts to a new CSV on Drive (row index omitted)
labeled_csv_path = '/content/drive/MyDrive/Project Tesla/tesla_solar_reddit_posts_labeled.csv'
df.to_csv(labeled_csv_path, index=False)

print("Final CSV saved with 'category' column!")


Final CSV saved with 'category' column!


In [None]:
# Dashboard group -> raw labels that belong to it. Kept in this orientation
# so that each group's members can be read (and extended) in one place.
_dashboard_groups = {
    "System Performance Problems": [
        "Tesla Solar System Downtime Woes",
        "Tesla Solar System Performance Issues",
        "Tesla Solar System Performance Concerns",
        "Tesla Solar System Reliability Concerns",
        "Solar System Performance Concerns",
    ],
    "Powerwall Delivery/Installation Delays": [
        "Powerwall Delay Frustration",
        "Powerwall Delivery Delay Frustration",
        "Powerwall Installation Delay Frustration",
    ],
    "Positive Customer Experiences": [
        "Positive Tesla Energy Bill Experience",
        "Solar Roof Installation Completion",
        "Hurricane Resilience with Tesla Solar",
    ],
    "Customer Service Problems": [
        "Tesla Solar Service Delays",
        "Tesla Solar Customer Service Frustrations",
    ],
    "Roof Quality Issues": [
        "Roof Damage Dispute with Tesla",
        "Tesla Solar Roof Leak Issue",
    ],
    "Installation Delays": [
        "Solar System Installation Delay Frustration",
        "Tesla Solar System Installation Delays",
    ],
    "Installation Progress Updates": [
        "Solar Roof Material Arrival Update",
    ],
}

# Flatten to the lookup shape used for mapping: raw label -> dashboard group.
label_grouping = {
    raw_label: group
    for group, raw_labels in _dashboard_groups.items()
    for raw_label in raw_labels
}

# Translate each raw category into its dashboard group; a category with no
# entry in label_grouping is carried over unchanged.
df['dashboard_category'] = df['category'].map(
    lambda raw_label: label_grouping.get(raw_label, raw_label)
)

In [None]:
# Save the posts, now including the grouped 'dashboard_category' column,
# to a separate CSV on Drive (row index omitted)
df.to_csv('/content/drive/MyDrive/Project Tesla/tesla_solar_reddit_posts_GroupLabeled.csv', index=False)

# The column added since the previous save is 'dashboard_category'
print("Final CSV saved with 'dashboard_category' column!")


Final CSV saved with 'category' column!
