In [1]:
import pandas as pd
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer


In [2]:
# import datasets generated from scraper
posts_df = pd.read_csv('leica_subreddit_posts.csv')
comments_df = pd.read_csv('leica_subreddit_comments.csv')

In [3]:
# initialize tools
lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()

In [4]:
# define stop words
stop_words = set(stopwords.words('english'))
important_stopwords = {'not', 'no', 'very', 'never', 'too', 'quite'}
filtered_stopwords = [word for word in stop_words if word not in important_stopwords]

In [5]:
# add domain-specific words
analyzer.lexicon.update({
    "sharp": 2.0,
    "soft": -1.5,
    "Leica": 0.5,
    "rangefinder": 1.2,
    "bokeh": 1.0,
    "M10": 0.7,
    "Q2": 0.8,
})

In [6]:
# add Leica relevant keywords that we care about
relevant_keywords = ['leica', 'Leica', 'q2', 'Q2', 'q3', 'Q3', 'm10', 'M10', 'm11', 'M11', 'camera', 'lens', 'photography', 'image quality', 'sharpness']

# common off-topic phrases, words
off_topic_words = ['guidelines', 'rules', 'moderator', 'off-topic', 'follow the rules']

In [7]:
# text pre-processing funcs
def remove_urls(text):
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub(r'', text)

def remove_usernames(text):
    return re.sub(r'u/\S+', '', text)

def convert_emojis(text):
    return emoji.demojize(text, delimiters=("", ""))

def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)

def remove_flair(text):
    return re.sub(r'\[.*?\]', '', text)

def remove_votes(text):
    return re.sub(r'\d+\s*(upvotes|downvotes)', '', text)

def remove_unimportant_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in filtered_stopwords])

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# applying pre-processing funcs to pipeline
def preprocess_reddit_post(text):
    if isinstance(text, str):  # Ensure the text is a string
        text = remove_urls(text)
        text = remove_usernames(text)
        text = remove_flair(text)
        text = remove_votes(text)
        text = convert_emojis(text)
        text = remove_special_characters(text)
        text = remove_unimportant_stopwords(text)
        text = lemmatize_text(text)
        return text
    else:
        return ""  # Return empty string for non-string values

# perform sentiment analysis
def analyze_sentiment(text):
    processed_text = preprocess_reddit_post(text)
    return analyzer.polarity_scores(processed_text)['compound']

# check if post contains relevant keywords
def contains_relevant_keywords(text):
    if isinstance(text, str):
        text_lower = text.lower()
        for keyword in relevant_keywords:
            if keyword in text_lower:
                return True
    return False

# check if post contains irrelevant, off-topic words
def contains_off_topic_words(text):
    if isinstance(text, str):
        text_lower = text.lower()
        for word in off_topic_words:
            if word in text_lower:
                return True
    return False

# combining word filtering funcs
def is_relevant_post(text):
    return contains_relevant_keywords(text) and not contains_off_topic_words(text)

In [8]:
import re
import emoji
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
import torch

# Initialize VADER analyzer and other components
analyzer = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()
filtered_stopwords = set(stopwords.words('english')) | {'leica', 'camera', 'photography'}  # Custom stopwords

# Load an alternative sentiment model for comparisons (optional, requires transformers library)
transformer_analyzer = pipeline('sentiment-analysis')

# Additional setup for logging non-relevant posts
non_relevant_posts_log = []

# text pre-processing funcs
def remove_urls(text):
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub(r'', text)

def remove_usernames(text):
    return re.sub(r'u/\S+', '', text)

def convert_emojis(text):
    """Convert emojis to text form. Helps capture emotional intent in sentiment analysis."""
    return emoji.demojize(text, delimiters=("", ""))

def remove_special_characters(text):
    """Remove special characters, keeping punctuation useful for sentiment analysis."""
    return re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)

def remove_flair(text):
    """Remove flair indicators (e.g., [Flair]) commonly used in Reddit titles."""
    return re.sub(r'\[.*?\]', '', text)

def remove_votes(text):
    """Remove vote counts (e.g., 123 upvotes), as they are not relevant to sentiment."""
    return re.sub(r'\d+\s*(upvotes|downvotes)', '', text)

def remove_unimportant_stopwords(text):
    """Remove unimportant stopwords to reduce noise."""
    return ' '.join([word for word in text.split() if word.lower() not in filtered_stopwords])

def lemmatize_text(text):
    """Convert words to their base form (lemma) to standardize word use."""
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Adding keyword density counting
def keyword_density(text):
    """Analyze keyword frequency for relevant keywords. Adjust based on findings."""
    text_lower = text.lower() if isinstance(text, str) else ''
    keyword_counts = {keyword: text_lower.count(keyword) for keyword in relevant_keywords}
    return keyword_counts

# applying pre-processing funcs to pipeline
def preprocess_reddit_post(text):
    if isinstance(text, str):  # Ensure the text is a string
        text = remove_urls(text)
        text = remove_usernames(text)
        text = remove_flair(text)
        text = remove_votes(text)
        text = convert_emojis(text)
        text = remove_special_characters(text)
        text = remove_unimportant_stopwords(text)
        text = lemmatize_text(text)
        return text
    else:
        return ""  # Return empty string for non-string values

# perform sentiment analysis
def analyze_sentiment(text, use_vader=True):
    """Analyze sentiment using VADER or an alternative transformer model for comparison."""
    processed_text = preprocess_reddit_post(text)
    if use_vader:
        sentiment_score = analyzer.polarity_scores(processed_text)['compound']
        
        # Filter neutral comments if score is around zero
        neutral_words = {'fine', 'ok', 'average'}
        if any(word in processed_text for word in neutral_words) and -0.05 <= sentiment_score <= 0.05:
            sentiment_score = 0.0
    else:
        # Alternative sentiment model (e.g., BERT-based) for enhanced nuance
        transformer_result = transformer_analyzer(processed_text)
        sentiment_score = transformer_result[0]['score'] * (1 if transformer_result[0]['label'] == 'POSITIVE' else -1)
    return sentiment_score

# check if post contains relevant keywords
def contains_relevant_keywords(text):
    if isinstance(text, str):
        text_lower = text.lower()
        for keyword in relevant_keywords:
            if keyword in text_lower:
                return True
    return False

# check if post contains irrelevant, off-topic words
def contains_off_topic_words(text):
    if isinstance(text, str):
        text_lower = text.lower()
        for word in off_topic_words:
            if word in text_lower:
                return True
    return False

# combining word filtering funcs
def is_relevant_post(text):
    """Check if post is relevant. Log non-relevant posts for review."""
    if contains_relevant_keywords(text) and not contains_off_topic_words(text):
        return True
    else:
        non_relevant_posts_log.append(text)  # Log non-relevant posts
        return False


# Apply to the posts and comments DataFrames
# Example DataFrames 'posts_df' for posts and 'comments_df' for comments
# (Assuming 'post_body' in posts_df and 'comment_body' in comments_df)
posts_df['cleaned_post_body'] = posts_df['post_body'].apply(preprocess_reddit_post)
posts_df['sentiment'] = posts_df['post_body'].apply(analyze_sentiment)
posts_df['is_relevant'] = posts_df['post_body'].apply(is_relevant_post)
posts_df['keyword_density'] = posts_df['post_body'].apply(keyword_density)  # New keyword density column
df_relevant = posts_df[posts_df['is_relevant']] 

comments_df['cleaned_comment_body'] = comments_df['comment_body'].apply(preprocess_reddit_post)
comments_df['sentiment'] = comments_df['comment_body'].apply(analyze_sentiment)
comments_df['is_relevant'] = comments_df['comment_body'].apply(is_relevant_post)
comments_df['keyword_density'] = comments_df['comment_body'].apply(keyword_density)  # New keyword density column
comments_df_relevant = comments_df[comments_df['is_relevant']]  # Keep only relevant comments

# Display the cleaned DataFrames
print("Relevant Posts:")
print(df_relevant[['title', 'cleaned_post_body', 'sentiment', 'keyword_density']].head())

print("\nRelevant Comments:")
print(comments_df_relevant[['comment_body', 'cleaned_comment_body', 'sentiment', 'keyword_density']].head())

# Optional: Print logged non-relevant posts for error checking
print("\nLogged Non-Relevant Posts:")
print(non_relevant_posts_log[:5])  # Display first 5 for brevity

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Relevant Posts:
                                               title  \
0         Posting Guidelines for Imagery/Photography   
1  What I'd like you to know about flair-- Please...   
2  My entire collection of 14 leica lenses fit co...   
4              Text Peeled Off - 2 Week Old Leica MP   
6                           M10 monochrome with 28mm   

                                   cleaned_post_body  sentiment  \
0  new subreddit, may aware, please know rLeica s...     0.9852   
1  time spent answering question flair. Id like s...     0.8271   
2  use spent film cartridge prop smaller lens len...     0.7269   
4  recently got first Leica, brand new black pain...     0.9721   
6  recent trip Rome, decided bring M10 got 6 mont...     0.7674   

                                     keyword_density  
0  {'leica': 10, 'Leica': 0, 'q2': 0, 'Q2': 0, 'q...  
1  {'leica': 0, 'Leica': 0, 'q2': 0, 'Q2': 0, 'q3...  
2  {'leica': 0, 'Leica': 0, 'q2': 0, 'Q2': 0, 'q3...  
4  {'leica': 1, 'Leica':

In [None]:
import pandas as pd
import time
import concurrent.futures
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI()

# Function to call GPT-4 Turbo for sentiment analysis with retry and delay
def analyze_sentiment_gpt4_turbo_batch(texts, retries=3, delay=2):
    for attempt in range(retries):
        try:
            # Combine multiple texts into a single API call
            batch_prompt = "\n".join(
                [f"{i+1}. {text}" for i, text in enumerate(texts)]
            )
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a sentiment analysis assistant."},
                    {"role": "user", "content": f"Analyze the sentiment of the following texts on a scale from -1 to 1, where -1 is negative, 0 is neutral, and 1 is positive. Provide sentiment ratings for each text:\n{batch_prompt}"}
                ],
                max_tokens=50,  # Adjust based on the number of texts
                temperature=0
            )
            # Parse results back into individual responses
            responses = completion.choices[0].message.content.strip().split("\n")
            sentiments = []
            for response in responses:
                if "1" in response:  # Positive sentiment
                    sentiments.append(1)
                elif "-1" in response:  # Negative sentiment
                    sentiments.append(-1)
                else:  # Neutral sentiment
                    sentiments.append(0)
            return sentiments
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if "rate limit" in str(e).lower() and attempt < retries - 1:
                time.sleep(delay * (attempt + 1))  # Exponential backoff
            else:
                return [None] * len(texts)  # Return placeholder on failure

# Batch and process rows
def batch_process_sentiment(df, text_column, batch_size=10, max_workers=5, delay_between_batches=1.5):
    results = []
    text_batches = [df[text_column].iloc[i:i + batch_size].tolist() for i in range(0, len(df), batch_size)]
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_batch = {
            executor.submit(analyze_sentiment_gpt4_turbo_batch, batch): batch for batch in text_batches
        }
        
        for future in concurrent.futures.as_completed(future_to_batch):
            batch_results = future.result()
            results.extend(batch_results)
            
            # Optional delay between processing batches
            time.sleep(delay_between_batches)
    
    return results

# Example preprocessing function (add your logic)
def preprocess_reddit_post(text):
    # Basic cleaning
    return text.strip()

# Add preprocessed text column
posts_df['processed_text'] = posts_df['post_body'].apply(preprocess_reddit_post)
comments_df['processed_text'] = comments_df['comment_body'].apply(preprocess_reddit_post)

# Process sentiment in batches
posts_df['sentiment_scores'] = batch_process_sentiment(
    posts_df, 'processed_text', batch_size=10, max_workers=5, delay_between_batches=1.5
)
comments_df['sentiment_scores'] = batch_process_sentiment(
    comments_df, 'processed_text', batch_size=10, max_workers=5, delay_between_batches=1.5
)

# Output results
print("Sentiment Scores for Posts:")
print(posts_df[['title', 'processed_text', 'sentiment_scores']].head())

print("\nSentiment Scores for Comments:")
print(comments_df[['processed_text', 'sentiment_scores']].head())

Attempt 1 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-WcwchLn8aGr4Tindmk0IRCLv on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}
Attempt 1 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-WcwchLn8aGr4Tindmk0IRCLv on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}
Attempt 1 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-WcwchLn8aGr4Tindmk0IRCLv on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.c

In [None]:
# Split the dictionary in 'sentiment_comparison' into separate columns
sentiment_df = pd.json_normalize(comments_df['sentiment_comparison'])

# Concatenate the new columns back to the original DataFrame (optional)
comments_df = pd.concat([comments_df, sentiment_df], axis=1)

# Drop the original 'sentiment_comparison' column if no longer needed
comments_df = comments_df.drop(columns=['sentiment_comparison'])

print(comments_df)

In [13]:
# Split the dictionary in 'sentiment_comparison' into separate columns
sentiment_df = pd.json_normalize(comments_df['sentiment_comparison'])

# Concatenate the new columns back to the original DataFrame (optional)
comments_df = pd.concat([comments_df, sentiment_df], axis=1)

# # Drop the original 'sentiment_comparison' column if no longer needed
# comments_df = comments_df.drop(columns=['sentiment_comparison'])

KeyError: 'sentiment_comparison'

In [17]:
# Split the dictionary in 'sentiment_comparison' into separate columns
sentiment_df = pd.json_normalize(posts_df['sentiment_comparison'])

# Concatenate the new columns back to the original DataFrame (optional)
posts_df = pd.concat([posts_df, sentiment_df], axis=1)

# # Drop the original 'sentiment_comparison' column if no longer needed
# comments_df = comments_df.drop(columns=['sentiment_comparison'])