In [None]:
import pandas as pd
from transformers import pipeline

# Read the Reddit dataset that the rest of this cell summarizes
df = pd.read_csv(filepath_or_buffer="reddit_data.csv")

# Summarization pipeline backed by the "t5-small" checkpoint;
# summarize_text() below reads this module-level object.
summarizer = pipeline(task="summarization", model="t5-small")

def summarize_text(text, max_length=100, min_length=30):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize. Any value that is not a string (for example
            a missing cell read by pandas) or that is blank after stripping
            whitespace is treated as having no content.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result. Returns
        ``"No content to summarize."`` for missing/blank input and
        ``"Error in summarization: <exception>"`` if the pipeline raises,
        so one bad row does not abort a whole ``Series.apply`` run.
    """
    if not isinstance(text, str) or not text.strip():
        return "No content to summarize."
    try:
        result = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # Long posts/comment threads can exceed the model's input limit;
            # ask the tokenizer to cut them down instead of feeding the
            # model an over-length sequence.
            truncation=True,
        )
        return result[0]['summary_text']
    except Exception as e:
        # Deliberate best-effort: report the failure in place of a summary.
        return f"Error in summarization: {e}"

# Summarize the content and comments; the keyword arguments are forwarded
# by Series.apply to summarize_text for every cell.
df["summarized_content"] = df["content"].apply(summarize_text, max_length=100, min_length=30)
df["summarized_comments"] = df["comments"].apply(summarize_text, max_length=100, min_length=30)

# Keep only the columns that go into the exported file
output_columns = ["title", "summarized_content", "summarized_comments", "url", "subreddit"]
summarized_df = df[output_columns]

# Write the summaries out without the index column
summarized_df.to_csv("reddit_data_t5.csv", index=False)

print(summarized_df.head())

In [None]:
import json
from rouge_score import rouge_scorer
import pandas as pd
import openai

# Read the OpenAI API key from the JSON config file and hand it to the client
with open("/path/to/config.json") as f:
    config = json.loads(f.read())
openai.api_key = config["openai_api_key"]

# Re-read the raw dataset so this cell starts from the unsummarized data
df = pd.read_csv(filepath_or_buffer="reddit_data.csv")

# OpenAI summary
def summarize_text_with_openai(text, max_length=100):
    """Summarize ``text`` with the OpenAI chat model ``gpt-3.5-turbo``.

    Works with both generations of the ``openai`` package: the 1.x
    module-level client (``openai.chat.completions.create``) and the legacy
    0.x API (``openai.ChatCompletion.create``). The legacy entry point was
    removed in 1.0, so calling it unconditionally would turn every row into
    an error string on a current install.

    Args:
        text: Text to summarize. Any value that is not a string, or that is
            blank after stripping whitespace, is treated as having no content.
        max_length: Word limit stated in the prompt. It is an instruction to
            the model, not a limit enforced by this function.

    Returns:
        The model's reply with surrounding whitespace stripped. Returns
        ``"No content to summarize."`` for missing/blank input and
        ``"Error in summarization: <exception>"`` if the request fails.
    """
    if not isinstance(text, str) or not text.strip():
        return "No content to summarize."

    model = "gpt-3.5-turbo"
    messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes text."},
        {"role": "user", "content": f"Summarize this text in {max_length} words or less: {text}"}
    ]
    try:
        if hasattr(openai, "chat"):
            # openai>=1.0: module-level client, typed response objects.
            response = openai.chat.completions.create(model=model, messages=messages)
            content = response.choices[0].message.content
        else:
            # openai<1.0: legacy resource class, dict-style response.
            response = openai.ChatCompletion.create(model=model, messages=messages)
            content = response['choices'][0]['message']['content']
        return content.strip()
    except Exception as e:
        # Deliberate best-effort: report the failure in place of a summary.
        return f"Error in summarization: {e}"

# Summarize the content and comments; Series.apply forwards max_length
# to summarize_text_with_openai for every cell.
df["summarized_content"] = df["content"].apply(summarize_text_with_openai, max_length=100)
df["summarized_comments"] = df["comments"].apply(summarize_text_with_openai, max_length=100)

# Calculate ROUGE scores
def calculate_rouge(reference, generated):
    """Score ``generated`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Text passed to the scorer as the first (target) argument.
        generated: Text passed to the scorer as the second argument.

    Returns:
        A dict keyed by ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` holding
        each metric's F-measure. If either argument is not a string, all
        three values are ``0``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']

    both_are_text = isinstance(reference, str) and isinstance(generated, str)
    if not both_are_text:
        return dict.fromkeys(metric_names, 0)

    results = rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(reference, generated)
    return {metric: results[metric].fmeasure for metric in metric_names}

# Calculate ROUGE scores for content.
# The two columns are zipped instead of using `df.apply(..., axis=1)`: on an
# empty frame a row-wise apply hands back a DataFrame rather than a Series of
# score dicts, which breaks the per-metric extraction further down.
rouge_scores_content = pd.Series(
    [
        calculate_rouge(reference, generated)
        for reference, generated in zip(df["content"], df["summarized_content"])
    ],
    index=df.index,
    dtype=object,
)

# Calculate ROUGE scores for comments
rouge_scores_comments = pd.Series(
    [
        calculate_rouge(reference, generated)
        for reference, generated in zip(df["comments"], df["summarized_comments"])
    ],
    index=df.index,
    dtype=object,
)

# Add one column per (metric, source) pair, in the order
# rouge1_content, rouge2_content, rougeL_content, rouge1_comments, ...
for source_name, source_scores in (
    ("content", rouge_scores_content),
    ("comments", rouge_scores_comments),
):
    for metric in ("rouge1", "rouge2", "rougeL"):
        df[f"{metric}_{source_name}"] = [score[metric] for score in source_scores]

# Save the DataFrame with ROUGE scores to a new CSV file
df.to_csv("reddit_data_openai.csv", index=False)

# Print the DataFrame with ROUGE scores
print(df.head())

In [None]:
pip install openai