In [3]:
import pandas as pd
import time
import openai

# Set up OpenAI API key
openai.api_key = ""

# Load dataset
data_path = "../newdatasets/datasetgroup/unlabeled_data.csv"
data = pd.read_csv(data_path)

# Ensure columns are strings
data['cleaned_text'] = data['cleaned_text'].fillna("").astype(str)
data['cleaned_body'] = data['cleaned_body'].fillna("").astype(str)

# Shuffle the dataset without resetting the index
#shuffled_data = data.sample(frac=1, random_state=42)
#shuffled_data['original_index'] = shuffled_data.index  # Preserve the original index

# Select a sample of 3000 rows for labeling
sample_data = data.head(3000).copy()

# Add columns for GPT results
sample_data['label_agreement'] = None
sample_data['explanation'] = None


# Function to get GPT label for a single post and comment
def get_gpt_label(post, comment):
    try:
        # Prepare the prompt
        prompt = f"""
        You are an AI tasked with classifying the conceptual agreement between the following post and comment about climate change. 
        The possible labels are:
        - 1: Agree (the comment conceptually agrees with and supports the post's argument and opinion).
        - 0: Neutral/Unrelated (the comment neither agrees nor disagrees conceptually but shifts focus or introduces unrelated ideas).
        - -1: Disagree (the comment disagrees, rejects, or challenges the post's argument).
        
        Please strictly follow this format: "<label>: <explanation>"
        Example:
        Post: "Reducing CO2 emissions is critical to slowing global warming."
        Comment: "We should prioritize renewable energy like wind and solar."
        Output: "1: The comment agrees with the post's argument about reducing CO2 emissions by suggesting renewable energy solutions."


        Classify the following post-comment pair:
        Post: "{post}"
        Comment: "{comment}"

        Provide the label (1, 0, or -1) and a brief explanation for your classification, separated by a colon.
        
        """
        
        # Call OpenAI GPT API
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0
        )
        
        # Extract and parse the response
        result = response['choices'][0]['message']['content'].strip()
        
        # Split the response into label and explanation
        try:
            label, explanation = result.split(":", 1)
            label = int(label.strip().strip('"'))  # Handle potential formatting quirks
            explanation = explanation.strip().strip('"')  # Clean explanation
        except ValueError:
            # If the response isn't formatted as expected, log and return defaults
            print(f"Unexpected response format: {result}")
            label, explanation = None, f"Error parsing response: {result}"
        
        return label, explanation
    
    except Exception as e:
        print(f"Error processing post-comment pair: {e}")
        return None, f"Error encountered during processing: {e}"


# Iterate through each post and comment pair
for i, row in sample_data.iterrows():
    post = row['cleaned_text']
    comment = row['cleaned_body']
    
    # Get GPT label and explanation
    label, explanation = get_gpt_label(post, comment)
    sample_data.at[i, 'label_agreement'] = label
    sample_data.at[i, 'explanation'] = explanation
    
    # Print progress for debugging
    print(f"\nPost:\n{post[:300]}...")
    print(f"Comment:\n{comment[:300]}...")
    print(f"Label: {label}")
    print(f"Explanation: {explanation}\n")
    
    # Optional: Print progress periodically
    if i % 10 == 0:
        print(f"Processed {i + 1}/{len(sample_data)} rows...")

    # Avoid hitting API rate limits (adjust delay as needed)
    time.sleep(2)

# Save the labeled sample dataset
labeled_sample_path = "../newdatasets/datasetgroup/output/labeled_sample_data2.csv"
sample_data.to_csv(labeled_sample_path, index=False)
print(f"Labeled sample dataset saved to {labeled_sample_path}")

# Save the shuffled_data
#shuffled_data_path = "../newdatasets/datasetgroup/output/shuffled_data.csv"
#shuffled_data.to_csv(shuffled_data_path, index=False)  # Use the DataFrame, not a string
#print(f"Shuffled dataset saved to {shuffled_data_path}")

# Save the remaining unlabeled data
unlabeled_data = shuffled_data.iloc[3000:].copy()  # Ensure the remaining rows are selected correctly
unlabeled_path = "../newdatasets/datasetgroup/output/unlabeled_data2.csv"
unlabeled_data.to_csv(unlabeled_path, index=False)
print(f"Unlabeled data saved to {unlabeled_path}")



Post:
i've become aware of a fascinating conspiracy theory that lizard people run various governments--the united states of america, etc. always, the operating assumption is that this is bad specifically because they are lizard people. i find this close minded and offensive. we shouldn't be biased against...
Comment:
answer: we're not allowed to say the real group of people that run the banks, tech/pharma/media corporations, courts, and governments of the world so instead we just call em lizard people in order to get past the censorship....
Label: -1
Explanation: The comment disagrees with the post's argument. The post argues that the concept of "lizard people" running the government should be evaluated based on their governance, not their species. The comment, however, suggests that "lizard people" is a coded term used to bypass censorship, thus rejecting the post's premise of lizard people as a literal species.

Processed 1/3000 rows...

Post:
edit: i have been convinced that renewa

NameError: name 'shuffled_data' is not defined

In [5]:
# Save the remaining unlabeled data
unlabeled_data = data.iloc[3000:].copy()  # Ensure the remaining rows are selected correctly
unlabeled_path = "../newdatasets/datasetgroup/output/unlabeled_data2.csv"
unlabeled_data.to_csv(unlabeled_path, index=False)
print(f"Unlabeled data saved to {unlabeled_path}")

Unlabeled data saved to ../newdatasets/datasetgroup/output/unlabeled_data2.csv


In [13]:
# Load dataset
data_path = "../newdatasets/datasetgroup/unlabeled_data.csv"
df = pd.read_csv(data_path)

In [14]:
df.columns

Index(['level_0', 'index', 'post_id', 'link_id', 'subreddit', 'parent_id',
       'redditor_id', 'created_at', 'body', 'score', 'edited',
       'parent_id_clean', 'has_delta', 'title', 'text', 'permalink',
       'post_id_parent', 'redditor_id_parent', 'body_clean', 'knowledge_mean',
       'knowledge_max', 'knowledge_min', 'knowledge_std', 'similarity_mean',
       'similarity_max', 'similarity_min', 'similarity_std', 'trust_mean',
       'trust_max', 'trust_min', 'trust_std', 'dominant_dimension_low_std',
       'summarized_body_clean', 'cleaned_text', 'cleaned_body',
       'is_climate_related', 'num_sentences_cleaned_body',
       'num_sentences_cleaned_text', 'num_sentences_body',
       'num_sentences_text', 'word_count_text', 'token_count_text',
       'word_count_body', 'token_count_body', 'summarized_cleaned_bod',
       'summarized_cleaned_text', 'original_index'],
      dtype='object')

In [17]:
# Check for duplicates based on 'index'
duplicates_index = df[df.duplicated(subset=['index'], keep=False)]
print(f"Duplicates based on 'index':\n{duplicates_index}")

# Check for duplicates based on 'original_index'
duplicates_original_index = df[df.duplicated(subset=['original_index'], keep=False)]
print(f"Duplicates based on 'original_index':\n{duplicates_original_index}")


Duplicates based on 'index':
Empty DataFrame
Columns: [level_0, index, post_id, link_id, subreddit, parent_id, redditor_id, created_at, body, score, edited, parent_id_clean, has_delta, title, text, permalink, post_id_parent, redditor_id_parent, body_clean, knowledge_mean, knowledge_max, knowledge_min, knowledge_std, similarity_mean, similarity_max, similarity_min, similarity_std, trust_mean, trust_max, trust_min, trust_std, dominant_dimension_low_std, summarized_body_clean, cleaned_text, cleaned_body, is_climate_related, num_sentences_cleaned_body, num_sentences_cleaned_text, num_sentences_body, num_sentences_text, word_count_text, token_count_text, word_count_body, token_count_body, summarized_cleaned_bod, summarized_cleaned_text, original_index]
Index: []

[0 rows x 47 columns]
Duplicates based on 'original_index':
Empty DataFrame
Columns: [level_0, index, post_id, link_id, subreddit, parent_id, redditor_id, created_at, body, score, edited, parent_id_clean, has_delta, title, text, pe

In [19]:
# Check duplicates for 'post_id'
duplicates_post_id = df[df.duplicated(subset=['post_id'], keep=False)]
print(f"Duplicates based on 'post_id':\n{duplicates_post_id}")

# Check duplicates for a combination of 'post_id' and 'parent_id'
df['unique_id'] = df['post_id'].astype(str) + "_" + df['parent_id'].astype(str)
duplicates_combined = df[df.duplicated(subset=['unique_id'], keep=False)]
print(f"Duplicates based on combined 'unique_id':\n{duplicates_combined}")


Duplicates based on 'post_id':
       level_0  index  post_id  link_id     subreddit   parent_id  \
789      17856   3505  liuh0at  1evrkc0  changemyview  t1_liugst3   
1188      7238    239  k3c9bq6  16yqq5w  changemyview  t1_k3b3xqm   
1501     17961  35484  liuh0at  1evrkc0  changemyview  t1_liugst3   
1686      1418  39025  jb0umyw  11iw30k  changemyview  t3_11iw30k   
2541     17901   3551  liyk42o  1evrkc0  changemyview  t1_lix371n   
...        ...    ...      ...      ...           ...         ...   
37446    11241  33541  khfp65a  194cjhp  changemyview  t3_194cjhp   
37625    17958  35481  liugge7  1evrkc0  changemyview  t1_liufc0m   
39437    17761   3406  liyld3k  1evrkc0  changemyview  t1_lixnwfi   
41833    17955  35477  liuawcs  1evrkc0  changemyview  t1_lity6we   
42107     7158    158  k3b3xqm  16yqq5w  changemyview  t1_k39xcxh   

                          redditor_id                 created_at  \
789                        14ww9pmqfe  2024-08-19 10:54:52+00:00   
1188

In [5]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline



# Suppress warnings
#warnings.filterwarnings("ignore")
#os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Load dataset
path = "../newdatasets/datasetgroup/output/labeled_sample_data.csv"
data = pd.read_csv(path)


# Preprocess data to ensure valid text
data['cleaned_text'] = data['cleaned_text'].fillna("").astype(str)
data['cleaned_body'] = data['cleaned_body'].fillna("").astype(str)



# Initialize models
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased")
semantic_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Helper function: Extract numbers and keywords
def extract_numbers_and_keywords(text):
    numbers = re.findall(r'\b\d+(\.\d+)?\b', text)  # Extract numbers
    keywords = re.findall(r'\b(growth|percentage|climate|year|auto|market|policy|housing|change|price)\b', text, re.IGNORECASE)
    return set(numbers + keywords)

# Helper function: Calculate cosine similarity using TF-IDF
def calculate_cosine_similarity(post, comment):
    vectorizer = TfidfVectorizer(stop_words="english")
    vectors = vectorizer.fit_transform([post, comment])
    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

# Helper function: Calculate semantic similarity using Sentence-BERT
def calculate_semantic_similarity(post, comment):
    post_embedding = semantic_model.encode(post, convert_to_tensor=True)
    comment_embedding = semantic_model.encode(comment, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(post_embedding, comment_embedding).item()
    return similarity_score

# Main classification function
def classify_agreement(post, comment):
    try:
        # Step 1: Sentiment Analysis
        post_sentiment = sentiment_pipeline(post[:512])[0]["label"]
        comment_sentiment = sentiment_pipeline(comment[:512])[0]["label"]
        
        # Step 2: Semantic Similarity
        semantic_score = calculate_semantic_similarity(post, comment)
        
        # Step 3: Cosine Similarity (optional as a double-check)
        cosine_score = calculate_cosine_similarity(post, comment)
        
        # Step 4: Numerical and Keyword Overlap
        post_facts = extract_numbers_and_keywords(post)
        comment_facts = extract_numbers_and_keywords(comment)
        factual_overlap = len(post_facts & comment_facts) > 0
        
        # Step 5: Agreement Logic
        if semantic_score > 0.75 and post_sentiment == comment_sentiment and factual_overlap:
            return 1  # Agree
        elif semantic_score > 0.5 or cosine_score > 0.3 or factual_overlap:
            return 0  # Neutral or Unrelated
        else:
            return -1  # Disagree
    except Exception as e:
        print(f"Error processing post/comment: {e}")
        return 0  # Default to neutral in case of error

# Example Post and Comment
#post = """when you ask the average conservative why they don't seem to take climate change seriously, you often hear something along the lines of well, they've all told us that the world would end in 10 years, and it didn't, and since that didn't happen, my trust is now completely broken and i feel entitled not to have to listen to'experts'anymore. it's this bit about the experts have said the world will end in 10 years that i find highly suspicious, and it is my view that this information likely came from a non-expert, in a casual context, without citation of actual evidence and research."""
#comment = """i dont deny climate change. i just dont want to be priced out of the housing market, auto market, computers i want to enjoy my life."""

# Run the Classification
#label = classify_agreement(post, comment)



# Apply classification function to the dataset
data['agreement_label6'] = data.apply(
    lambda row: classify_agreement(row['cleaned_text'], row['cleaned_body']),
    axis=1
)

# Save the updated dataset
output_path = "../newdatasets/datasetgroup/output/supportgptresponse.csv"
data.to_csv(output_path, index=False)

print(f"Classification completed and saved to {output_path}")


# Display the Result
#print(f"Agreement Label: {label}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Classification completed and saved to ../newdatasets/datasetgroup/output/supportgptresponse.csv
