In [3]:
import pandas as pd
import json

# Load the JSON files
# Four inputs, resolved relative to the current working directory.
cta_human_file_path = 'human_annotated_cta.json'
de_human_file_path = 'human_annotated_de.json'
cta_weak_file_path = 'weak_annotated_cta.json'
de_weak_file_path = 'weak_annotated_de.json'


def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is decoded as UTF-8 explicitly: without an ``encoding``
    argument ``open`` falls back to the platform locale, which can fail on
    (or silently mangle) non-ASCII text on systems whose default is not
    UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


cta_human_data = _load_json(cta_human_file_path)
de_human_data = _load_json(de_human_file_path)
cta_weak_data = _load_json(cta_weak_file_path)
de_weak_data = _load_json(de_weak_file_path)

# Helper function to process data
def process_data(data, category_subject_key, category_action_key, category_name):
    """Flatten annotated entries into uniform row dictionaries.

    Args:
        data: Iterable of entry dicts. Each entry is read for "tid", "text"
            and for two lists of span dicts stored under the keys given by
            ``category_subject_key`` and ``category_action_key``.
        category_subject_key: Entry key holding the subject spans.
        category_action_key: Entry key holding the action/phrase spans.
        category_name: Label copied into every produced row.

    Returns:
        A list with one dict per entry, with the keys "TID", "Tweet",
        "Subjects", "Action/Phrases" and "Category". "TID" is None and
        "Tweet" is "" when the entry lacks them; the two span lists are
        empty when the corresponding key is missing or null.

    Raises:
        KeyError: If a span dict has no "span_text" key.
    """
    def span_texts(entry, key):
        # `or []` covers both a missing key and an explicit null value,
        # which `entry.get(key, [])` would hand back as None.
        return [span["span_text"] for span in entry.get(key) or []]

    return [
        {
            "TID": entry.get("tid", None),
            "Tweet": entry.get("text", ""),
            "Subjects": span_texts(entry, category_subject_key),
            "Action/Phrases": span_texts(entry, category_action_key),
            "Category": category_name,
        }
        for entry in data
    ]
# Process each dataset
cta_human_processed = process_data(cta_human_data, "called_subjects", "called_actions", "CTA")
de_human_processed = process_data(de_human_data, "discredited_subjects", "discrediting_phrases", "DE")
cta_weak_processed = process_data(cta_weak_data, "called_subjects", "called_actions", "CTA")
de_weak_processed = process_data(de_weak_data, "discredited_subjects", "discrediting_phrases", "DE")

# Combine all data
combined_data = cta_human_processed + de_human_processed + cta_weak_processed + de_weak_processed

# Convert to DataFrame
combined_df = pd.DataFrame(combined_data)


def _unique_in_order(values):
    """Return the distinct items of *values* as a list, in first-seen order.

    ``dict.fromkeys`` is used instead of ``set`` because set iteration order
    for strings changes between interpreter runs, which made the merged
    lists (and therefore the saved CSV) differ from run to run.
    """
    return list(dict.fromkeys(values))


def _merge_span_lists(span_lists):
    """Flatten the per-row lists of one group and drop duplicate items."""
    # A single pass replaces `sum(x, [])`, which re-copies the accumulated
    # list on every addition.
    return _unique_in_order(item for spans in span_lists for item in spans)


# Handle TID conflicts by merging rows with the same TID
# NOTE(review): groupby excludes rows whose key is null by default, so
# entries without a "tid" presumably never reach the merged frame — confirm
# that this is intended.
combined_df = combined_df.groupby("TID").agg({
    "Tweet": "first",
    "Subjects": _merge_span_lists,
    "Action/Phrases": _merge_span_lists,
    "Category": _unique_in_order,
}).reset_index()

# Save to CSV
import os

# Both CSVs are written into the folder that contains the notebook.
notebook_path = '/Users/manimtirkey/PycharmProjects/pythonProject5/dataset_extraction_NLP_FTP.ipynb'
notebook_dir = os.path.dirname(notebook_path)

# Destination of the merged (one row per TID) table
output_combined_df_path = os.path.join(notebook_dir, "combined_df.csv")
combined_df.to_csv(output_combined_df_path, index=False)

# Destination of the unmerged rows; the list of dicts needs to become a
# DataFrame before it can be written out
output_combined_data_path = os.path.join(notebook_dir, "combined_data.csv")
unmerged_df = pd.DataFrame(combined_data)
unmerged_df.to_csv(output_combined_data_path, index=False)

# Echo the two destinations as the cell result
output_combined_df_path, output_combined_data_path


('/Users/manimtirkey/PycharmProjects/pythonProject5/combined_df.csv',
 '/Users/manimtirkey/PycharmProjects/pythonProject5/combined_data.csv')