In [None]:
import os
import json
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Download the NLTK resources this notebook relies on: the averaged-perceptron
# model backs `pos_tag`, and 'punkt' is the tokenizer model behind
# `word_tokenize` (imported above).
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
# Small English spaCy pipeline; `lemmatize_text` below uses it to lemmatize hedge cues.
nlp = spacy.load("en_core_web_sm")

**HEDGEPEER AND BIOSCOPE DATASETS PREPROCESSING**

In [None]:
def process_dataset(data_path, output_path):
    """Flatten a nested hedge-annotation JSONL file into one record per hedge.

    Each input line is read as a review with a 'Review_id' and a list of
    'Sentences'; each sentence carries 'Sentence_id', 'Sentence' and a list of
    'Hedges'. A sentence without hedges yields a single record marked
    'NO HEDGE' (with the raw sentence repeated as the hedged sentence and a
    null span); otherwise one record is produced per hedge annotation.

    Args:
        data_path: Path of the JSON-lines file to read.
        output_path: Path of the JSON-lines file to write.
    """
    reviews = pd.read_json(path_or_buf=data_path, lines=True)

    def _records_for(review_id, sentence):
        # Build the flat records contributed by a single sentence.
        annotations = sentence['Hedges']
        if len(annotations) == 0:
            return [{
                'Review_id': review_id,
                'Sentence_id': sentence['Sentence_id'],
                'Raw Sentence': sentence['Sentence'],
                'Hedged Sentence': sentence['Sentence'],
                'Hedge': 'NO HEDGE',
                'Span': None,
            }]
        return [
            {
                'Review_id': review_id,
                'Sentence_id': sentence['Sentence_id'],
                'Raw Sentence': sentence['Sentence'],
                'Hedged Sentence': annotation['Hedged Sentence'],
                'Hedge': annotation['Hedge'],
                'Span': annotation['Span'],
            }
            for annotation in annotations
        ]

    flat_records = []
    for _, review in reviews.iterrows():
        review_id = review['Review_id']
        sentences = review['Sentences']
        for sentence in sentences:
            flat_records.extend(_records_for(review_id, sentence))

    flattened = pd.DataFrame(flat_records)
    flattened.to_json(output_path, orient='records', lines=True)

# The relative dataset paths below are resolved against the input directory.
# NOTE(review): os.chdir changes the working directory for every later cell,
# and '../input' is itself relative, so the outcome depends on the directory
# this cell is run from — confirm this is intended.
root = '../input'
os.chdir(root)

# (input JSONL path relative to `root`, absolute output path) pairs.
datasets = [
    ('hedgepeer/HedgePeer.jsonl', '/kaggle/working/HedgePeer_processed.json'),
    ('merged-bioscope/merged_bioscope.jsonl', '/kaggle/working/BioScope_processed.json')
]

# Flatten each dataset into its one-record-per-hedge output file.
for data_path, output_path in datasets:
    process_dataset(data_path, output_path)

**CUE ANALYSIS IN HEDGEPEER AND BIOSCOPE DATASETS**

In [None]:
def load_and_filter_data(file_path):
    """Load a processed JSON-lines file and keep only rows that carry a hedge.

    Rows whose 'Hedge' value is the placeholder 'NO HEDGE' are dropped.

    NOTE: a later cell in this notebook defines another function with this
    same name that filters on 'Span' instead; whichever definition was
    executed last is the one in effect.

    Args:
        file_path: Path of the JSON-lines file (one record per line).

    Returns:
        An independent DataFrame containing only the hedge rows.
    """
    df = pd.read_json(file_path, lines=True, orient="records")
    # Return a copy, not a slice of `df`: analyze_hedge_frequencies adds a
    # column to this frame, which on a slice raises SettingWithCopyWarning.
    return df[df["Hedge"] != "NO HEDGE"].copy()

def count_total_hedges(hedge_df):
    """Return the number of rows (hedge occurrences) in `hedge_df`."""
    n_rows = len(hedge_df.index)
    return n_rows

def count_unique_hedges(hedge_df):
    """Return how many distinct values the 'Hedge' column of `hedge_df` holds."""
    cue_column = hedge_df["Hedge"]
    return cue_column.nunique()

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by single spaces.

    The text is processed with the module-level spaCy pipeline `nlp`.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

def analyze_hedge_frequencies(hedge_df):
    """Count how often each lemmatized hedge cue occurs.

    Side effect: adds (or overwrites) a 'Hedge Lemmatized' column on
    `hedge_df` holding the lemmatized form of each 'Hedge' value.

    Args:
        hedge_df: DataFrame with a 'Hedge' column of cue strings.

    Returns:
        A Counter mapping lemmatized cue -> number of rows.
    """
    cues = hedge_df["Hedge"]
    # Cues repeat heavily, so run the (comparatively slow) spaCy pipeline once
    # per distinct cue and map the result back onto every row.
    lemma_by_cue = {cue: lemmatize_text(cue) for cue in cues.unique()}
    hedge_df["Hedge Lemmatized"] = cues.map(lemma_by_cue)
    return Counter(hedge_df["Hedge Lemmatized"])

def analyze_pos_distribution(hedge_df):
    """Count the POS tags assigned to the hedge cues in `hedge_df`.

    Every row contributes exactly one tag. Each cue is tagged as its own
    one-token sentence: the perceptron tagger uses neighbouring tokens as
    context, so passing the whole column as a single pseudo-sentence would let
    unrelated cues from adjacent rows influence each other's tags and make the
    result depend on row order.

    Args:
        hedge_df: DataFrame with a 'Hedge' column of cue strings.

    Returns:
        A Counter mapping POS tag -> number of rows whose cue received it.
    """
    # Local import keeps this block self-contained; nltk is already a
    # dependency of the notebook.
    from nltk import pos_tag_sents

    cue_counts = Counter(hedge_df["Hedge"].tolist())
    distinct_cues = list(cue_counts)

    # Tag each distinct cue once (the tagger is loaded a single time for the
    # whole batch), then weight the tag by how many rows used that cue.
    tagged = pos_tag_sents([[cue] for cue in distinct_cues])

    tag_counts = Counter()
    for cue, tagged_sentence in zip(distinct_cues, tagged):
        _, tag = tagged_sentence[0]
        tag_counts[tag] += cue_counts[cue]
    return tag_counts

def calculate_overlap_and_percentage(hedge_counts1, hedge_counts2):
    """Find the hedge cues shared by two datasets and their relative frequencies.

    Args:
        hedge_counts1: Mapping (e.g. Counter) of cue -> count for dataset 1.
        hedge_counts2: Mapping (e.g. Counter) of cue -> count for dataset 2.

    Returns:
        A 4-tuple ``(common_words, top_10_common_words, percentage_1, percentage_2)``:
          * common_words: set of cues present in both mappings;
          * top_10_common_words: up to ten shared cues, ordered by combined
            count (descending), ties broken alphabetically;
          * percentage_1 / percentage_2: for each top cue, its count as a
            percentage of the total count in the respective dataset.
    """
    all_words_1 = dict(hedge_counts1)
    all_words_2 = dict(hedge_counts2)

    common_words = set(all_words_1) & set(all_words_2)

    # Sorting a set relies on its iteration order for ties, and that order
    # changes between interpreter runs for strings. The explicit secondary key
    # makes the top-10 selection reproducible.
    ranked_common_words = sorted(
        common_words,
        key=lambda word: (-(all_words_1[word] + all_words_2[word]), word),
    )
    top_10_common_words = ranked_common_words[:10]

    # Totals are loop-invariant; compute them once.
    total_1 = sum(all_words_1.values())
    total_2 = sum(all_words_2.values())

    percentage_1 = {word: (all_words_1[word] / total_1) * 100 for word in top_10_common_words}
    percentage_2 = {word: (all_words_2[word] / total_2) * 100 for word in top_10_common_words}

    return common_words, top_10_common_words, percentage_1, percentage_2

def _plot_top_counts(counts, xlabel, title, top_n=10):
    """Draw a labelled bar chart of the `top_n` most common entries of a Counter.

    Prints a message and draws nothing when `counts` is empty (unpacking
    ``zip(*[])`` would otherwise raise ValueError).
    """
    top_items = counts.most_common(top_n)
    if not top_items:
        print(f"Nothing to plot: {title}")
        return

    labels, values = zip(*top_items)

    fig, ax = plt.subplots()
    ax.bar(labels, values)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")
    ax.set_title(title)
    ax.tick_params(axis="x", labelrotation=45)

    # Write the raw count just above each bar.
    for position, value in enumerate(values):
        ax.text(position, value + 0.5, str(value), ha='center', va='bottom')

    plt.show()


def plot_top_hedges(hedge_counts, dataset_name):
    """Plot the ten most frequent hedge cues of one dataset.

    Args:
        hedge_counts: Counter mapping hedge cue -> frequency.
        dataset_name: Name shown in the chart title.
    """
    _plot_top_counts(hedge_counts, "Hedge", f"Top 10 Hedge Cues in ({dataset_name})")


def plot_top_pos(pos_counts, dataset_name):
    """Plot the ten most frequent POS tags among one dataset's hedge cues.

    Args:
        pos_counts: Counter mapping POS tag -> frequency.
        dataset_name: Name shown in the chart title.
    """
    _plot_top_counts(pos_counts, "POS Tag", f"Top 10 POS Tags in Hedge Words ({dataset_name})")

def plot_pos_comparison_bar_chart(pos_counts1, pos_counts2, dataset1_name, dataset2_name):
    """Plot side-by-side percentage bars for POS tags found in both datasets.

    For every tag present in both counters, its share of that dataset's total
    tag count is computed; the ten tags with the largest summed share are
    drawn as grouped bars with their percentages written above them.

    Args:
        pos_counts1: Counter of POS tag -> count for the first dataset.
        pos_counts2: Counter of POS tag -> count for the second dataset.
        dataset1_name: Legend/title name of the first dataset.
        dataset2_name: Legend/title name of the second dataset.
    """
    shared_tags = set(pos_counts1.keys()) & set(pos_counts2.keys())

    tag_total_1 = sum(pos_counts1.values())
    tag_total_2 = sum(pos_counts2.values())

    share_1 = {}
    share_2 = {}
    for tag in shared_tags:
        share_1[tag] = pos_counts1[tag] / tag_total_1 * 100
        share_2[tag] = pos_counts2[tag] / tag_total_2 * 100

    def combined_share(tag):
        return share_1[tag] + share_2[tag]

    top_tags = sorted(shared_tags, key=combined_share, reverse=True)[:10]

    heights_1 = [share_1[tag] for tag in top_tags]
    heights_2 = [share_2[tag] for tag in top_tags]

    plt.figure(figsize=(12, 6))
    bar_width = 0.4
    left_positions = list(range(len(top_tags)))
    right_positions = [pos + bar_width for pos in left_positions]
    tick_positions = [pos + bar_width / 2 for pos in left_positions]

    plt.bar(left_positions, heights_1, bar_width, label=dataset1_name, color='blue', alpha=0.7)
    plt.bar(right_positions, heights_2, bar_width, label=dataset2_name, color='orange', alpha=0.7)

    plt.xticks(tick_positions, top_tags, rotation=45, ha="right")
    plt.xlabel("POS Tags")
    plt.ylabel("Percentage (%)")
    plt.title(f"TOP 10 POS Tag Percentage Comparison Between {dataset1_name} and {dataset2_name} (Coinciding POS Tags)")
    plt.legend()
    plt.grid(axis='y', linestyle="--", alpha=0.5)

    # Annotate both bars of every tag with their percentage.
    for left, right, h1, h2 in zip(left_positions, right_positions, heights_1, heights_2):
        plt.text(left, h1 + 0.5, f"{h1:.2f}%", ha='center', color='black')
        plt.text(right, h2 + 0.5, f"{h2:.2f}%", ha='center', color='black')

    plt.show()

def plot_common_words_bar_chart(common_words, percentage_1, percentage_2, dataset1_name, dataset2_name):
    """Plot grouped bars comparing the relative frequency of shared hedge cues.

    Args:
        common_words: Iterable of cues to plot, in display order.
        percentage_1: Mapping cue -> percentage in the first dataset.
        percentage_2: Mapping cue -> percentage in the second dataset.
        dataset1_name: Legend name of the first dataset.
        dataset2_name: Legend name of the second dataset.
    """
    cue_labels = list(common_words)
    heights_1 = [percentage_1[cue] for cue in cue_labels]
    heights_2 = [percentage_2[cue] for cue in cue_labels]

    centers = np.arange(len(cue_labels))
    bar_width = 0.4
    half_width = bar_width / 2

    plt.figure(figsize=(10, 6))
    left_bars = plt.bar(centers - half_width, heights_1, bar_width, label=dataset1_name, color='blue', alpha=0.7)
    right_bars = plt.bar(centers + half_width, heights_2, bar_width, label=dataset2_name, color='orange', alpha=0.7)

    # Leave 10% headroom above the tallest bar for the value labels.
    tallest = max([*heights_1, *heights_2], default=0)
    plt.ylim(0, tallest * 1.1)

    label_offset = 0.02 * tallest
    for patch in (*left_bars, *right_bars):
        bar_height = patch.get_height()
        bar_center = patch.get_x() + patch.get_width() / 2
        plt.text(bar_center, bar_height + label_offset,
                 f'{bar_height:.2f}%', ha='center', color='black')

    plt.xticks(centers, cue_labels, rotation=45, ha="right")
    plt.xlabel("Common Hedge Words")
    plt.ylabel("Relative Frequency (%)")
    plt.title("Relative Frequency Comparison of TOP 10 Common Hedge Words")
    plt.legend()
    plt.grid(axis='y', linestyle="--", alpha=0.5)

    plt.tight_layout()
    plt.show()

def compare_datasets(file_path1, file_path2, name1="Dataset 1", name2="Dataset 2"):
    """Run the full hedge-cue comparison between two processed datasets.

    Loads both files, prints total/unique hedge counts and the number of
    lemmatized cues shared by both datasets, then draws: the top cues per
    dataset, the relative frequency of the top shared cues, the top POS tags
    per dataset, and a POS-tag percentage comparison.

    Args:
        file_path1: Path of the first processed JSON-lines file.
        file_path2: Path of the second processed JSON-lines file.
        name1: Display name of the first dataset.
        name2: Display name of the second dataset.
    """
    hedge_df1 = load_and_filter_data(file_path1)
    hedge_df2 = load_and_filter_data(file_path2)

    hedge_counts1 = analyze_hedge_frequencies(hedge_df1)
    hedge_counts2 = analyze_hedge_frequencies(hedge_df2)

    total_hedges1 = count_total_hedges(hedge_df1)
    total_hedges2 = count_total_hedges(hedge_df2)
    unique_hedges1 = count_unique_hedges(hedge_df1)
    unique_hedges2 = count_unique_hedges(hedge_df2)

    pos_counts1 = analyze_pos_distribution(hedge_df1)
    pos_counts2 = analyze_pos_distribution(hedge_df2)

    print(f"Total hedges in {name1}: {total_hedges1}")
    print(f"Total hedges in {name2}: {total_hedges2}")
    print(f"Unique hedges in {name1}: {unique_hedges1}")
    print(f"Unique hedges in {name2}: {unique_hedges2}")

    common_words, top_10_common_words, percentage_1, percentage_2 = calculate_overlap_and_percentage(hedge_counts1, hedge_counts2)
    print(f"\n Number of Common Hedge Words in Both Datasets: {len(common_words)}")

    plot_top_hedges(hedge_counts1, name1)
    plot_top_hedges(hedge_counts2, name2)

    plot_common_words_bar_chart(top_10_common_words, percentage_1, percentage_2, name1, name2)

    plot_top_pos(pos_counts1, name1)
    plot_top_pos(pos_counts2, name2)

    # The comparison chart derives its own percentages from the raw counts, so
    # nothing needs to be precomputed here.
    plot_pos_comparison_bar_chart(pos_counts1, pos_counts2, name1, name2)

In [None]:
# Processed files written by the preprocessing cell above.
file1 = "/kaggle/working/HedgePeer_processed.json"
file2 = "/kaggle/working/BioScope_processed.json"

# Print the hedge statistics and draw all cue / POS comparison charts.
compare_datasets(file1, file2, "HedgePeer", "BioScope")

**SPAN ANALYSIS IN HEDGEPEER AND BIOSCOPE DATASETS**

In [None]:
def remove_html_tags(text):
    """Return `text` with every ``<...>`` tag-like substring deleted."""
    tag_pattern = re.compile(r'<[^>]*>')
    return tag_pattern.sub('', text)

def trim_string(s):
    """Strip surrounding whitespace and any leading/trailing non-word characters.

    Characters inside the string are left untouched; only runs of characters
    that are not matched by ``\\w`` at either end are removed.
    """
    stripped = s.strip()
    without_leading = re.sub(r'^[^\w]+', '', stripped)
    return re.sub(r'[^\w]+$', '', without_leading)

def load_and_filter_data(file_path):
    """Load a processed JSON-lines file and keep only rows with a usable span.

    Adds a 'Clean Span' column (the 'Span' text with tag-like markup removed,
    or '' when 'Span' is not a string) and keeps rows whose span is present,
    non-empty and not blank after cleaning.

    NOTE: this redefines the `load_and_filter_data` from the cue-analysis
    section, which filters on 'Hedge' instead. After this cell has run, any
    call to that name (including from `compare_datasets`) uses this version.

    Args:
        file_path: Path of the JSON-lines file (one record per line).

    Returns:
        An independent DataFrame containing only the rows with a span.
    """
    df = pd.read_json(file_path, lines=True, orient="records")
    df["Clean Span"] = df["Span"].apply(lambda span: remove_html_tags(span) if isinstance(span, str) else "")

    has_span = df["Span"].notna() & (df["Span"] != "")
    has_clean_text = df["Clean Span"].apply(lambda span: bool(span.strip()))

    # Copy so that analyze_span_length_and_position can add columns to the
    # result without triggering SettingWithCopyWarning on a slice of `df`.
    df_filtered = df[has_span & has_clean_text].copy()

    return df_filtered

def analyze_span_length_and_position(df):
    """Annotate each row with its span's length, position and whole-sentence flag.

    Adds three columns to `df` in place and returns it:
      * 'Span Length': number of whitespace-separated tokens in the trimmed span;
      * 'Span Position': 'Beginning' if the trimmed sentence starts with the
        trimmed span, else 'End' if it ends with it, else 'Middle'
        ('None' when the span is missing);
      * 'Whole Sentence': True when span and sentence are equal after trimming
        and tag removal.

    Also prints the position counts and the number of whole-sentence spans.

    Args:
        df: DataFrame with 'Clean Span' and 'Raw Sentence' columns.

    Returns:
        The same DataFrame, with the three columns filled in.
    """
    # Column defaults; every row is overwritten in the loop below.
    df["Span Length"] = 0
    df["Span Position"] = "Unknown"
    df["Whole Sentence"] = False

    for row_label, record in df.iterrows():
        clean_span = record["Clean Span"]
        raw_sentence = record["Raw Sentence"]

        if pd.isna(clean_span) or clean_span is None:
            token_count, position, covers_sentence = 0, "None", False
        else:
            sentence_core = trim_string(raw_sentence)
            span_core = trim_string(clean_span)

            token_count = len(span_core.split())

            if sentence_core.startswith(span_core):
                position = "Beginning"
            elif sentence_core.endswith(span_core):
                position = "End"
            else:
                position = "Middle"

            covers_sentence = remove_html_tags(span_core) == remove_html_tags(sentence_core)

        df.at[row_label, "Span Length"] = token_count
        df.at[row_label, "Span Position"] = position
        df.at[row_label, "Whole Sentence"] = covers_sentence

    per_position = df["Span Position"].value_counts()
    whole_sentence_total = df["Whole Sentence"].sum()
    print("\nSpan Position Counts:")
    print(per_position)
    print("\nWhole Sentence Counts:")
    print(whole_sentence_total)

    return df

def plot_top_hedge_words(df1, df2, dataset_labels):
    """For each dataset, plot its ten most frequent hedge cues with mean span length.

    One figure is drawn per dataset: bars show cue frequency (left axis) and an
    orange line shows the average 'Span Length' of that cue (right axis).

    Args:
        df1: First dataset; needs 'Hedge' and 'Span Length' columns.
        df2: Second dataset; same columns.
        dataset_labels: Names for the two datasets, used in the titles.
    """
    for frame, dataset_label in zip((df1, df2), dataset_labels):

        per_cue = frame.groupby('Hedge').agg(
            Frequency=('Hedge', 'size'),
            Avg_Span_Length=('Span Length', 'mean'),
        )
        most_frequent = per_cue.sort_values(by='Frequency', ascending=False).head(10)

        cue_names = most_frequent.index
        cue_frequency = most_frequent['Frequency']
        cue_mean_length = most_frequent['Avg_Span_Length']

        fig, freq_axis = plt.subplots(figsize=(10, 6))

        # Left axis: how often each cue occurs.
        freq_axis.bar(cue_names, cue_frequency)
        freq_axis.set_xlabel('Hedge Word')
        freq_axis.set_ylabel('Frequency', color='black')
        freq_axis.tick_params(axis='y', labelcolor='black')

        # Right axis: average span length for the same cues.
        length_axis = freq_axis.twinx()
        length_axis.plot(cue_names, cue_mean_length, color='orange', marker='o', linestyle='-', linewidth=2)
        length_axis.set_ylabel('Average Span Length', color='orange')
        length_axis.tick_params(axis='y', labelcolor='orange')

        plt.title(f'Top 10 Most Frequent Hedge Words and Average Span Length in {dataset_label}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

def plot_top_span_lengths(df1, df2, dataset_labels):
    """For each dataset, plot the ten most frequent span lengths.

    A dataset with no rows is skipped with a message instead of raising
    ValueError when unpacking an empty ``zip``.

    Args:
        df1: First dataset; needs a 'Span Length' column.
        df2: Second dataset; same column.
        dataset_labels: Names for the two datasets, used in the titles.
    """
    for df, label in zip([df1, df2], dataset_labels):
        top_span_lengths = Counter(df['Span Length']).most_common(10)
        if not top_span_lengths:
            print(f"No spans to plot for {label}.")
            continue

        lengths, frequencies = zip(*top_span_lengths)

        plt.figure(figsize=(10, 6))
        plt.bar(lengths, frequencies, color='blue', alpha=0.7)

        plt.xlabel('Span Length')
        plt.ylabel('Frequency')
        plt.title(f'Top 10 Most Frequent Span Lengths in {label}')
        plt.xticks(lengths)
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        plt.show()

def plot_top_span_lengths_comparison(df1, df2, dataset_labels):
    """Compare the share of the most frequent span lengths across two datasets.

    The plotted lengths are the union of each dataset's ten most frequent span
    lengths. For every plotted length, each bar shows the percentage of ALL
    spans in that dataset having that length. A length that occurs in a
    dataset is therefore never drawn as 0 merely because it falls outside
    that dataset's own top ten.

    Args:
        df1: First dataset; needs a 'Span Length' column.
        df2: Second dataset; same column.
        dataset_labels: Two names, used for the legend and title.
    """
    span_length_counts_1 = Counter(df1['Span Length'])
    span_length_counts_2 = Counter(df2['Span Length'])

    top_lengths_1 = {length for length, _ in span_length_counts_1.most_common(10)}
    top_lengths_2 = {length for length, _ in span_length_counts_2.most_common(10)}
    common_lengths = sorted(top_lengths_1 | top_lengths_2)

    if not common_lengths:
        print("No spans to plot in either dataset.")
        return

    # Normalise by the total number of spans per dataset, not by the plotted subset.
    total_spans_1 = sum(span_length_counts_1.values())
    total_spans_2 = sum(span_length_counts_2.values())

    # Counter lookups return 0 for lengths a dataset never contains; the
    # `if total` guard covers one dataset being empty.
    perc_1 = [span_length_counts_1[l] / total_spans_1 * 100 if total_spans_1 else 0.0
              for l in common_lengths]
    perc_2 = [span_length_counts_2[l] / total_spans_2 * 100 if total_spans_2 else 0.0
              for l in common_lengths]

    x = np.arange(len(common_lengths))
    width = 0.4

    plt.figure(figsize=(10, 6))
    bars1 = plt.bar(x - width/2, perc_1, width, label=dataset_labels[0], color='blue', alpha=0.7)
    bars2 = plt.bar(x + width/2, perc_2, width, label=dataset_labels[1], color='orange', alpha=0.7)

    # Leave 10% headroom above the tallest bar for the value labels.
    max_height = max(max(perc_1, default=0), max(perc_2, default=0))
    plt.ylim(0, max_height * 1.1)

    for bar in (*bars1, *bars2):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height + 0.02 * max_height,
                 f'{height:.2f}%', ha='center', color='black')

    plt.xticks(x, common_lengths, rotation=45, ha="right")
    plt.xlabel("Span Length")
    plt.ylabel("Frequency (%)")
    plt.title(f"Top 10 Most Frequent Span Lengths Comparison: {dataset_labels[0]} vs. {dataset_labels[1]}")
    plt.legend()
    plt.grid(axis='y', linestyle="--", alpha=0.5)

    plt.tight_layout()
    plt.show()


In [None]:
# Reload both processed files with the span-based filter defined in the cell
# above (rows without a usable span are dropped).
df_analyzed1 = load_and_filter_data(file1)
df_analyzed2 = load_and_filter_data(file2)

# Add 'Span Length', 'Span Position' and 'Whole Sentence' columns; this also
# prints the position and whole-sentence counts for each dataset.
df_analyzed1 = analyze_span_length_and_position(df_analyzed1)
df_analyzed2 = analyze_span_length_and_position(df_analyzed2)


dataset_labels = ['HedgePeer', 'BioScope']

# Draw the per-dataset cue charts and the span-length comparison.
# (`plot_top_span_lengths` is defined above but not called here.)
plot_top_hedge_words(df_analyzed1, df_analyzed2, dataset_labels)
plot_top_span_lengths_comparison(df_analyzed1, df_analyzed2, dataset_labels)