# Classification-based Summarization

## Import packages

In [1]:
import numpy as np
from argsum import load_test_df, get_smatchtopr_classification_sums, get_barh_classification_sums

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
loading configuration file config.json from cache at /Users/moritz/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.33.3",
  "voc

In [2]:
from time import time 
from tqdm.notebook import tqdm
from itertools import product
import os 
import json
import pandas as pd

###########################################################################################################################
### Get classification-based summaries ###
###########################################################################################################################

def get_classification_sums(df, get_classification_sums_callable, parameter_dict, output_dir = 'results/classification_sums', file_name = None):
    """Run a summarizer for every topic/stance pair and every parameter combination.

    Every entry of ``parameter_dict`` whose value is a ``list`` is treated as a
    sweep axis; the Cartesian product of all such lists is iterated. All other
    entries are forwarded unchanged on every call.

    Parameters
    ----------
    df
        DataFrame with (at least) the columns ``'topic'``, ``'stance'`` and
        ``'argument'``. Stance values must be convertible with ``int``.
    get_classification_sums_callable
        Called as ``callable(arguments, topic=..., stance=..., **parameters)``
        and expected to return a pair ``(sum_ids, sums)``. If ``sum_ids`` is
        ``None`` the corresponding result entry is left at ``None``.
    parameter_dict
        Keyword arguments for the callable; list values are swept.
    output_dir
        Directory the JSON file is written to (created if missing).
    file_name
        Name of the JSON file. If ``None`` nothing is written to disk.

    Returns
    -------
    dict
        ``{'summaries': {str(comb): {topic: {stance: {'sum_ids', 'sums',
        'runtime'}}}}, 'parameter_names': [...], 'parameter_values': [...]}``
        where ``stance`` keys are the integer stances rendered as strings and
        ``runtime`` is the wall-clock duration of the call in seconds, rounded
        to five decimals.
    """

    # Get unique topics and stances (stances as strings so they are valid JSON keys)
    topics = df['topic'].unique().tolist()
    stances = [str(int(stance)) for stance in sorted(df['stance'].unique())]

    # Parameters given as lists are the ones to iterate over
    iterate_parameter_names = [name for name, value in parameter_dict.items() if isinstance(value, list)]
    iterate_parameter_values = [parameter_dict[name] for name in iterate_parameter_names]
    iter_parameter_value_combinations = list(product(*iterate_parameter_values))

    # Pre-fill the result structure so that every combination/topic/stance has
    # an entry, even if the summarizer returns nothing for it
    results = {
        'summaries': {
            str(comb): {
                topic: {
                    stance: {'sum_ids': None, 'sums': None, 'runtime': None}
                    for stance in stances
                }
                for topic in topics
            }
            for comb in iter_parameter_value_combinations
        },
        'parameter_names': iterate_parameter_names,
        'parameter_values': iterate_parameter_values,
    }

    ################################
    ### Iterate: topic & stance ####
    ################################

    topic_stance_pairs = [(topic, stance) for topic in topics for stance in stances]

    for topic, stance in tqdm(topic_stance_pairs, leave = True, desc = 'topic + stance'):

        stance_int = int(stance)
        mask_topic_stance = (df['topic'] == topic) & (df['stance'] == stance_int)
        arguments = df.loc[mask_topic_stance, 'argument'].to_list()

        ############################
        ### Iterate: parameter #####
        ############################

        for comb in tqdm(iter_parameter_value_combinations, leave = False, desc = 'summarization parameter'):
            # Replace the swept (list-valued) entries by the current combination
            iterate_parameter_dict = {**parameter_dict, **dict(zip(iterate_parameter_names, comb))}

            ########################
            ### Get summaries ######
            ########################

            start_time = time()
            classification_sum_ids, classification_sums = get_classification_sums_callable(arguments, topic = topic, stance = stance_int, **iterate_parameter_dict)
            runtime = time() - start_time

            # Identity check: `!= None` would compare element-wise (and then
            # fail in the `if`) should the ids come back as a NumPy array
            if classification_sum_ids is not None:
                entry = results['summaries'][str(comb)][topic][stance]
                entry['sum_ids'] = [int(sum_id) for sum_id in classification_sum_ids]
                entry['sums'] = classification_sums
                entry['runtime'] = float(np.round(runtime, 5))

    ########################
    ### Save results #######
    ########################

    if file_name is not None:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(output_dir, exist_ok = True)
        with open(os.path.join(output_dir, file_name), 'w', encoding = 'utf-8') as file:
            json.dump(results, file)

    return results

## Load data

In [3]:
# Load both test sets; each is later passed as `df` to get_classification_sums.
ArgKP21, Debate_test = map(load_test_df, ('ArgKP21', 'Debate_test'))

## BarH

### Extractive

In [None]:
# BarH without LLM on ArgKP21. List-valued entries are swept as a grid by
# get_classification_sums; scalar entries are passed through unchanged.
barh_parameter_dict = {
    'quality_scorer_t': 0.7,
    'min_proportion_candidates': [0.1, 0.3],
    'match_scorer_t': list(np.arange(0.75, 0.96, 0.025)),
    'final_match_scorer_t': 0,
    'use_llm': False,
}

barh_results = get_classification_sums(
    df = ArgKP21,
    get_classification_sums_callable = get_barh_classification_sums,
    parameter_dict = barh_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'ArgKP21_BarH.json',
)

In [None]:
# BarH without LLM on Debate_test. List-valued entries are swept as a grid by
# get_classification_sums; scalar entries are passed through unchanged.
barh_parameter_dict = {
    'quality_scorer_t': 0.7,
    'min_proportion_candidates': [0.1, 0.3],
    'match_scorer_t': list(np.arange(0.75, 0.96, 0.025)),
    'final_match_scorer_t': 0,
    'use_llm': False,
}

barh_results = get_classification_sums(
    df = Debate_test,
    get_classification_sums_callable = get_barh_classification_sums,
    parameter_dict = barh_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'Debate_test_BarH.json',
)

### LLM

In [None]:
# BarH with use_llm='candidates' on ArgKP21; only match_scorer_t is swept.
barh_key_points_parameter_dict = {
    'quality_scorer_t': 0.7,
    'min_proportion_candidates': 0,
    'match_scorer_t': list(np.arange(0.75, 0.96, 0.025)),
    'final_match_scorer_t': 0,
    'use_llm': 'candidates',
    'sum_token_length': 8,
    'sum_min_num': 12,
    'sum_min_num_plus': 8,
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

barh_key_points_results = get_classification_sums(
    df = ArgKP21,
    get_classification_sums_callable = get_barh_classification_sums,
    parameter_dict = barh_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'ArgKP21_BarH_Candidates.json',
)

In [None]:
# BarH with use_llm='candidates' on Debate_test; only match_scorer_t is swept.
barh_key_points_parameter_dict = {
    'quality_scorer_t': 0.7,
    'min_proportion_candidates': 0,
    'match_scorer_t': list(np.arange(0.75, 0.96, 0.025)),
    'final_match_scorer_t': 0,
    'use_llm': 'candidates',
    'sum_token_length': 8,
    'sum_min_num': 12,
    'sum_min_num_plus': 8,
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

barh_key_points_results = get_classification_sums(
    df = Debate_test,
    get_classification_sums_callable = get_barh_classification_sums,
    parameter_dict = barh_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'Debate_test_BarH_Candidates.json',
)

In [None]:
# BarH with use_llm='key_points' on ArgKP21; sum_min_num and sum_min_num_plus
# are the two swept (list-valued) parameters.
barh_key_points_parameter_dict = {
    'quality_scorer_t': 0.7,
    'min_proportion_candidates': 0,
    'match_scorer_t': 0.8,
    'final_match_scorer_t': 0,
    'use_llm': 'key_points',
    'sum_token_length': 8,
    'sum_min_num': [3, 4],
    'sum_min_num_plus': [2, 3, 4, 5, 6],
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

barh_key_points_results = get_classification_sums(
    df = ArgKP21,
    get_classification_sums_callable = get_barh_classification_sums,
    parameter_dict = barh_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'ArgKP21_BarH_Key_Points.json',
)

In [None]:
# BarH with use_llm='key_points' on Debate_test; sum_min_num and
# sum_min_num_plus are the two swept (list-valued) parameters.
barh_key_points_parameter_dict = {
    'quality_scorer_t': 0.7,
    'min_proportion_candidates': 0,
    'match_scorer_t': 0.8,
    'final_match_scorer_t': 0,
    'use_llm': 'key_points',
    'sum_token_length': 8,
    'sum_min_num': [3, 4],
    'sum_min_num_plus': [2, 3, 4, 5, 6],
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

barh_key_points_results = get_classification_sums(
    df = Debate_test,
    get_classification_sums_callable = get_barh_classification_sums,
    parameter_dict = barh_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'Debate_test_BarH_Key_Points.json',
)

## SMatchToPr

### Extractive

In [None]:
# SMatchToPr without LLM on ArgKP21; min_proportion_candidates and
# scorer_cands_t are the swept (list-valued) parameters.
smatchtopr_parameter_dict = {
    'quality_scorer_t': 0.8,
    'min_proportion_candidates': [0.1, 0.3],
    'match_scorer_pr_t': 0.4,
    'damping_factor': 0.2,
    'final_match_scorer_t': 0,
    'scorer_cands': None,
    'scorer_cands_t': list(np.arange(0.75, 0.96, 0.025)),
    'use_llm': False,
}

smatchtopr_results = get_classification_sums(
    df = ArgKP21,
    get_classification_sums_callable = get_smatchtopr_classification_sums,
    parameter_dict = smatchtopr_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'ArgKP21_SMatchToPr.json',
)

In [None]:
# SMatchToPr without LLM on Debate_test; quality_scorer_t and scorer_cands_t
# are the swept (list-valued) parameters.
smatchtopr_parameter_dict = {
    'quality_scorer_t': [0.6, 0.8],
    'match_scorer_pr_t': 0.4,
    'damping_factor': 0.2,
    'final_match_scorer_t': 0,
    'scorer_cands': None,
    'scorer_cands_t': list(np.arange(0.75, 0.96, 0.025)),
    'use_llm': False,
}

smatchtopr_results = get_classification_sums(
    df = Debate_test,
    get_classification_sums_callable = get_smatchtopr_classification_sums,
    parameter_dict = smatchtopr_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'Debate_test_SMatchToPr.json',
)

### LLM

In [None]:
# SMatchToPr with use_llm='candidates' on ArgKP21; only scorer_cands_t is swept.
# NOTE(review): sum_min_num / sum_min_num_plus are 8 / 12 here but 12 / 8 in
# the BarH 'candidates' runs - confirm the difference is intended.
smatchtopr_key_points_parameter_dict = {
    'quality_scorer_t': 0.8,
    'match_scorer_pr_t': 0.4,
    'damping_factor': 0.2,
    'final_match_scorer_t': 0,
    'scorer_cands_t': list(np.arange(0.75, 0.96, 0.025)),
    'use_llm': 'candidates',
    'sum_token_length': 8,
    'sum_min_num': 8,
    'sum_min_num_plus': 12,
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

smatchtopr_key_points_results = get_classification_sums(
    df = ArgKP21,
    get_classification_sums_callable = get_smatchtopr_classification_sums,
    parameter_dict = smatchtopr_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'ArgKP21_SMatchToPr_Candidates.json',
)

In [None]:
# SMatchToPr with use_llm='candidates' on Debate_test; only scorer_cands_t is swept.
# NOTE(review): sum_min_num / sum_min_num_plus are 8 / 12 here but 12 / 8 in
# the BarH 'candidates' runs - confirm the difference is intended.
smatchtopr_key_points_parameter_dict = {
    'quality_scorer_t': 0.8,
    'match_scorer_pr_t': 0.4,
    'damping_factor': 0.2,
    'final_match_scorer_t': 0,
    'scorer_cands_t': list(np.arange(0.75, 0.96, 0.025)),
    'use_llm': 'candidates',
    'sum_token_length': 8,
    'sum_min_num': 8,
    'sum_min_num_plus': 12,
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

smatchtopr_key_points_results = get_classification_sums(
    df = Debate_test,
    get_classification_sums_callable = get_smatchtopr_classification_sums,
    parameter_dict = smatchtopr_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'Debate_test_SMatchToPr_Candidates.json',
)

In [None]:
# SMatchToPr with use_llm='key_points' on ArgKP21; sum_min_num and
# sum_min_num_plus are the two swept (list-valued) parameters.
smatchtopr_key_points_parameter_dict = {
    'quality_scorer_t': 0.8,
    'match_scorer_pr_t': 0.4,
    'damping_factor': 0.2,
    'final_match_scorer_t': 0,
    'use_llm': 'key_points',
    'sum_token_length': 8,
    'sum_min_num': [3, 4],
    'sum_min_num_plus': [2, 3, 4, 5, 6],
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

smatchtopr_key_points_results = get_classification_sums(
    df = ArgKP21,
    get_classification_sums_callable = get_smatchtopr_classification_sums,
    parameter_dict = smatchtopr_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'ArgKP21_SMatchToPr_Key_Points.json',
)

In [None]:
# SMatchToPr with use_llm='key_points' on Debate_test; sum_min_num and
# sum_min_num_plus are the two swept (list-valued) parameters.
smatchtopr_key_points_parameter_dict = {
    'quality_scorer_t': 0.8,
    'match_scorer_pr_t': 0.4,
    'damping_factor': 0.2,
    'final_match_scorer_t': 0,
    'use_llm': 'key_points',
    'sum_token_length': 8,
    'sum_min_num': [3, 4],
    'sum_min_num_plus': [2, 3, 4, 5, 6],
    'temperature': 0.5,
    'frequency_penalty': None,
    'few_shot': True,
}

smatchtopr_key_points_results = get_classification_sums(
    df = Debate_test,
    get_classification_sums_callable = get_smatchtopr_classification_sums,
    parameter_dict = smatchtopr_key_points_parameter_dict,
    output_dir = 'investigations/3_classification_summaries',
    file_name = 'Debate_test_SMatchToPr_Key_Points.json',
)