In [31]:
#Imports
import os
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [32]:
#Global variables
data_path = '/Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/'

In [33]:

def extract_features_sentiment_sarcasm(dir_path, num_docs=None):
    # Get a list of file paths for all files ending with '.csv' in the given directory path
    topics_files_path = [os.path.join(dir_path, filename) for filename in os.listdir(dir_path) if filename.endswith('.csv')]

    # If num_docs argument is provided, limit the list to the first 'num_docs' files
    if num_docs is not None:
        topics_files_path = topics_files_path[:num_docs]

    # Initialize sentiment analyzer
    sentiment_analyzer = SentimentIntensityAnalyzer()

    # Initialize sarcasm detection pipeline
    sarcasm_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    sarcasm_tokenizer = AutoTokenizer.from_pretrained(sarcasm_model_name)
    sarcasm_model = AutoModelForSequenceClassification.from_pretrained(sarcasm_model_name)
    sarcasm_detector = pipeline("text-classification", model=sarcasm_model, tokenizer=sarcasm_tokenizer)

    def is_sarcastic(text):
        truncated_text = text[:512]  # Truncate the text to the first 512 tokens
        result = sarcasm_detector(truncated_text)
        if result[0]['label'] == "NEGATIVE":
            return 1
        else:
            return 0

    # Loop through each file path in the list
    for topic_file_path in topics_files_path:
        print(f"Processing {topic_file_path}")

        # Get the file name without the extension
        file_name = os.path.splitext(os.path.basename(topic_file_path))[0]
        print(f"Topic: {file_name}")

        # Read the contents of the file into a pandas dataframe
        baseline_ret_res = pd.read_csv(topic_file_path, header=0, sep='\t')

        # Print the number of rows (premises) in the dataframe
        print(f"Number of premises in file: {baseline_ret_res.shape[0]}")

        # If there are premises in the dataframe, extract features for them
        if baseline_ret_res.shape[0] > 0:
            print("Extracting sentiment and sarcasm features...")
            premises = baseline_ret_res.premises_texts
            conclusions = baseline_ret_res.conclusion
            
            # Sentiment features
            compound_premises = []
            compound_conclusions = []
            
            for premise in premises:
                if isinstance(premise, str):
                    compound_premises.append(sentiment_analyzer.polarity_scores(premise)['compound'])
                else:
                    compound_premises.append(0)

            for conclusion in conclusions:
                if isinstance(conclusion, str):
                    compound_conclusions.append(sentiment_analyzer.polarity_scores(conclusion)['compound'])
                else:
                    compound_conclusions.append(0)

            # Sarcasm features
            sarcasm_premises = [is_sarcastic(premise) if isinstance(premise, str) else 0 for premise in premises]
            sarcasm_conclusions = [is_sarcastic(conclusion) if isinstance(conclusion, str) else 0 for conclusion in conclusions]

            # Compiling extracted features for the current topic.
            print("Compiling extracted features for the current topic...")
            argument_ids = list(baseline_ret_res['docno'])
            topic_features_dict = {'qid': file_name,
                                   'docno': argument_ids,
                                   'compound_premises': compound_premises,
                                   'compound_conclusions': compound_conclusions,
                                   'sarcasm_premises': sarcasm_premises,
                                   'sarcasm_conclusions': sarcasm_conclusions}

            # Saving extracted features for the current topic to a file
            features_df = pd.DataFrame.from_dict(topic_features_dict)
            features_file_path = os.path.join(data_path + 'Data/features_extracted_2020/sentiment_sarcasm_features', f"{file_name}_features.csv")
            features_df.to_csv(features_file_path, sep=',', index=False)

            print(f"Feature extraction completed for {file_name}!")

    print("All feature extractions completed!")

In [34]:
#Path 2020
args_20_dir_path = data_path + "Data/arguments_2020"

#Path 2021
args_21_dir_path = data_path + "Data/arguments_2021"

#Call feature extraction 2020
extract_features_sentiment_sarcasm(args_20_dir_path)

#Call feature extraction 2021
#extract_features_sentiment_sarcasm(args_21_dir_path)

Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/6.csv
Topic: 6
Number of premises in file: 1000
Extracting sentiment and sarcasm features...
Compiling extracted features for the current topic...
Feature extraction completed for 6!
Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/40.csv
Topic: 40
Number of premises in file: 1000
Extracting sentiment and sarcasm features...
Compiling extracted features for the current topic...
Feature extraction completed for 40!
Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/41.csv
Topic: 41
Number of premises in file: 1000
Extracting sentiment and sarcasm features...
Compiling extracted features for the current topic...
Feature extraction completed for 41!
Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/7.csv
Topic: 7
Number of premises in file: 1000
Extra