In [25]:
#Imports
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
#Global variables
# Root directory of the project checkout. All other paths in this notebook are
# built by string concatenation (data_path + "Data/..."), so the value must end
# with a path separator; os.path.join(<root>, '') guarantees that.
# Set the DISSERTATION_DATA_PATH environment variable to run the notebook on a
# different machine; the original location is kept as the default.
data_path = os.path.join(
    os.environ.get(
        'DISSERTATION_DATA_PATH',
        '/Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/',
    ),
    '',
)

In [27]:
def extract_tfidf_features(dir_path, num_docs=None, output_dir=None):
    """Extract TF-IDF features for every topic file in a directory.

    Each file in ``dir_path`` whose name ends with ``.csv`` is read as a
    tab-separated table with a header row. The ``premises_texts`` and
    ``conclusion`` columns are concatenated (all premises first, then all
    conclusions), a fresh TF-IDF model is fitted on that combined list, and the
    resulting matrix is written to ``<topic>_tfidf_features.csv`` together with
    a ``qid`` column (the file name without extension) and a ``docno`` column
    (the file's ``docno`` values, repeated twice to line up with the combined
    texts). Files with zero rows are reported but produce no output.

    Parameters
    ----------
    dir_path : str
        Directory containing the topic files.
    num_docs : int, optional
        If given, only the first ``num_docs`` files (in sorted file-name
        order) are processed.
    output_dir : str, optional
        Directory the feature files are written to. Defaults to
        ``data_path + 'Data/features_extracted_2020/tf_idf_features'``, which
        is where this function has always written. Pass a different directory
        when processing another collection so that its features do not end up
        in (or overwrite files in) the 2020 folder.

    Returns
    -------
    None
        Results are written to disk; progress is printed to stdout.
    """
    # Collect the '.csv' topic files. os.listdir() returns entries in arbitrary
    # order, so sort them to make the `num_docs` subset reproducible.
    topics_files_path = [
        os.path.join(dir_path, filename)
        for filename in sorted(os.listdir(dir_path))
        if filename.endswith('.csv')
    ]

    # If num_docs argument is provided, limit the list to the first 'num_docs' files
    if num_docs is not None:
        topics_files_path = topics_files_path[:num_docs]

    # Resolve the output directory and make sure it exists before writing.
    if output_dir is None:
        output_dir = os.path.join(data_path + 'Data/features_extracted_2020/tf_idf_features')
    os.makedirs(output_dir, exist_ok=True)

    # Names of the identifier columns placed in front of the TF-IDF columns.
    id_columns = ('qid', 'docno')

    # Loop through each file path in the list
    for topic_file_path in topics_files_path:
        print(f"Processing {topic_file_path}")

        # Get the file name without the extension
        file_name = os.path.splitext(os.path.basename(topic_file_path))[0]
        print(f"Topic: {file_name}")

        # Read the contents of the file into a pandas dataframe
        baseline_ret_res = pd.read_csv(topic_file_path, header=0, sep='\t')

        # Print the number of rows (premises) in the dataframe
        print(f"Number of premises in file: {baseline_ret_res.shape[0]}")

        # If there are premises in the dataframe, extract features for them
        if baseline_ret_res.shape[0] > 0:
            print("Extracting TF-IDF features for premises and conclusions...")
            premises = baseline_ret_res.premises_texts.fillna('')  # Replace NaN values with empty strings
            conclusions = baseline_ret_res.conclusion.fillna('')   # Replace NaN values with empty strings

            # Combine premises and conclusions into a single list
            combined_texts = list(premises) + list(conclusions)

            # Fit a fresh TF-IDF model on this topic's texts only
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(combined_texts)

            # A vocabulary term equal to an identifier column name would
            # otherwise clobber that column; give such terms a 'tfidf_' prefix.
            feature_names = [
                f"tfidf_{term}" if term in id_columns else term
                for term in vectorizer.get_feature_names_out()
            ]

            # One dense conversion instead of slicing the sparse matrix once
            # per vocabulary term.
            tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

            # Compiling extracted features for the current topic.
            print("Compiling extracted features for the current topic...")
            argument_ids = list(baseline_ret_res['docno']) * 2  # Adjust the length to match combined_texts
            ids_df = pd.DataFrame({'qid': [file_name] * len(combined_texts),
                                   'docno': argument_ids})
            features_df = pd.concat([ids_df, tfidf_df], axis=1)

            # Saving extracted features for the current topic to a file
            features_file_path = os.path.join(output_dir, f"{file_name}_tfidf_features.csv")
            features_df.to_csv(features_file_path, sep=',', index=False)

            print(f"TF-IDF feature extraction completed for {file_name}!")

    print("All TF-IDF feature extractions completed!")

In [28]:
# Input directories for the 2020 and 2021 argument files, relative to the project root.
args_20_dir_path = f"{data_path}Data/arguments_2020"
args_21_dir_path = f"{data_path}Data/arguments_2021"

# Run the feature extraction on the 2020 arguments.
extract_tfidf_features(args_20_dir_path)

# The 2021 run is currently disabled.
# NOTE(review): as called here, extract_tfidf_features writes into a
# 'features_extracted_2020' folder, so enabling the line below unchanged would
# place the 2021 features in that same folder -- confirm the intended output
# location before uncommenting.
#extract_tfidf_features(args_21_dir_path)

Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/6.csv
Topic: 6
Number of premises in file: 1000
Extracting TF-IDF features for premises and conclusions...
Compiling extracted features for the current topic...
TF-IDF feature extraction completed for 6!
Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/40.csv
Topic: 40
Number of premises in file: 1000
Extracting TF-IDF features for premises and conclusions...
Compiling extracted features for the current topic...
TF-IDF feature extraction completed for 40!
Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/arguments_2020/41.csv
Topic: 41
Number of premises in file: 1000
Extracting TF-IDF features for premises and conclusions...
Compiling extracted features for the current topic...
TF-IDF feature extraction completed for 41!
Processing /Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/Data/argum