## Function to Extract All Features and Create SVMrank-Formatted Vectors

This function extracts features from CSV files in a given directory, concatenates them into a single dataframe, converts them to SVMrank-style feature representation, and saves them as a file without headers. It takes two arguments:

- `input_dir_path`: the directory path containing the CSV files.
- `output_svmrank_path`: the path to the output file where the SVMrank-formatted feature vectors will be saved.

The function reads every `.csv` file in the directory and treats every column other than `relevance_score`, `qid`, and `docno` as a feature. The per-file dataframes are concatenated into a single merged dataframe, missing feature values are padded with 0, and the rows are sorted by `qid`. Each row is then written as one SVMrank line of the form `<relevance_score> qid:<qid> 1:<value> 2:<value> ... #<docno>`, with no header line. Rows with a missing relevance score or query ID are not dropped by the implementation, so the input files are expected to be complete. A second function, `all_features_svm_no_relevance`, takes the same arguments and writes the same lines, but with a constant target value of `0` in place of the relevance score.

### Output
The function saves the SVMrank-formatted feature vectors as a file without headers at the specified output path.

### Required Libraries
The function requires the following Python libraries:

- pandas
- os

In [2]:
#Imports
import os
import pandas as pd

In [3]:
#Global variables
# Root directory of the project data. The cells below build every input and
# output path by appending a relative path directly to this string, so it
# must end with a path separator.
data_path = '/Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/'

In [4]:
def all_features_svm(input_dir_path, output_svmrank_path):
    """Merge every CSV in a directory and write the rows in SVMrank format.

    Every ``.csv`` file in ``input_dir_path`` must contain the columns
    ``relevance_score``, ``qid`` and ``docno``; all remaining columns are
    treated as features. The files are concatenated (columns aligned by
    name), missing feature values are replaced with 0, the rows are sorted
    by ``qid`` and each row is written as::

        <relevance_score> qid:<qid> 1:<value> 2:<value> ... #<docno>

    Features are numbered from 1 in the column order of the merged frame.

    Args:
        input_dir_path: Directory containing the feature CSV files.
        output_svmrank_path: Path of the file to write (overwritten if it
            already exists). No header line is written.

    Raises:
        ValueError: If the directory contains no ``.csv`` files.
        KeyError: If a CSV file lacks one of the required columns.
    """
    required_columns = ["relevance_score", "qid", "docno"]

    # os.listdir returns names in arbitrary order; sorting makes the output
    # reproducible from run to run.
    csv_names = sorted(
        name for name in os.listdir(input_dir_path) if name.endswith(".csv")
    )
    if not csv_names:
        raise ValueError(f"No .csv files found in '{input_dir_path}'")

    dataframes = []
    for filename in csv_names:
        df = pd.read_csv(os.path.join(input_dir_path, filename), sep=",", header=0)
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            raise KeyError(f"{filename} is missing required column(s): {missing}")
        dataframes.append(df)

    # concat aligns columns by name, so a feature that is absent from one file
    # becomes NaN for exactly that file's rows (no positional re-assembly that
    # could shift values onto the wrong rows).
    merged_df = pd.concat(dataframes, ignore_index=True, sort=False)

    # Computed once: the feature list is the same for every row.
    feature_columns = list(merged_df.columns.drop(required_columns))

    # Pad missing feature values with 0 so no "nan" ends up in the output.
    if feature_columns:
        merged_df[feature_columns] = merged_df[feature_columns].fillna(0)

    # SVMrank expects all rows of a query to be contiguous; a stable sort keeps
    # the (now deterministic) file order within each query.
    merged_df.sort_values(by="qid", kind="mergesort", inplace=True)

    with open(output_svmrank_path, "w") as f:
        for _, row in merged_df.iterrows():
            features = "".join(
                f" {j}:{row[feature_name]}"
                for j, feature_name in enumerate(feature_columns, start=1)
            )
            f.write(
                f"{row['relevance_score']} qid:{int(row['qid'])}"
                f"{features} #{row['docno']}\n"
            )

In [5]:
def all_features_svm_no_relevance(input_dir_path, output_svmrank_path):
    """Merge every CSV in a directory and write SVMrank rows with target 0.

    Every ``.csv`` file in ``input_dir_path`` must contain the columns
    ``qid`` and ``docno``. A ``relevance_score`` column may be present but is
    never written; all remaining columns are treated as features. The files
    are concatenated (columns aligned by name), missing feature values are
    replaced with 0, the rows are sorted by ``qid`` and each row is written
    as::

        0 qid:<qid> 1:<value> 2:<value> ... #<docno>

    Features are numbered from 1 in the column order of the merged frame.

    Args:
        input_dir_path: Directory containing the feature CSV files.
        output_svmrank_path: Path of the file to write (overwritten if it
            already exists). No header line is written.

    Raises:
        ValueError: If the directory contains no ``.csv`` files.
        KeyError: If a CSV file lacks the ``qid`` or ``docno`` column.
    """
    required_columns = ["qid", "docno"]

    # os.listdir returns names in arbitrary order; sorting makes the output
    # reproducible from run to run.
    csv_names = sorted(
        name for name in os.listdir(input_dir_path) if name.endswith(".csv")
    )
    if not csv_names:
        raise ValueError(f"No .csv files found in '{input_dir_path}'")

    dataframes = []
    for filename in csv_names:
        df = pd.read_csv(os.path.join(input_dir_path, filename), sep=",", header=0)
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            raise KeyError(f"{filename} is missing required column(s): {missing}")
        dataframes.append(df)

    # concat aligns columns by name, so a feature that is absent from one file
    # becomes NaN for exactly that file's rows (no positional re-assembly that
    # could shift values onto the wrong rows).
    merged_df = pd.concat(dataframes, ignore_index=True, sort=False)

    # Computed once: the feature list is the same for every row. The relevance
    # column is excluded when present but is not required, since the target
    # written by this function is always 0.
    non_feature_columns = required_columns + ["relevance_score"]
    feature_columns = list(merged_df.columns.drop(non_feature_columns, errors="ignore"))

    # Pad missing feature values with 0 so no "nan" ends up in the output.
    if feature_columns:
        merged_df[feature_columns] = merged_df[feature_columns].fillna(0)

    # SVMrank expects all rows of a query to be contiguous; a stable sort keeps
    # the (now deterministic) file order within each query.
    merged_df.sort_values(by="qid", kind="mergesort", inplace=True)

    with open(output_svmrank_path, "w") as f:
        for _, row in merged_df.iterrows():
            features = "".join(
                f" {j}:{row[feature_name]}"
                for j, feature_name in enumerate(feature_columns, start=1)
            )
            # "0" is the default target value used when relevance is unknown.
            f.write(f"0 qid:{int(row['qid'])}{features} #{row['docno']}\n")

In [None]:
# 2021 SBERT features: write one SVMrank file with the relevance scores as
# targets and one with a constant 0 target.
input_dir_path = f"{data_path}Data/merged_features_relevance_2021/sbert_features_relevance"
output_svmrank_path = f"{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_2021.svmrank"
output_svmrank_path_no_relevance = f"{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_no_relevance_2021.svmrank"

all_features_svm(input_dir_path, output_svmrank_path)
all_features_svm_no_relevance(input_dir_path, output_svmrank_path_no_relevance)

In [None]:
# 2021 sentiment/sarcasm features: write one SVMrank file with the relevance
# scores as targets and one with a constant 0 target.
input_dir_path = f"{data_path}Data/merged_features_relevance_2021/sentiment_sarcasm_features_relevance"
output_svmrank_path = f"{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_2021.svmrank"
output_svmrank_path_no_relevance = f"{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_no_relevance_2021.svmrank"

all_features_svm(input_dir_path, output_svmrank_path)
all_features_svm_no_relevance(input_dir_path, output_svmrank_path_no_relevance)

### TF-IDF doesn't work; the bug needs to be fixed

In [None]:
##input_dir_path = data_path + 'Data/merged_features_relevance_2021/tf_idf_features_relevance'
##output_svmrank_path = data_path + 'Data/svm_format_2020_2021/tf_idf_merged/merged_data_2021.svmrank'
##output_svmrank_path_no_relevance = data_path + 'Data/svm_format_2020_2021/tf_idf_merged/merged_data_no_relevance_2021.svmrank'

##all_features_svm(input_dir_path, output_svmrank_path)
##all_features_svm_no_relevance(input_dir_path,output_svmrank_path_no_relevance)

: 

: 

In [None]:
# 2020 SBERT features: write one SVMrank file with the relevance scores as
# targets and one with a constant 0 target.
input_dir_path = f"{data_path}Data/merged_features_relevance_2020/sbert_features_relevance"
output_svmrank_path = f"{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_2020.svmrank"
output_svmrank_path_no_relevance = f"{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_no_relevance_2020.svmrank"

all_features_svm(input_dir_path, output_svmrank_path)
all_features_svm_no_relevance(input_dir_path, output_svmrank_path_no_relevance)

In [None]:
# 2020 sentiment/sarcasm features: write one SVMrank file with the relevance
# scores as targets and one with a constant 0 target.
input_dir_path = f"{data_path}Data/merged_features_relevance_2020/sentiment_sarcasm_features_relevance"
output_svmrank_path = f"{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_2020.svmrank"
output_svmrank_path_no_relevance = f"{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_no_relevance_2020.svmrank"

all_features_svm(input_dir_path, output_svmrank_path)
all_features_svm_no_relevance(input_dir_path, output_svmrank_path_no_relevance)

In [None]:
# 2020 TF-IDF features: write one SVMrank file with the relevance scores as
# targets and one with a constant 0 target.
input_dir_path = f"{data_path}Data/merged_features_relevance_2020/tf_idf_features_relevance"
output_svmrank_path = f"{data_path}Data/svm_format_2020_2021/tf_idf_merged/merged_data_2020.svmrank"
output_svmrank_path_no_relevance = f"{data_path}Data/svm_format_2020_2021/tf_idf_merged/merged_data_no_relevance_2020.svmrank"

all_features_svm(input_dir_path, output_svmrank_path)
all_features_svm_no_relevance(input_dir_path, output_svmrank_path_no_relevance)