# `make_runfile` Function Explanation

This code defines a function called `make_runfile` that takes four arguments: `input_file_path`, `output_dir`, `output_file_name`, and `run_tag`. 
The function reads data from the input file, processes the lines, sorts the data by topic ID and score, and writes the sorted data to the output file.

## Steps

### 1. Create the Output Directory

If the output directory doesn't exist, it is created using the `os.makedirs` function with the `exist_ok=True` flag.

### 2. Read the Input File

The input file is read, and the lines are stored in a list called `lines`.

### 3. Process the Lines

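Each line is processed using regular expressions to extract the score, topic ID, and argument ID. These values are stored in a list of dictionaries called `data`; a line whose (topic ID, argument ID) pair has already been seen is skipped. The rank is not read from the input: it is assigned when the output is written, based on the sorted order.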

### 4. Sort the Data

The data is sorted by topic ID (low to high) and score (high to low). The score is converted to a float to ensure proper sorting.

### 5. Write the Sorted Data to the Output File

The sorted data is written to the output file. Each line in the output file contains the topic ID, a fixed value of 0, the argument ID, the rank, the score, and the run tag, all separated by tabs.


In [1]:
#Imports
import os
import re
import pandas as pd

In [2]:
#Global variables
data_path = '/Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/'

In [3]:
def make_runfile(input_file_path, output_dir, output_file_name, run_tag):
    """Convert a re-ranked score file into a tab-separated run file.

    Every non-blank input line must start with a numeric score, contain a
    ``qid:<digits>`` field and end with ``#<argument id>``. Only the first
    occurrence of each (topic ID, argument ID) pair is kept.

    The output has one line per kept entry, ordered by topic ID (ascending)
    and then score (descending)::

        topic_id <TAB> 0 <TAB> argument_id <TAB> rank <TAB> score <TAB> run_tag

    ``rank`` starts at 1 for every topic. The score is written exactly as it
    appears in the input.

    Args:
        input_file_path: Path of the re-ranked score file to read.
        output_dir: Directory for the run file; created if it is missing.
        output_file_name: File name of the run file inside ``output_dir``.
        run_tag: Label written in the last column of every output line.

    Raises:
        ValueError: If a non-blank line lacks a score, ``qid:`` field or
            ``#`` argument ID, or if its score is not a valid number.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, output_file_name)

    # Compiled once, outside the loop. The score pattern accepts a sign and
    # an exponent, because ranking scores can be negative or tiny.
    score_pattern = re.compile(r'^\s*([-+]?[\d.]+(?:[eE][-+]?\d+)?)')
    topic_pattern = re.compile(r'qid:(\d+)')
    argument_pattern = re.compile(r'#(.+)$')

    data = []
    seen = set()
    with open(input_file_path, 'r') as input_file:
        for line_number, line in enumerate(input_file, start=1):
            # Blank lines (e.g. a trailing newline at end of file) carry no data
            if not line.strip():
                continue

            score_match = score_pattern.search(line)
            topic_match = topic_pattern.search(line)
            argument_match = argument_pattern.search(line)
            if not (score_match and topic_match and argument_match):
                raise ValueError(
                    f"{input_file_path}:{line_number}: cannot parse line "
                    f"{line.rstrip()!r}"
                )

            score = score_match.group(1)
            try:
                score_value = float(score)
            except ValueError:
                raise ValueError(
                    f"{input_file_path}:{line_number}: invalid score {score!r}"
                ) from None

            topic_id = topic_match.group(1)
            # Surrounding whitespace would corrupt the tab-separated columns
            argument_id = argument_match.group(1).strip()

            # Skip duplicates
            if (topic_id, argument_id) in seen:
                continue
            seen.add((topic_id, argument_id))

            data.append({
                'score': score,
                'score_value': score_value,
                'topic_id': topic_id,
                'topic_number': int(topic_id),
                'argument_id': argument_id,
            })

    # Sort by topic (low to high) and score (high to low). The sort is stable,
    # so entries with equal scores keep their input order.
    data.sort(key=lambda item: (item['topic_number'], -item['score_value']))

    # Write the sorted data, restarting the rank at 1 for every topic
    with open(output_file_path, 'w') as f:
        current_topic = None
        rank = 0
        for item in data:
            if item['topic_number'] != current_topic:
                current_topic = item['topic_number']
                rank = 0
            rank += 1
            f.write(
                f"{item['topic_id']}\t0\t{item['argument_id']}\t{rank}"
                f"\t{item['score']}\t{run_tag}\n"
            )

In [4]:
# Build the 2020 and 2021 run files from the LambdaMart re-ranked files in
# linguistic_sentiment_re_ranked. Both run files go to the same directory.
output_dir = f"{data_path}Data/runfiles_2020_2021/linguistic_sentiment_runfiles"

# 2020: input file, output name and run tag
input_file_path_2020 = f"{data_path}Data/re_ranked_2020_2021/linguistic_sentiment_re_ranked/re_ranked_2020_LambdaMart.txt"
output_file_name_2020 = "runfile_2020_LambdaMart.txt"
run_tag_2020 = "linguistic_sentiment_2020_LambdaMart"

# 2021: input file, output name and run tag
input_file_path_2021 = f"{data_path}Data/re_ranked_2020_2021/linguistic_sentiment_re_ranked/re_ranked_2021_LambdaMart.txt"
output_file_name_2021 = "runfile_2021_LambdaMart.txt"
run_tag_2021 = "linguistic_sentiment_2021_LambdaMart"

make_runfile(
    input_file_path=input_file_path_2020,
    output_dir=output_dir,
    output_file_name=output_file_name_2020,
    run_tag=run_tag_2020,
)
make_runfile(
    input_file_path=input_file_path_2021,
    output_dir=output_dir,
    output_file_name=output_file_name_2021,
    run_tag=run_tag_2021,
)

In [5]:
# Build the 2020 and 2021 run files from the LambdaMart re-ranked files in
# sbert_re_ranked. Both run files go to the same directory.
output_dir = f"{data_path}Data/runfiles_2020_2021/sbert_runfiles"

# 2020: input file, output name and run tag
input_file_path_2020 = f"{data_path}Data/re_ranked_2020_2021/sbert_re_ranked/re_ranked_2020_LambdaMart.txt"
output_file_name_2020 = "runfile_2020_LambdaMart.txt"
run_tag_2020 = "SBERT_2020_LambdaMart"

# 2021: input file, output name and run tag
input_file_path_2021 = f"{data_path}Data/re_ranked_2020_2021/sbert_re_ranked/re_ranked_2021_LambdaMart.txt"
output_file_name_2021 = "runfile_2021_LambdaMart.txt"
run_tag_2021 = "SBERT_2021_LambdaMart"

make_runfile(
    input_file_path=input_file_path_2020,
    output_dir=output_dir,
    output_file_name=output_file_name_2020,
    run_tag=run_tag_2020,
)
make_runfile(
    input_file_path=input_file_path_2021,
    output_dir=output_dir,
    output_file_name=output_file_name_2021,
    run_tag=run_tag_2021,
)

In [6]:
# Build the 2020 and 2021 run files from the LambdaMart re-ranked files in
# sentiment_sarcasm_re_ranked. Both run files go to the same directory.
output_dir = f"{data_path}Data/runfiles_2020_2021/sentiment_sarcasm_runfiles"

# 2020: input file, output name and run tag
input_file_path_2020 = f"{data_path}Data/re_ranked_2020_2021/sentiment_sarcasm_re_ranked/re_ranked_2020_LambdaMart.txt"
output_file_name_2020 = "runfile_2020_LambdaMart.txt"
run_tag_2020 = "sarcasm_sentiment_2020_LambdaMart"

# 2021: input file, output name and run tag
input_file_path_2021 = f"{data_path}Data/re_ranked_2020_2021/sentiment_sarcasm_re_ranked/re_ranked_2021_LambdaMart.txt"
output_file_name_2021 = "runfile_2021_LambdaMart.txt"
run_tag_2021 = "sarcasm_sentiment_2021_LambdaMart"

make_runfile(
    input_file_path=input_file_path_2020,
    output_dir=output_dir,
    output_file_name=output_file_name_2020,
    run_tag=run_tag_2020,
)
make_runfile(
    input_file_path=input_file_path_2021,
    output_dir=output_dir,
    output_file_name=output_file_name_2021,
    run_tag=run_tag_2021,
)

# Generate baseline runfiles

In [None]:
# Merge the per-file retrieval results in `folder_path` into a single
# space-separated run file with the columns: Topic Q0 ID Rank Score Tag.
folder_path = data_path + 'Data/arguments_retrieved_2020'
output_file = data_path + 'Data/runfiles_2020_2021/baseline_runfiles/baseline_runfile_2020.txt'
tag = 'baseline_runfile_2020'

# Read every tab-separated .csv file in the folder. os.listdir returns names
# in arbitrary order, so they are sorted to make the row order (and with it
# the tie-breaking between equal scores) reproducible.
frames = [
    pd.read_csv(os.path.join(folder_path, file), sep='\t')
    for file in sorted(os.listdir(folder_path))
    if file.endswith('.csv')
]
if not frames:
    raise FileNotFoundError(f"No .csv files found in {folder_path}")

# Concatenate once instead of growing a DataFrame inside the loop
merged_df = pd.concat(frames, ignore_index=True)

merged_df['Tag'] = tag
merged_df['Q0'] = 'Q0'
# method='first' gives every row of a topic a distinct rank 1, 2, 3, ...
# (the default 'average' yields fractional ranks for tied scores). The
# nullable integer dtype writes "1" rather than "1.0" and leaves the rank
# empty for rows without a score.
merged_df['Rank'] = (
    merged_df.groupby('qid')['score']
    .rank(method='first', ascending=False)
    .astype('Int64')
)
merged_df = merged_df[['qid', 'Q0', 'docno', 'Rank', 'score', 'Tag']]
merged_df.columns = ['Topic', 'Q0', 'ID', 'Rank', 'Score', 'Tag']
merged_df = merged_df.sort_values(by=['Topic', 'Rank'], ascending=[True, True])

# to_csv does not create missing directories
os.makedirs(os.path.dirname(output_file), exist_ok=True)
merged_df.to_csv(output_file, sep=' ', index=False, header=False)