### Function 1: `Adding document IDs to score files`

It takes in three file paths as input arguments: `ranking_file`, `scores_file`, and `output_file`. 

- `ranking_file` is a text file with one document per line, where each line ends with a "#" symbol followed by the document ID. The function extracts the document ID (the text after the last "#") from each line.
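- `scores_file` is a text file containing a list of scores, where each score corresponds to the document at the same line number in `ranking_file`. The function reads the two files in lockstep, pairing each score line with the ranking line at the same position; if one file is longer than the other, its extra lines are ignored.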
- The function then writes out a new file at `output_file`, where each line corresponds to a document in `ranking_file` and consists of the line from `scores_file` followed by a space, a "#" symbol, and the document ID.

### Function 2: `Re-ranking documents`

It takes in three file paths as input arguments: `argument_scorefile_with_doc_ids`, `merged_data_no_relevance`, and `output_file`.

- `argument_scorefile_with_doc_ids` is a text file containing a list of scores and their corresponding document IDs, separated by "#" symbols. The function reads in these scores and document IDs and stores them in memory.
- `merged_data_no_relevance` is a text file containing a list of data points in SVMrank format, where each line corresponds to a document and contains a query ID and a feature vector, followed by a "#" symbol and the document ID. The function reads in this data and modifies it to include the scores from `argument_scorefile_with_doc_ids`, placing each score at the start of the matching document's line (before the query ID); documents without a score are left out.
- The function then reorders the data based on the query ID and the new scores (from high to low), and writes out a new file at `output_file` with the reordered data.

If there are no document IDs that match between `argument_scorefile_with_doc_ids` and `merged_data_no_relevance`, the function will print a message indicating that no matching IDs were found and will not write `output_file`.

In [1]:
#Imports
import os

In [2]:
# Global variables
# Absolute root directory of the project. Every path in this file is built by
# appending a relative 'Data/...' suffix directly to this string, so the
# trailing slash is required.
data_path = '/Users/balazs/Desktop/dissertationProjectCode/dissertationCodeBase/'

In [3]:
def add_argument_ids_to_scorefile(ranking_file, scores_file, output_file):
    """Write a copy of the score file with a document ID attached to each line.

    The two input files are walked in parallel: line N of ``scores_file`` is
    paired with line N of ``ranking_file``. Pairing stops as soon as either
    file runs out of lines.

    Args:
        ranking_file: Path to a text file whose lines carry the document ID
            after their last "#" character.
        scores_file: Path to a text file with one score line per document.
        output_file: Path of the file to create (or overwrite). Each line is
            the stripped score line, a space, "#", and the document ID.
    """
    with open(ranking_file, 'r') as ranking_handle, \
            open(scores_file, 'r') as scores_handle, \
            open(output_file, 'w') as destination:
        for ranking_line, score_line in zip(ranking_handle, scores_handle):
            # Everything after the last '#' is the document ID; a line without
            # any '#' is used in full.
            document_id = ranking_line.rpartition('#')[2].strip()
            destination.write(score_line.strip() + ' #' + document_id + '\n')

In [4]:
def re_rank_svmrank_data(argument_scorefile_with_doc_ids, merged_data_no_relevance, output_file):
    """Re-rank feature lines by the predicted score of their document.

    Blank lines in either input file are skipped. Feature lines whose document
    ID has no entry in the score file are dropped. If no feature line matches,
    a message is printed and ``output_file`` is not written.

    Args:
        argument_scorefile_with_doc_ids: Path to a score file whose lines have
            the shape ``<field> <field> <score> #<doc_id>``; the third
            whitespace-separated field before the "#" is parsed as the score.
        merged_data_no_relevance: Path to a feature file whose lines have the
            shape ``<qid> <features...> #<doc_id>``.
        output_file: Path of the file to write. Each line has the shape
            ``<score> <qid> <features...> #<doc_id>``, ordered by qid and then
            by score from high to low.

    Raises:
        IndexError: If a non-blank line has no "#", a score line has fewer
            than three fields before the "#", or a matched feature line has
            nothing before the "#".
        ValueError: If the score field cannot be parsed as a float.
    """
    # Map each document ID to its predicted score. When a document ID occurs
    # more than once, the last occurrence wins.
    scores = {}
    with open(argument_scorefile_with_doc_ids, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (typically trailing) line carries no score; indexing
                # into it would raise IndexError.
                continue
            parts = line.split("#")
            doc_id = parts[1].strip()
            scores[doc_id] = float(parts[0].split()[2])

    # Collect (qid, score, rewritten line) for every feature line that has a
    # predicted score.
    merged_data = []
    with open(merged_data_no_relevance, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("#")
            doc_id = parts[1].strip()
            if doc_id not in scores:
                continue
            score = scores[doc_id]
            fields = parts[0].split()
            qid = fields[0]
            features = " ".join(fields[1:])
            new_line = f"{score} {qid} {features} #{doc_id}"
            merged_data.append((qid, score, new_line))

    if not merged_data:
        print("No matching IDs found between the two files.")
        return

    # qid is compared as a string; within one qid the highest score comes
    # first. The sort is stable, so ties keep their input order.
    merged_data.sort(key=lambda entry: (entry[0], -entry[1]))

    with open(output_file, "w") as f:
        for _, _, line in merged_data:
            f.write(line + "\n")

# Linguistic & Sentiment

In [5]:
# 2021: ranking input, raw scorefile, and scorefile-with-IDs output
ranking_file_2021 = f'{data_path}Data/svm_format_2020_2021/linguistic_sentiment_merged/merged_data_2021.svmrank'
scores_file_2021 = f'{data_path}Data/scorefiles_2020_2021/linguistic_sentiment_scorefiles/argument2021_scorefile_LambdaMart.txt'
output_file_2021 = f'{data_path}Data/scorefiles_2020_2021/linguistic_sentiment_scorefiles/argument2021_scorefile_with_doc_ids_LambdaMart.txt'

# 2020: ranking input, raw scorefile, and scorefile-with-IDs output
ranking_file_2020 = f'{data_path}Data/svm_format_2020_2021/linguistic_sentiment_merged/merged_data_2020.svmrank'
scores_file_2020 = f'{data_path}Data/scorefiles_2020_2021/linguistic_sentiment_scorefiles/argument2020_scorefile_LambdaMart.txt'
output_file_2020 = f'{data_path}Data/scorefiles_2020_2021/linguistic_sentiment_scorefiles/argument2020_scorefile_with_doc_ids_LambdaMart.txt'

# Attach document IDs to the 2021 scorefile
add_argument_ids_to_scorefile(
    ranking_file=ranking_file_2021,
    scores_file=scores_file_2021,
    output_file=output_file_2021,
)

# Attach document IDs to the 2020 scorefile
add_argument_ids_to_scorefile(
    ranking_file=ranking_file_2020,
    scores_file=scores_file_2020,
    output_file=output_file_2020,
)

In [6]:
# 2021: scorefile with IDs, feature file without relevance labels, re-ranked output
argument2021_scorefile_with_doc_ids = f'{data_path}Data/scorefiles_2020_2021/linguistic_sentiment_scorefiles/argument2021_scorefile_with_doc_ids_LambdaMart.txt'
merged_data_no_relevance_2021 = f'{data_path}Data/svm_format_2020_2021/linguistic_sentiment_merged/merged_data_no_relevance_2021.svmrank'
re_ranked_2021 = f'{data_path}Data/re_ranked_2020_2021/linguistic_sentiment_re_ranked/re_ranked_2021_LambdaMart.txt'

# 2020: scorefile with IDs, feature file without relevance labels, re-ranked output
argument2020_scorefile_with_doc_ids = f'{data_path}Data/scorefiles_2020_2021/linguistic_sentiment_scorefiles/argument2020_scorefile_with_doc_ids_LambdaMart.txt'
merged_data_no_relevance_2020 = f'{data_path}Data/svm_format_2020_2021/linguistic_sentiment_merged/merged_data_no_relevance_2020.svmrank'
re_ranked_2020 = f'{data_path}Data/re_ranked_2020_2021/linguistic_sentiment_re_ranked/re_ranked_2020_LambdaMart.txt'

# Produce the 2020 re-ranking first
re_rank_svmrank_data(
    argument_scorefile_with_doc_ids=argument2020_scorefile_with_doc_ids,
    merged_data_no_relevance=merged_data_no_relevance_2020,
    output_file=re_ranked_2020,
)

# Then the 2021 re-ranking
re_rank_svmrank_data(
    argument_scorefile_with_doc_ids=argument2021_scorefile_with_doc_ids,
    merged_data_no_relevance=merged_data_no_relevance_2021,
    output_file=re_ranked_2021,
)


# SBERT

In [7]:
# 2021: ranking input, raw scorefile, and scorefile-with-IDs output
ranking_file_2021 = f'{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_2021.svmrank'
scores_file_2021 = f'{data_path}Data/scorefiles_2020_2021/sbert_scorefiles/argument2021_scorefile_LambdaMart.txt'
output_file_2021 = f'{data_path}Data/scorefiles_2020_2021/sbert_scorefiles/argument2021_scorefile_with_doc_ids_LambdaMart.txt'

# 2020: ranking input, raw scorefile, and scorefile-with-IDs output
ranking_file_2020 = f'{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_2020.svmrank'
scores_file_2020 = f'{data_path}Data/scorefiles_2020_2021/sbert_scorefiles/argument2020_scorefile_LambdaMart.txt'
output_file_2020 = f'{data_path}Data/scorefiles_2020_2021/sbert_scorefiles/argument2020_scorefile_with_doc_ids_LambdaMart.txt'

# Attach document IDs to the 2021 scorefile
add_argument_ids_to_scorefile(
    ranking_file=ranking_file_2021,
    scores_file=scores_file_2021,
    output_file=output_file_2021,
)

# Attach document IDs to the 2020 scorefile
add_argument_ids_to_scorefile(
    ranking_file=ranking_file_2020,
    scores_file=scores_file_2020,
    output_file=output_file_2020,
)

In [8]:
# 2021: scorefile with IDs, feature file without relevance labels, re-ranked output
argument2021_scorefile_with_doc_ids = f'{data_path}Data/scorefiles_2020_2021/sbert_scorefiles/argument2021_scorefile_with_doc_ids_LambdaMart.txt'
merged_data_no_relevance_2021 = f'{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_no_relevance_2021.svmrank'
re_ranked_2021 = f'{data_path}Data/re_ranked_2020_2021/sbert_re_ranked/re_ranked_2021_LambdaMart.txt'

# 2020: scorefile with IDs, feature file without relevance labels, re-ranked output
argument2020_scorefile_with_doc_ids = f'{data_path}Data/scorefiles_2020_2021/sbert_scorefiles/argument2020_scorefile_with_doc_ids_LambdaMart.txt'
merged_data_no_relevance_2020 = f'{data_path}Data/svm_format_2020_2021/sbert_merged/merged_data_no_relevance_2020.svmrank'
re_ranked_2020 = f'{data_path}Data/re_ranked_2020_2021/sbert_re_ranked/re_ranked_2020_LambdaMart.txt'

# Produce the 2020 re-ranking first
re_rank_svmrank_data(
    argument_scorefile_with_doc_ids=argument2020_scorefile_with_doc_ids,
    merged_data_no_relevance=merged_data_no_relevance_2020,
    output_file=re_ranked_2020,
)

# Then the 2021 re-ranking
re_rank_svmrank_data(
    argument_scorefile_with_doc_ids=argument2021_scorefile_with_doc_ids,
    merged_data_no_relevance=merged_data_no_relevance_2021,
    output_file=re_ranked_2021,
)


# Sentiment & Sarcasm

In [9]:
# 2021: ranking input, raw scorefile, and scorefile-with-IDs output
ranking_file_2021 = f'{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_2021.svmrank'
scores_file_2021 = f'{data_path}Data/scorefiles_2020_2021/sentiment_sarcasm_scorefiles/argument2021_scorefile_LambdaMart.txt'
output_file_2021 = f'{data_path}Data/scorefiles_2020_2021/sentiment_sarcasm_scorefiles/argument2021_scorefile_with_doc_ids_LambdaMart.txt'

# 2020: ranking input, raw scorefile, and scorefile-with-IDs output
ranking_file_2020 = f'{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_2020.svmrank'
scores_file_2020 = f'{data_path}Data/scorefiles_2020_2021/sentiment_sarcasm_scorefiles/argument2020_scorefile_LambdaMart.txt'
output_file_2020 = f'{data_path}Data/scorefiles_2020_2021/sentiment_sarcasm_scorefiles/argument2020_scorefile_with_doc_ids_LambdaMart.txt'

# Attach document IDs to the 2021 scorefile
add_argument_ids_to_scorefile(
    ranking_file=ranking_file_2021,
    scores_file=scores_file_2021,
    output_file=output_file_2021,
)

# Attach document IDs to the 2020 scorefile
add_argument_ids_to_scorefile(
    ranking_file=ranking_file_2020,
    scores_file=scores_file_2020,
    output_file=output_file_2020,
)

In [10]:
# 2021: scorefile with IDs, feature file without relevance labels, re-ranked output
argument2021_scorefile_with_doc_ids = f'{data_path}Data/scorefiles_2020_2021/sentiment_sarcasm_scorefiles/argument2021_scorefile_with_doc_ids_LambdaMart.txt'
merged_data_no_relevance_2021 = f'{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_no_relevance_2021.svmrank'
re_ranked_2021 = f'{data_path}Data/re_ranked_2020_2021/sentiment_sarcasm_re_ranked/re_ranked_2021_LambdaMart.txt'

# 2020: scorefile with IDs, feature file without relevance labels, re-ranked output
argument2020_scorefile_with_doc_ids = f'{data_path}Data/scorefiles_2020_2021/sentiment_sarcasm_scorefiles/argument2020_scorefile_with_doc_ids_LambdaMart.txt'
merged_data_no_relevance_2020 = f'{data_path}Data/svm_format_2020_2021/sentiment_sarcasm_merged/merged_data_no_relevance_2020.svmrank'
re_ranked_2020 = f'{data_path}Data/re_ranked_2020_2021/sentiment_sarcasm_re_ranked/re_ranked_2020_LambdaMart.txt'

# Produce the 2020 re-ranking first
re_rank_svmrank_data(
    argument_scorefile_with_doc_ids=argument2020_scorefile_with_doc_ids,
    merged_data_no_relevance=merged_data_no_relevance_2020,
    output_file=re_ranked_2020,
)

# Then the 2021 re-ranking
re_rank_svmrank_data(
    argument_scorefile_with_doc_ids=argument2021_scorefile_with_doc_ids,
    merged_data_no_relevance=merged_data_no_relevance_2021,
    output_file=re_ranked_2021,
)