In [3]:
import os
import re
from pathlib import Path

import pandas as pd
from unidecode import unidecode


### Define processing function

- **Reads and parses** the input files: 'authors.txt' and a file with class labels for each paper
- **Cleans and processes** author names, removing accents and special characters
- **Creates a dictionary** of authors and their associated classes
- **Constructs a DataFrame** with columns for 'paper_id', 'authors', 'classes', class counts, and class weights
- **Calculates** the class counts and class weights for each paper's authors
- **Saves** the resulting DataFrame to a CSV file
- **Returns** the DataFrame for further use or analysis

In [14]:
def _read_paper_authors(authors_path):
    '''Parse 'paper_id||author1,author2,...' lines into {paper_id: [cleaned author names]}.'''
    strip_digits = str.maketrans("", "", "0123456789")
    paper_authors = {}
    # Decode explicitly as UTF-8: with the platform default encoding (e.g. cp1252
    # on Windows) accented names are mis-decoded before unidecode can
    # transliterate them.
    with open(authors_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # Tolerate blank / trailing lines instead of crashing on them
                continue
            fields = line.split('||')
            if len(fields) < 2:
                raise ValueError(
                    f"{authors_path}, line {line_number}: expected 'paper_id||authors', got {line.strip()!r}"
                )
            # Cleaning: transliterate to ASCII, lowercase, drop digits, trim whitespace
            paper_authors[int(fields[0])] = [
                unidecode(name).lower().translate(strip_digits).strip()
                for name in fields[1].strip().split(",")
            ]
    return paper_authors


def _read_paper_classes(y_train_path):
    '''Parse 'paper_id,class_label' lines into {paper_id: class_label}.'''
    paper_classes = {}
    with open(y_train_path, 'r', encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue
            paper_id, class_label = line.strip().split(',')
            paper_classes[int(paper_id)] = int(class_label)
    return paper_classes


def process_authors_data(y_train_path: Path, authors_path: Path, output_csv_path: Path, n_classes: int = 5) -> pd.DataFrame:
    '''
    Build per-paper class features from the class labels of each paper's authors.

    Parameters
    ----------
    y_train_path : path of a text file with one 'paper_id,class_label' pair per line.
    authors_path : path of a text file with one 'paper_id||author1,author2,...' entry per line.
    output_csv_path : path the resulting table is written to (CSV, without the index).
    n_classes : number of classes; labels are expected in range(n_classes).
        Labels outside that range are ignored when counting.

    Returns
    -------
    pd.DataFrame with one row per paper of the authors file and the columns
    'paper_id', 'authors' (cleaned names joined by ','), 'classes' (labels of
    every labelled paper of each author; ', ' within an author, ',' between
    authors), 'class0'..'class{n-1}' (label counts) and
    'class0_weight'..'class{n-1}_weight' (counts divided by their total, or 0.0
    when the paper's authors have no labelled paper).

    Raises
    ------
    ValueError if a non-blank line of either input file is malformed.

    NOTE(review): a labelled paper's own label is part of its authors' label
    lists, so its features include its own target - confirm this is intended
    before training on them.
    '''

    paper_authors = _read_paper_authors(authors_path)
    paper_classes = _read_paper_classes(y_train_path)

    # Map every author to the labels of all labelled papers they appear on
    authors_class_dict = {}
    for paper_id, authors in paper_authors.items():
        if paper_id not in paper_classes:
            continue
        class_label = paper_classes[paper_id]
        for author in authors:
            # An empty name means "no author listed"; registering it would make
            # every author-less paper share one pseudo-author and inherit the
            # labels of unrelated papers.
            if author:
                authors_class_dict.setdefault(author, []).append(class_label)

    count_columns = [f'class{i}' for i in range(n_classes)]
    weight_columns = [f'class{i}_weight' for i in range(n_classes)]
    columns = ['paper_id', 'authors', 'classes'] + count_columns + weight_columns

    rows = []
    for paper_id, authors in paper_authors.items():
        labels_per_author = [authors_class_dict.get(author, []) for author in authors]

        # Count from the label lists directly rather than re-parsing the
        # 'classes' display string.
        class_counts = [0] * n_classes
        for labels in labels_per_author:
            for label in labels:
                if 0 <= label < n_classes:
                    class_counts[label] += 1

        total_counts = sum(class_counts)
        if total_counts > 0:
            class_weights = [count / total_counts for count in class_counts]
        else:
            class_weights = [0.0] * n_classes

        classes_text = ','.join(', '.join(map(str, labels)) for labels in labels_per_author)
        rows.append([paper_id, ','.join(authors), classes_text] + class_counts + class_weights)

    df = pd.DataFrame(rows, columns=columns)
    # Pin the numeric dtypes so an empty input still yields a well-typed frame
    df = df.astype({**{col: 'int64' for col in count_columns},
                    **{col: 'float64' for col in weight_columns}})

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv_path, index=False)
    return df

In [16]:
# Data directory: set the DATA_CHALLENGE_DIR environment variable to point the
# notebook at your own copy of the data; the original author's local path is
# kept as the fallback so existing setups keep working.
data_directory = Path(os.environ.get(
    "DATA_CHALLENGE_DIR",
    "E:/panag/Desktop/Ms Data Science/6 Quarter/Data Science Challenge/data_challenge_aueb_2023",
))

# Train data
y_train_file = "y_train.txt"
# Authors data
authors_file = "authors.txt"

y_train_path = data_directory / y_train_file
authors_path = data_directory / authors_file
# Written relative to the current working directory
output_csv_path = Path("features_authors.csv")

df = process_authors_data(y_train_path, authors_path, output_csv_path)

df.head(5)

Unnamed: 0,paper_id,authors,classes,class0,class1,class2,class3,class4,class0_weight,class1_weight,class2_weight,class3_weight,class4_weight
0,0,"junchi yan,jian liu,yin li,zhibin niu,yuncai liu","0, 3, 1, 0, 1, 0, 1, 3, 1, 0, 1, 1, 1, 1, 1, 1...",4,24,0,7,0,0.114286,0.685714,0.0,0.2,0.0
1,1,"mehdi m. kashani,eric joanis,roland kuhn,georg...",",2,2, 2, 2, 2, 2,2, 2, 2, 2,",0,0,10,0,0,0.0,0.0,1.0,0.0,0.0
2,2,"amir abboud,arturs backurs,virginia vassilevsk...","4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,4, 4, 4, 4,...",1,0,0,0,32,0.030303,0.0,0.0,0.0,0.969697
3,3,"david c. anastasiu,byron j. gao,david buttler","3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, ...",0,0,0,19,0,0.0,0.0,0.0,1.0,0.0
4,4,giulia pagallo,,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write a short, bulleted description of what this function does.