In [1]:
import pandas as pd
from pathlib import Path


In [8]:
import os

# Default location of the challenge data. It can be overridden without editing
# the notebook by setting the DATA_CHALLENGE_DIR environment variable.
DEFAULT_DATA_DIRECTORY = "E:/panag/Desktop/Ms Data Science/6 Quarter/Data Science Challenge/data_challenge_aueb_2023"


def resolve_data_directory(default=DEFAULT_DATA_DIRECTORY):
    """Return the directory that holds the challenge data files.

    Args:
        default: path used when DATA_CHALLENGE_DIR is unset or empty.

    Returns:
        A pathlib.Path built from DATA_CHALLENGE_DIR when that environment
        variable has a non-empty value, otherwise from `default`.
    """
    return Path(os.environ.get("DATA_CHALLENGE_DIR") or default)


# Set the absolute path to your data directory
data_directory = resolve_data_directory()

# Train data
y_train_file = "y_train.txt"
# Abstract data
abstract_file = "abstract.txt"
# Test data
test_file = "test.txt"
# Authors
authors_file = "authors.txt"
# Edge list
edgelist_file = "edgelist.txt"
# Year
year_file = "year.txt"

## 1. Edgelist/Year data

In [9]:
# Import citation data and year data
# NOTE(review): `df` is used below but is not created in any cell visible in this
# notebook; presumably an earlier cell built it with one `class{i}_weight` column
# per class — confirm and restore that cell so Restart & Run All works.


# Read the year.txt file ("paper_id,year" rows, no header row)
year_data = pd.read_csv(data_directory/year_file, header=None, names=['paper_id', 'year'], sep=',')

# Merge the year data with the existing DataFrame
# (merge defaults to an inner join, so papers without a year row are dropped)
# NOTE(review): this rebinds `df`, so re-running the cell merges `year` into the
# already-merged frame a second time.
df = df.merge(year_data, on='paper_id')

# Check for min and max year
min_year = df['year'].min()
max_year = df['year'].max()

# Create the year_normalized column (min-max scaling of `year` to [0, 1])
df['year_normalized'] = (df['year'] - min_year) / (max_year - min_year)

# Create a final DataFrame with class weights and year_normalized columns
df_final = df[['paper_id'] + [f'class{i}_weight' for i in range(5)] + ['year_normalized']]

# Fill NaN class weight columns with equal weights (1/5 for each of the 5 classes)
# NOTE(review): fillna is applied to the whole frame, so a NaN in any other
# selected column (e.g. `year_normalized`) would also become 0.2.
df_final = df_final.fillna(1/5)
df_final

Unnamed: 0,paper_id,class0_weight,class1_weight,class2_weight,class3_weight,class4_weight,year_normalized
0,0,0.000000,1.0,0.000000,0.000000,0.000000,0.853659
1,1,0.000000,0.0,1.000000,0.000000,0.000000,0.817073
2,2,0.000000,0.0,0.133333,0.000000,0.866667,0.914634
3,3,0.333333,0.0,0.000000,0.666667,0.000000,0.865854
4,4,0.000000,0.0,0.000000,1.000000,0.000000,0.597561
...,...,...,...,...,...,...,...
166981,166981,0.500000,0.0,0.000000,0.000000,0.500000,0.902439
166982,166982,0.000000,0.0,0.000000,0.000000,1.000000,0.500000
166983,166983,0.000000,1.0,0.000000,0.000000,0.000000,0.853659
166984,166984,0.000000,0.0,0.000000,1.000000,0.000000,0.817073


#### Testing

In [12]:
# Labels of the training papers ("paper_id,class" rows, no header row)
train_data = pd.read_csv(
    data_directory / y_train_file,
    sep=',',
    header=None,
    names=['paper_id', 'class'],
)

# Keep the rows of df_final that belong to labelled papers, then attach the label
is_labelled = df_final['paper_id'].isin(train_data['paper_id'])
df_data = df_final[is_labelled]
df_train = df_data.merge(train_data, on='paper_id')

# Feature matrix (five class weights + normalized year) and target vector
feature_columns = [f'class{i}_weight' for i in range(5)] + ['year_normalized']
x_all_train = df_train[feature_columns]
y_all_train = df_train['class']

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled papers; the fixed seed keeps the split reproducible
split_features = df_train[[f'class{i}_weight' for i in range(5)] + ['year_normalized']]
split_labels = df_train['class']
X_train, X_test, y_train, y_test = train_test_split(
    split_features,
    split_labels,
    test_size=0.1,
    random_state=42,
)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Random Forest evaluated with out-of-fold probabilities from 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_estimators=300, random_state=42)
y_pred_proba = cross_val_predict(
    rf,
    x_all_train,
    y_all_train,
    cv=cv,
    method='predict_proba',
)

# Multiclass logarithmic loss of the out-of-fold predictions
loss = log_loss(y_all_train, y_pred_proba)
print("Multiclass logarithmic loss: {:.3f}".format(loss))

Multiclass logarithmic loss: 0.864


In [22]:
from sklearn.linear_model import LogisticRegression

# Logistic regression evaluated with out-of-fold probabilities from 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(max_iter=10000)
y_pred_proba = cross_val_predict(
    clf,
    x_all_train,
    y_all_train,
    cv=cv,
    method='predict_proba',
)

# Multiclass logarithmic loss of the out-of-fold predictions
loss = log_loss(y_all_train, y_pred_proba)
print("Multiclass logarithmic loss: {:.3f}".format(loss))

Multiclass logarithmic loss: 0.561


In [None]:
import xgboost as xgb

# Train an XGBoost classifier on the full labelled set (x_all_train / y_all_train)
xgb_model = xgb.XGBClassifier(objective='multi:softprob', random_state=42)
xgb_model.fit(x_all_train, y_all_train)

# The hold-out evaluation below is disabled.
# NOTE(review): X_test / y_test are split off df_train, which x_all_train also
# covers, so with the fit above these rows were already seen during training.
# Predict the class probabilities for the test set
#y_pred_proba_xgb = xgb_model.predict_proba(X_test)

# Calculate the multiclass logarithmic loss
#loss_xgb = log_loss(y_test, y_pred_proba_xgb)

#print("XGBoost model - Multiclass logarithmic loss: {:.3f}".format(loss_xgb))

In [20]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid to search over (3 x 3 x 3 = 27 combinations)
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [100, 500, 1000]
}

# XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='multi:softprob', random_state=42)
# Perform grid search using 3-fold cross-validation (cv=3), scored by negative log loss
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_log_loss', n_jobs=-1, verbose=1)
grid_search.fit(x_all_train, y_all_train)

# Print the best hyperparameters and the corresponding score
# (best_score_ is a negative log loss, so it is negated to print the loss itself)
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: {:.3f}".format(-grid_search.best_score_))


# Best hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

Fitting 3 folds for each of 27 candidates, totalling 81 fits




Best hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best score: 0.499


## 2. Authors data

In [45]:
### Collect the ids of every paper that appears in the training or test files

# Training ids: first comma-separated field of each line of the labels file
with open(data_directory/y_train_file, "r") as f:
    train_papers = [int(line.split(',')[0]) for line in f]

# Test ids: first comma-separated field of each line of the test file
with open(data_directory/test_file, "r") as f:
    test_papers = [int(line.split(',')[0]) for line in f]

## All paper ids (this is a set, despite the name `papers_tuple`)
papers_tuple = set(train_papers + test_papers)

In [54]:
# Load every line of the 'authors.txt' file into memory (line endings are kept)
with open(data_directory/authors_file, "r") as file:
    lines = list(file)

# Function to process each line and get the paper id and authors
def process_line(line):
    """Split one "<paper_id>||<authors>" line into its two fields.

    Surrounding whitespace is stripped before splitting on "||".

    Returns:
        (paper_id, authors): the id converted to int and the raw author string.
    """
    fields = line.strip().split("||")
    return int(fields[0]), fields[1]

# Keep the raw author string only for papers whose id is in the train/test id set
authors_dict = {
    paper_id: authors
    for paper_id, authors in map(process_line, lines)
    if paper_id in papers_tuple
}

#### Cleaned Authors

In [68]:
from unidecode import unidecode
from fuzzywuzzy import fuzz
import re

def clean_author_name(name):
    """Normalize a single author name.

    Steps, in order: transliterate to ASCII with unidecode, lowercase, remove
    digits, then keep only whitespace-separated tokens longer than two
    characters. Initials such as "c." and any other token of one or two
    characters are therefore dropped. Surviving tokens are joined with single
    spaces.
    """
    ascii_lower = unidecode(name).lower()
    without_digits = re.sub(r'\d', '', ascii_lower)
    kept_tokens = [token for token in without_digits.split() if len(token) > 2]
    return " ".join(kept_tokens)

def process_authors_dict(authors_dict):
    """Clean every author name of every paper.

    Args:
        authors_dict: paper id -> comma-separated author names.

    Returns:
        A new dict with the same keys; each value is the comma-joined result of
        applying clean_author_name to every comma-separated name.
    """
    return {
        paper_id: ','.join(map(clean_author_name, authors.split(",")))
        for paper_id, authors in authors_dict.items()
    }

In [107]:
# Process the authors_dict
# (each paper's comma-separated author string is cleaned name by name)
papers_authors_clean_dict = process_authors_dict(authors_dict)

In [110]:
# Display the cleaned author strings keyed by paper id
# NOTE(review): this renders the whole dict; a small slice would keep the output readable.
papers_authors_clean_dict


{2: 'amir abboud,arturs backurs,virginia vassilevska williams',
 3: 'david anastasiu,byron gao,david buttler',
 6: 'kar,purushottam,narasimhan,harikrishna,prateek jain',
 8: 'niepert,mathias',
 13: 'mattias fitzi,ueli maurer',
 15: 'zenglin,rong jin,haiqin yang,irwin king,michael lyu',
 21: 'fabian chudak,kiyohito nagano',
 22: 'jianfeng yan,zhang',
 25: 'zeljko agic,anders johannsen,barbara plank,hector martinez alonso,natalie schluter,anders sogaard',
 27: 'abhishek sharma,oncel tuzel,david jacobs',
 28: 'huanjing yue,xiaoyan sun,jingyu yang,feng',
 33: 'gloria zen,elisa ricci,nicu sebe',
 46: 'philipp koehn,hieu hoang,alexandra birch,chris callison-burch,marcello federico,nicola bertoldi,brooke cowan,wade shen,christine moran,richard zens,chris dyer,ondrej bojar,alexandra constantin,evan herbst',
 49: 'katja hofmann,shimon whiteson,maarten rijke',
 50: 'sergi elizalde,peter winkler',
 57: 'reinhard rapp',
 58: 'peter turney',
 59: 'zheng lin,xiaolong jin,xueke,weiping wang,xueqi che

#### Replaced Authors

In [80]:
# NOTE: too slow -- the fuzzy-matching approach in this cell is impractically slow.


def find_similar_author(name, unique_authors, threshold=85):
    """Return the first entry of `unique_authors` that fuzzily matches `name`.

    Candidates are visited in iteration order and scored against `name` with
    fuzz.token_set_ratio; the first one scoring at least `threshold` is
    returned. None is returned when no candidate qualifies.
    """
    matches = (
        candidate
        for candidate in unique_authors
        if fuzz.token_set_ratio(name, candidate) >= threshold
    )
    return next(matches, None)

def replace_authors_with_similar(authors_dict, verbose=True):
    """Replace each author name with the first previously seen similar name.

    Names are processed paper by paper. A name that matches an already
    registered name (via find_similar_author) is replaced by it; otherwise the
    name is registered as a new canonical name and kept as is.

    The decision for a given name is memoized, so a name that occurs on many
    papers is compared against the registered names only once. This gives the
    same result as rescanning, because registered names are only ever appended
    and the first match for a name therefore never changes.

    Args:
        authors_dict: paper id -> comma-separated author names.
        verbose: when True (the default), print each paper id as a progress
            indicator.

    Returns:
        A new dict: paper id -> comma-separated canonical author names.
    """
    unique_authors = {}   # dict used as an insertion-ordered set of canonical names
    canonical_name = {}   # memo: name as seen -> canonical name
    replaced_authors = {}
    for paper_id, authors in authors_dict.items():
        if verbose:
            print(paper_id)
        new_authors = []
        for author in authors.split(","):
            if author not in canonical_name:
                similar_author = find_similar_author(author, unique_authors)
                if similar_author:
                    canonical_name[author] = similar_author
                else:
                    unique_authors[author] = True
                    canonical_name[author] = author
            new_authors.append(canonical_name[author])
        replaced_authors[paper_id] = ",".join(new_authors)
    return replaced_authors

# NOTE: end of the too-slow fuzzy-matching functions.

# Replace authors with similar names
# NOTE(review): `cleaned_authors_dict` is not defined in any visible cell —
# presumably the cleaned dict is now `papers_authors_clean_dict`; confirm before re-running.
replaced_authors_dict = replace_authors_with_similar(cleaned_authors_dict)

In [98]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz
from typing import Dict

def replace_authors_with_similar(authors_dict: Dict[int, str], threshold: float = 0.8) -> Dict[int, str]:
    """Replace each author name with the representative of its embedding cluster.

    Unique names are embedded with a SentenceTransformer model and grouped by
    average-linkage agglomerative clustering on cosine distance, cut at a
    distance of ``1 - threshold``. Every name is then replaced by the
    lexicographically smallest name of its cluster.

    Args:
        authors_dict: paper id -> comma-separated author names.
        threshold: subtracted from 1 to obtain the clustering distance threshold.

    Returns:
        A new dict with the same keys: paper id -> comma-separated
        representative names.
    """
    # Unique author names, sorted so the order fed to the model is reproducible
    # (iteration order of a set of strings can differ between interpreter runs)
    author_names = sorted({author for authors in authors_dict.values() for author in authors.split(",")})

    # With zero or one distinct name there is nothing to cluster: every name is
    # its own representative, so the mapping is the identity.
    if len(author_names) < 2:
        return dict(authors_dict)

    # Load the pre-trained model used to compute the name embeddings
    model = SentenceTransformer('paraphrase-distilroberta-base-v2')

    # Calculate author name embeddings
    author_embeddings = model.encode(author_names)

    # Define the clustering model. Newer scikit-learn releases call the distance
    # argument `metric`; older ones only accept `affinity`, so fall back to it.
    clustering_kwargs = dict(
        n_clusters=None,
        linkage='average',
        distance_threshold=1 - threshold,
    )
    try:
        clustering_model = AgglomerativeClustering(metric='cosine', **clustering_kwargs)
    except TypeError:
        clustering_model = AgglomerativeClustering(affinity='cosine', **clustering_kwargs)

    # Fit the clustering model
    clustering_model.fit(author_embeddings)
    labels = clustering_model.labels_

    # Single pass: smallest name seen so far for every cluster label
    representative_by_label = {}
    for name, label in zip(author_names, labels):
        current = representative_by_label.get(label)
        if current is None or name < current:
            representative_by_label[label] = name

    # Map author names to their cluster representatives
    cluster_representatives = {
        name: representative_by_label[label]
        for name, label in zip(author_names, labels)
    }

    # Replace authors with their cluster representatives
    replaced_authors = {}
    for paper_id, authors in authors_dict.items():
        new_authors = [cluster_representatives[author] for author in authors.split(",")]
        replaced_authors[paper_id] = ",".join(new_authors)

    return replaced_authors

In [None]:
# Replace authors with similar names
# NOTE(review): `cleaned_authors_dict` is not defined in any visible cell —
# presumably the cleaned dict is now `papers_authors_clean_dict`; confirm before re-running.
replaced_authors_dict = replace_authors_with_similar(cleaned_authors_dict)

#### Dict Authors / Classes

In [83]:
def create_authors_class_dict(authors_dict, y_train_file):
    """Collect, for every author, the class labels of that author's labelled papers.

    Args:
        authors_dict: paper id -> comma-separated author names.
        y_train_file: path of a text file with one "paper_id,class" pair per
            line. Blank lines are ignored.

    Returns:
        dict: author name -> list of class labels, one entry per labelled paper
        the author appears on, in the iteration order of `authors_dict`.
        Papers without a label contribute nothing.

    NOTE(review): each labelled paper's own class is included in its authors'
    lists — keep that in mind if these lists become features for those papers.
    """
    # Load paper classes from the labels file
    paper_classes = {}
    with open(y_train_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no label
                continue
            paper_id, class_label = line.split(',')
            paper_classes[int(paper_id)] = int(class_label)

    # Append every labelled paper's class to each of its authors
    authors_class_dict = {}
    for paper_id, authors in authors_dict.items():
        if paper_id not in paper_classes:
            # Unlabelled papers are skipped on purpose
            continue
        class_label = paper_classes[paper_id]
        for author in authors.split(','):
            authors_class_dict.setdefault(author, []).append(class_label)
    return authors_class_dict



# Create the authors_class_dict
# (maps each cleaned author name to the class labels of that author's labelled papers)
authors_class_dict = create_authors_class_dict(papers_authors_clean_dict, data_directory/y_train_file)


In [87]:
# Number of distinct cleaned author names that appear on at least one labelled paper
print('Cleaned Authors:', len(authors_class_dict))

Cleaned Authors: 48250


In [104]:
# Display the per-author class label lists
# NOTE(review): this renders the whole dict; a small slice would keep the output readable.
authors_class_dict

{'amir abboud': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
 'arturs backurs': [4, 4, 4, 4, 4, 0],
 'virginia vassilevska williams': [4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4],
 'david anastasiu': [3, 3, 3],
 'byron gao': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 'david buttler': [3, 3, 3, 3],
 'kar': [0, 0, 0, 0, 0],
 'purushottam': [0, 0, 0],
 'narasimhan': [0, 0, 0, 1, 0, 1, 0],
 'harikrishna': [0, 0, 0],
 'prateek jain': [0,
  0,
  0,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'niepert': [0],
 'mathias': [0, 0],
 'mattias fitzi': [4],
 'ueli maurer': [4],
 'zenglin': [0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 1, 3],
 'rong jin': [0,
  0,
  3,
  0,
  3,
  1,
  0,
  3,
  1,
  3,
  0,
  3,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  3,
  0,
  3,
  0,
  3,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  1,
  3,
  0,
  0,
  0,
  0,
  1,
  3

#### Combine Paper id with class authors

In [109]:
def add_class_information_to_authors_dict(authors_dict, authors_class_dict):
    """Append each author's class labels to the author's name.

    Args:
        authors_dict: paper id -> comma-separated author names.
        authors_class_dict: author name -> list of class labels.

    Returns:
        dict: paper id -> comma-joined "<author> (<labels>)" entries, where
        <labels> is the author's label list joined with ", " and is empty for
        authors missing from `authors_class_dict`.
    """
    def annotate(author):
        labels = authors_class_dict.get(author, [])
        return f"{author} ({', '.join(map(str, labels))})"

    return {
        paper_id: ",".join(annotate(author) for author in authors.split(','))
        for paper_id, authors in authors_dict.items()
    }

In [113]:
# Annotate every paper's cleaned authors with the class labels collected per author
updated_authors_dict = add_class_information_to_authors_dict(papers_authors_clean_dict,authors_class_dict)


In [114]:
# Display the annotated author strings keyed by paper id
# NOTE(review): this renders the whole dict; a small slice would keep the output readable.
updated_authors_dict

{2: 'amir abboud (4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4),arturs backurs (4, 4, 4, 4, 4, 0),virginia vassilevska williams (4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)',
 3: 'david anastasiu (3, 3, 3),byron gao (3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),david buttler (3, 3, 3, 3)',
 6: 'kar (0, 0, 0, 0, 0),purushottam (0, 0, 0),narasimhan (0, 0, 0, 1, 0, 1, 0),harikrishna (0, 0, 0),prateek jain (0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)',
 8: 'niepert (0),mathias (0, 0)',
 13: 'mattias fitzi (4),ueli maurer (4)',
 15: 'zenglin (0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 1, 3),rong jin (0, 0, 3, 0, 3, 1, 0, 3, 1, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 0, 0, 0, 1, 3, 0, 1, 3, 0, 1, 0, 0, 1),haiqin yang (0, 0, 0, 3, 1),irwin king (0, 3, 0, 2, 3, 3, 0, 3, 3, 3, 3, 0, 3, 0, 3, 0, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1),michael lyu (0, 3, 3, 0, 0, 2, 3, 3, 0, 3, 3, 3, 3, 0, 3,

## Train / Test data

In [75]:
# Labels of the training papers ("paper_id,class" rows, no header row)
train_data = pd.read_csv(
    data_directory / y_train_file,
    sep=',',
    header=None,
    names=['paper_id', 'class'],
)

# Test paper ids: first comma-separated field of each line of the test file
with open(data_directory/test_file, "r") as f:
    test_papers = [int(line.split(',')[0]) for line in f]

test_data = pd.DataFrame(test_papers, columns=['paper_id'])

In [104]:
# Training frame: rows of df_final for labelled papers, with the class attached
in_train = df_final['paper_id'].isin(train_data['paper_id'])
df_train = df_final[in_train].merge(train_data, on='paper_id')

# Test frame: rows of df_final for the papers listed in the test file
in_test = df_final['paper_id'].isin(test_data['paper_id'])
df_test = df_final[in_test]

In [105]:
# Feature columns shared by the training and test matrices
feature_columns = [f'class{i}_weight' for i in range(5)] + ['year_normalized']

# Define X_train, y_train, and X_test
X_train = df_train[feature_columns]
X_test = df_test[feature_columns]
y_train = df_train['class']


In [11]:
# Read authors.txt file
# (`authors_file` is the file name set alongside `data_directory`; the `authors_p`
# name used here before is not defined in any visible cell)
with open(data_directory/authors_file, "r") as f:
    lines = f.readlines()

# Parse the authors data into one (paper_id, author) pair per listed author
authors_data = []
for line in lines:
    paper_id, authors = line.strip().split("||")
    for author in authors.split(","):
        authors_data.append((int(paper_id), author.strip()))

# Create a DataFrame with the parsed data
authors_df = pd.DataFrame(authors_data, columns=["paper_id", "author"])

In [15]:
# Labels of the training papers ("paper_id,class" rows, no header row).
# (`y_train_file` is the file name set alongside `data_directory`; the `y_train_p`
# name used here before is not defined in any visible cell)
# NOTE(review): this rebinds `df`, which earlier cells use for a different frame.
df = pd.read_csv(data_directory/y_train_file, sep=',', header=None, names=['paper_id', 'class'])
df

Unnamed: 0,paper_id,class
0,2,4
1,3,3
2,6,0
3,8,0
4,13,4
...,...,...
35158,166969,1
35159,166976,2
35160,166977,1
35161,166978,3


In [16]:
# Preview the parsed authors: one row per (paper_id, author) pair
authors_df

Unnamed: 0,paper_id,author
0,0,Junchi Yan
1,0,Jian Liu
2,0,Yin Li
3,0,Zhibin Niu
4,0,Yuncai Liu
...,...,...
559464,166984,Yang Li
559465,166985,Ola
559466,166985,A.
559467,166985,Ozsoyoglu


In [17]:
# Merge authors_df with the labels frame `df` to get category information
# (merge defaults to an inner join, so authors of papers without a label are dropped)
authors_with_categories = authors_df.merge(df, on="paper_id")

In [18]:
# Preview the author rows with the class of their paper attached
authors_with_categories

Unnamed: 0,paper_id,author,class
0,2,Amir Abboud,4
1,2,Arturs Backurs,4
2,2,Virginia Vassilevska Williams,4
3,3,David C. Anastasiu,3
4,3,Byron J. Gao,3
...,...,...,...
126232,166978,Yi-Dong Shen,3
126233,166978,Liang Du,3
126234,166978,Chen-Yan Xiong,3
126235,166980,Lina Peng,3


In [20]:
# Count the rows for each (author, class) pair and expose the result as a "count" column
author_paper_counts = (
    authors_with_categories
    .groupby(["author", "class"])
    .size()
    .reset_index(name="count")
)


In [21]:
# Preview the number of rows per (author, class) pair
author_paper_counts

Unnamed: 0,author,class,count
0,,2,2
1,A VéghLászló,4,1
2,A van den Hengel,1,1
3,A.,0,2
4,A.,1,23
...,...,...,...
60523,Şimşekli Umut,1,1
60524,Şule Gündüz,3,1
60525,Željko Agić,2,1
60526,保之 菅谷,1,1
