In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import csv
import numpy as np

In [8]:
# Set the absolute path to your data directory
data_directory = Path("E:/panag/Desktop/Ms Data Science/6 Quarter/Data Science Challenge/data_challenge_aueb_2023")

# Train data
y_train_file = "y_train.txt"
# Abstract data
abstract_file = "abstract.txt"
# Test data
test_file = "test.txt"
# Authors
authors_file = "authors.txt"
# Edge list
edgelist_file = "edgelist.txt"
# Year 
year_file = "year.txt"

## 1. Citations data

In [3]:
# Initialize the dictionary with keys from 0 to 166985 and empty lists as values
paper_links = {i: [] for i in range(166986)}

# Read the edgelist.txt file
with open(data_directory/edgelist_file, "r") as f:
    lines = f.readlines()

# Parse the data and assign values bidirectionally
for line in lines:
    paper_id, linked_paper_id = map(int, line.strip().split())

    # Assign the linked_paper_id to the paper_id key if not already present
    if linked_paper_id not in paper_links[paper_id]:
        paper_links[paper_id].append(linked_paper_id)

    # Assign the paper_id to the linked_paper_id key if not already present
    if paper_id not in paper_links[linked_paper_id]:
        paper_links[linked_paper_id].append(paper_id)

In [4]:
with open(data_directory/y_train_file, "r") as f:
    lines = f.readlines()

# Create a dictionary with paper_id as key and class as value
paper_classes = {}
for line in lines:
    paper_id, paper_class = map(int, line.strip().split(','))
    paper_classes[paper_id] = paper_class

# Add class information to the paper_links dictionary
paper_links_with_classes = {}
for paper_id, linked_papers in paper_links.items():
    paper_links_with_classes[paper_id] = {
        'links': linked_papers,
        'classes': [paper_classes.get(linked_paper_id) for linked_paper_id in linked_papers if linked_paper_id in paper_classes]
    }

In [5]:
for paper_id, info in paper_links_with_classes.items():
    class_counts = {'class0': 0, 'class1': 0, 'class2': 0, 'class3': 0, 'class4': 0}

    for paper_class in info['classes']:
        class_name = f'class{paper_class}'
        class_counts[class_name] += 1

    paper_links_with_classes[paper_id].update(class_counts)

In [6]:
# Convert the paper_links_with_classes dictionary to a DataFrame
data = []
for paper_id, info in paper_links_with_classes.items():
    row = {'paper_id': paper_id}
    row.update(info)
    data.append(row)

df = pd.DataFrame(data)

# Rearrange columns
columns = ['paper_id', 'links', 'classes', 'class0', 'class1', 'class2', 'class3', 'class4']

for i in range(5):
    class_count_column = f'class{i}'
    class_weight_column = f'class{i}_weight'
    df[class_weight_column] = df[class_count_column] / df[[f'class{j}' for j in range(5)]].sum(axis=1)

df

Unnamed: 0,paper_id,links,classes,class0,class1,class2,class3,class4,class0_weight,class1_weight,class2_weight,class3_weight,class4_weight
0,0,[138097],[1],0,1,0,0,0,0.000000,1.0,0.000000,0.000000,0.000000
1,1,"[44164, 38118]",[2],0,0,1,0,0,0.000000,0.0,1.000000,0.000000,0.000000
2,2,"[132575, 36628, 21383, 103558, 103665, 99219, ...","[2, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]",0,0,2,0,13,0.000000,0.0,0.133333,0.000000,0.866667
3,3,"[158930, 20562, 36447, 157194, 16449, 97870, 1...","[3, 0, 3]",1,0,0,2,0,0.333333,0.0,0.000000,0.666667,0.000000
4,4,"[95168, 133558]",[3],0,0,0,1,0,0.000000,0.0,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
166981,166981,"[78765, 103490]","[4, 0]",1,0,0,0,1,0.500000,0.0,0.000000,0.000000,0.500000
166982,166982,"[120658, 63091, 100542]","[4, 4, 4]",0,0,0,0,3,0.000000,0.0,0.000000,0.000000,1.000000
166983,166983,[47162],[1],0,1,0,0,0,0.000000,1.0,0.000000,0.000000,0.000000
166984,166984,[155853],[3],0,0,0,1,0,0.000000,0.0,0.000000,1.000000,0.000000


In [9]:
# Read the year.txt file
year_data = pd.read_csv(data_directory/year_file, header=None, names=['paper_id', 'year'], sep=',')

# Merge the year data with the existing DataFrame
df = df.merge(year_data, on='paper_id')

# Check for min and max year
min_year = df['year'].min()
max_year = df['year'].max()

# Create the year_normalized column
df['year_normalized'] = (df['year'] - min_year) / (max_year - min_year)

# Create a final DataFrame with class weights and year_normalized columns
df_final = df[['paper_id'] + [f'class{i}_weight' for i in range(5)] + ['year_normalized']]

# Fill nan class_weight columns with equal weigts
df_final = df_final.fillna(1/5)
df_final

Unnamed: 0,paper_id,class0_weight,class1_weight,class2_weight,class3_weight,class4_weight,year_normalized
0,0,0.000000,1.0,0.000000,0.000000,0.000000,0.853659
1,1,0.000000,0.0,1.000000,0.000000,0.000000,0.817073
2,2,0.000000,0.0,0.133333,0.000000,0.866667,0.914634
3,3,0.333333,0.0,0.000000,0.666667,0.000000,0.865854
4,4,0.000000,0.0,0.000000,1.000000,0.000000,0.597561
...,...,...,...,...,...,...,...
166981,166981,0.500000,0.0,0.000000,0.000000,0.500000,0.902439
166982,166982,0.000000,0.0,0.000000,0.000000,1.000000,0.500000
166983,166983,0.000000,1.0,0.000000,0.000000,0.000000,0.853659
166984,166984,0.000000,0.0,0.000000,1.000000,0.000000,0.817073


#### Testing

In [12]:
# Read train.txt and test.txt files
train_data = pd.read_csv(data_directory/y_train_file, header=None, names=['paper_id', 'class'], sep=',')

# Create df_train and df_test DataFrames
df_data = df_final[df_final['paper_id'].isin(train_data['paper_id'])]

df_train = df_data.merge(train_data, on='paper_id')

x_all_train = df_train[[f'class{i}_weight' for i in range(5)] + ['year_normalized']]
y_all_train = df_train['class']

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_train[[f'class{i}_weight' for i in range(5)] + ['year_normalized']], df_train['class'],test_size=0.1, random_state=42)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Train a Random Forest classifier using cross-validation
rf = RandomForestClassifier(n_estimators=300, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_proba = cross_val_predict(rf, x_all_train, y_all_train, cv=cv, method='predict_proba')


# Calculate the multiclass logarithmic loss
loss = log_loss(y_all_train, y_pred_proba)
print("Multiclass logarithmic loss: {:.3f}".format(loss))

Multiclass logarithmic loss: 0.864


In [None]:
import xgboost as xgb

# Train an XGBoost classifier on the training set
xgb_model = xgb.XGBClassifier(objective='multi:softprob', random_state=42)
xgb_model.fit(x_all_train, y_all_train)

# Predict the class probabilities for the test set
#y_pred_proba_xgb = xgb_model.predict_proba(X_test)

# Calculate the multiclass logarithmic loss
#loss_xgb = log_loss(y_test, y_pred_proba_xgb)

#print("XGBoost model - Multiclass logarithmic loss: {:.3f}".format(loss_xgb))

In [20]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid to search over
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [100, 500, 1000]
}

# XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='multi:softprob', random_state=42)
# Perform grid search using 5-fold cross-validation
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_log_loss', n_jobs=-1, verbose=1)
grid_search.fit(x_all_train, y_all_train)

# Print the best hyperparameters and the corresponding score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: {:.3f}".format(-grid_search.best_score_))


# Best hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

Fitting 3 folds for each of 27 candidates, totalling 81 fits




Best hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best score: 0.499


## 2. Authors data

## Train / Test data

In [75]:
# Read train.txt and test.txt files
train_data = pd.read_csv(data_directory/y_train_file, header=None, names=['paper_id', 'class'], sep=',')


# Read test data
test_papers = list()
with open(data_directory/test_file, "r") as f:
    for line in f:
        t = line.split(',')
        test_papers.append(int(t[0]))

test_data = pd.DataFrame (test_papers, columns = ['paper_id'])

In [104]:
# Create df_train and df_test DataFrames
df_train = df_final[df_final['paper_id'].isin(train_data['paper_id'])]
df_train = df_train.merge(train_data, on='paper_id')

df_test = df_final[df_final['paper_id'].isin(test_data['paper_id'])]

In [105]:
# Define X_train, y_train, and X_test
X_train = df_train[[f'class{i}_weight' for i in range(5)] + ['year_normalized']]
y_train = df_train['class']
X_test = df_test[[f'class{i}_weight' for i in range(5)] + ['year_normalized']]


In [11]:
# Read authors.txt file
with open(data_directory/authors_p, "r") as f:
    lines = f.readlines()

# Parse the authors data
authors_data = []
for line in lines:
    paper_id, authors = line.strip().split("||")
    authors_list = authors.split(",")
    for author in authors_list:
        authors_data.append((int(paper_id), author.strip()))

# Create a DataFrame with the parsed data
authors_df = pd.DataFrame(authors_data, columns=["paper_id", "author"])

In [15]:
df = pd.read_csv(data_directory/y_train_p, sep=',', header=None, names=['paper_id', 'class'])
df

Unnamed: 0,paper_id,class
0,2,4
1,3,3
2,6,0
3,8,0
4,13,4
...,...,...
35158,166969,1
35159,166976,2
35160,166977,1
35161,166978,3


In [16]:
authors_df

Unnamed: 0,paper_id,author
0,0,Junchi Yan
1,0,Jian Liu
2,0,Yin Li
3,0,Zhibin Niu
4,0,Yuncai Liu
...,...,...
559464,166984,Yang Li
559465,166985,Ola
559466,166985,A.
559467,166985,Ozsoyoglu


In [17]:
# Merge authors_df with labels_df to get category information
authors_with_categories = authors_df.merge(df, on="paper_id")

In [18]:
authors_with_categories

Unnamed: 0,paper_id,author,class
0,2,Amir Abboud,4
1,2,Arturs Backurs,4
2,2,Virginia Vassilevska Williams,4
3,3,David C. Anastasiu,3
4,3,Byron J. Gao,3
...,...,...,...
126232,166978,Yi-Dong Shen,3
126233,166978,Liang Du,3
126234,166978,Chen-Yan Xiong,3
126235,166980,Lina Peng,3


In [20]:
# Group by author and category, and count the number of papers
author_paper_counts = authors_with_categories.groupby(["author", "class"]).size().reset_index(name="count")


In [21]:
author_paper_counts

Unnamed: 0,author,class,count
0,,2,2
1,A VéghLászló,4,1
2,A van den Hengel,1,1
3,A.,0,2
4,A.,1,23
...,...,...,...
60523,Şimşekli Umut,1,1
60524,Şule Gündüz,3,1
60525,Željko Agić,2,1
60526,保之 菅谷,1,1


In [None]:
166893||L Xu
166894||Abhijit S. Ogale,Yiannis Aloimonos
166895||John Sharko,Georges Grinstein
166896||Ho Jin Woo,Won Suk Lee
166897||S. Z. Li

In [None]:
166979||Nhon H. Trinh,Benjamin B. Kimia
166980||Lina Peng,K. Selçuk Candan
166981||Jelani Nelson,Eric Price,Mary Wootters
166982||Karp, Richard M.,Sipser, M.
166983||Y. Mohammad,T. Nishida
166984||Jacob O. Wobbrock,Andrew D. Wilson,Yang Li
166985||Ola, A.,Ozsoyoglu, G.