In [3]:
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from unidecode import unidecode as unidecode_func
import re
import csv
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import os

# Root directory that holds the challenge data files.
# The location is machine-specific, so it can be overridden through the
# DATA_CHALLENGE_DIR environment variable; when the variable is unset the
# original absolute path is used.
data_directory = Path(os.environ.get(
    "DATA_CHALLENGE_DIR",
    "E:/panag/Desktop/Ms Data Science/6 Quarter/Data Science Challenge/data_challenge_aueb_2023",
))

# Train data
y_train_file = "y_train.txt"
# Authors data
authors_file = "authors.txt"
# Abstract data
abstract_file = "abstract.txt"
# Test data
test_file = "test.txt"
# Year data
year_file = "year.txt"

# Full paths of the individual input files
y_train_path  = data_directory / y_train_file
authors_path  = data_directory / authors_file
abstract_path = data_directory / abstract_file
test_path     = data_directory / test_file
year_path     = data_directory / year_file

### Cleaning the abstracts

#### Create Authors list

In [None]:
# Read the 'authors.txt' file; each record is split below on '||' (paper id,
# then the author string) and the author string on ','.
# NOTE(review): the file is opened with the platform default encoding -
# confirm it matches the encoding the file was written with.
with open(authors_path, "r") as file:
    lines = file.readlines()

# Create the authors_dict (paper id -> raw author string).
# Each line is split only once, and blank lines (e.g. a trailing empty line)
# are skipped instead of crashing on int('').
authors_dict = {}
for line in lines:
    if not line.strip():
        continue
    fields = line.split('||')
    authors_dict[int(fields[0])] = fields[1].strip()

# Set of cleaned author names across all papers
all_authors = set()

for authors in authors_dict.values():
    for author in authors.split(','):
        # Replace unicode characters
        author = unidecode_func(author)
        # Remove numerical values
        author = re.sub(r'\d+', '', author)
        # Remove capital letters followed by a dot
        author = re.sub(r'\b[A-Z]\. ?', '', author)
        # Remove leading and trailing whitespaces
        author = author.strip()

        # Names that are empty after cleaning are dropped
        if author:
            all_authors.add(author)

#### Useful Functions

In [25]:
def remove_irrelevant_info(text, author_names=None):
    """Strip formulas, markup, references and author names from an abstract.

    Parameters
    ----------
    text : str
        Raw abstract text.
    author_names : collection of str, optional
        Tokens to drop from the text. When omitted, the module-level
        ``all_authors`` set is used.

    Returns
    -------
    str
        The cleaned text, with whitespace runs collapsed to single spaces
        (a side effect of the split/join used for author removal).
    """
    if author_names is None:
        author_names = all_authors

    ### Remove Math formulas ###
    # Display math ($$...$$) must be removed BEFORE inline math ($...$):
    # otherwise the inline pattern matches the empty "$$" delimiters and the
    # formula body between them is left in the text.
    text = re.sub(r'\$\$[^$]*\$\$', '', text)
    # Remove inline math formulas between single dollar signs ($...$)
    text = re.sub(r'\$[^$]*\$', '', text)
    # Remove math formulas between \begin{equation} and \end{equation}
    text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
    # Remove math formulas between \begin{align} and \end{align}
    text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)

    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Remove numerical elements in () and []
    text = re.sub(r'(\(\d+\)|\[\d+\])', '', text)

    # Remove text elements in () --> ex. (a),(b),(i)
    text = re.sub(r'\(([a-f]|[ivx]+)\)', '', text)

    # Remove Authors First Names (ex. A., B., C.)
    text = re.sub(r'\b[A-Z]\.', '', text)

    # Remove text inside parentheses that has the expression et al.
    text = re.sub(r'\([^\(\)]*et al\.[^\(\)]*\)', '', text)

    # Remove text with non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    ##### Remove author names #####   ---- Not improved if any Authors removed
    # Tokenize on whitespace and drop tokens that exactly match an author
    # name, then rejoin the remaining tokens into a single string.
    tokens = [token for token in text.split() if token not in author_names]
    text = ' '.join(tokens)

    return text.strip()

In [198]:
def export_dict_to_csv(data_dict, file_path):
    """Write a mapping to a two-column CSV file.

    The file starts with the header row ``paper_id,abstract`` and is followed
    by one row per ``(key, value)`` pair of ``data_dict``, in iteration order.
    The file is written as UTF-8 and overwritten if it already exists.
    """
    header = ['paper_id', 'abstract']
    with open(file_path, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        # Each (key, value) pair becomes one CSV row
        out.writerows(data_dict.items())

### Algorithm

In [6]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

In [7]:
# Read abstracts of research papers (paper id and abstract separated by '||').
# Only the line terminator is stripped, so a final line without a trailing
# newline keeps its last character, and the line is split on the first '||'
# only so an abstract containing '||' is not truncated. Blank lines are skipped.
abstracts = dict()
with open(abstract_path, "r") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        paper_id, abstract = line.split('||', 1)
        abstracts[int(paper_id)] = abstract

# Read training data: paper id in the first comma-separated field, label in
# the second. train_papers[i] and y_train[i] refer to the same paper.
train_papers = list()
y_train = list()
with open(y_train_path, "r") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        t = line.split(',')
        train_papers.append(int(t[0]))
        y_train.append(t[1])

# Read test data: only the paper id (first comma-separated field) is used
test_papers = list()
with open(test_path, "r") as f:
    for line in f:
        if not line.strip():
            continue
        t = line.split(',')
        test_papers.append(int(t[0]))

In [26]:
# Clean every abstract; the result maps paper id -> cleaned text
cleaned_abstracts = {
    paper_id: remove_irrelevant_info(abstract)
    for paper_id, abstract in abstracts.items()
}

In [167]:
# Persist the cleaned abstracts as a CSV file in the working directory
export_dict_to_csv(data_dict=cleaned_abstracts, file_path='clean_abstracts.csv')

In [27]:
# Import Numeric features (pre-computed CSV files in the working directory)
df_edgelist = pd.read_csv('features_edgelist.csv')
df_authors  = pd.read_csv('features_authors.csv')

# year.txt has no header row; name its two columns explicitly
df_year = pd.read_csv(year_path, header=None)
df_year.columns = ['paper_id','year']

df_numeric = pd.merge(df_edgelist,df_authors, on='paper_id')
# select only the 'class{}_weight' columns
# (raw string: '\d' in a normal string literal is an invalid escape sequence)
df_numeric = df_numeric.filter(regex=r'class\d_weight')

# Text features: one row per cleaned abstract, in dictionary order,
# indexed 0..n-1
df_text = pd.DataFrame(list(cleaned_abstracts.values()))
# replace the name of the first column
df_text.columns = ['abstract']

In [28]:
### Custom numeric #####
# Numeric features = edge-list features joined with the publication year
df_numeric = df_edgelist.merge(df_year, on='paper_id')

In [29]:
# Join text and numeric features row-by-row on the positional index; the
# selection below treats that index as the paper id.
df_all = pd.merge(df_text, df_numeric, left_index=True, right_index=True)

# Guard against silent misalignment when the numeric frame carries the ids
if 'paper_id' in df_all.columns:
    assert (df_all['paper_id'] == df_all.index).all(), \
        "row index of df_all does not match its paper_id column"

# Split in train/test dataframe
# Membership masks (kept for any cell that still uses them)
mask_train = df_all.index.isin(train_papers)
mask_test = df_all.index.isin(test_papers)

# Select rows by label, in the order of train_papers / test_papers, so that
# row i of df_train belongs to y_train[i] and row i of df_test belongs to
# test_papers[i]. A boolean mask would return the rows in index order instead,
# which only matches when the id files happen to be sorted.
# .loc raises KeyError if a listed paper id is missing from df_all.
df_train = df_all.loc[train_papers]
df_test = df_all.loc[test_papers]

In [30]:
class NumericFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that picks the configured columns and returns their values."""

    def __init__(self, columns):
        # Column labels pulled out of the frame passed to transform()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.columns]
        return selected.values

In [31]:
class TextFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that hands a single column of the input on to the next step."""

    def __init__(self, column):
        # Label of the column returned by transform()
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]`` unchanged."""
        text_column = X[self.column]
        return text_column

In [99]:
# Numeric feature columns: the five class weights with suffix '_x', followed
# by the five class weights with suffix '_y'
numeric_columns = [
    f'class{class_idx}_weight_{suffix}'
    for suffix in ('x', 'y')
    for class_idx in range(5)
]

In [32]:
# Numeric feature columns: the five class weights plus the publication year
numeric_columns = [f'class{class_idx}_weight' for class_idx in range(5)]
numeric_columns.append('year')

In [227]:
# The model pipeline is defined BEFORE the randomized search that wraps it:
# RandomizedSearchCV(estimator=pipeline, ...) needs `pipeline` to exist, so the
# original cell order failed with NameError on a fresh kernel.

# Create a FeatureUnion object: TF-IDF of the abstract text side by side with
# the numeric feature columns
feature_union = FeatureUnion(
    transformer_list=[
        ('tfidf', Pipeline([
            ('text_selector', TextFeaturesExtractor(column='abstract')),
            ('vectorizer', TfidfVectorizer(stop_words='english'))
        ])),  # Text features
        ('numeric', NumericFeaturesExtractor(columns=numeric_columns))  # Numeric features
    ]
)

# Create a pipeline with FeatureUnion and LightGBM
pipeline = Pipeline(
    steps=[
        ('feature_union', feature_union),
        ('classifier', LGBMClassifier(objective='multiclass', num_classes=5))
    ]
)

# Hyper-parameter search space. Keys follow the step names defined above:
# <pipeline step>__<nested step>__<parameter>.
param_dist = {
    'feature_union__tfidf__vectorizer__max_df': [0.5, 0.75, 1.0],
    'feature_union__tfidf__vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__n_estimators': [50, 100, 150, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__num_leaves': [30, 50, 100],
    'classifier__min_child_samples': [10, 20, 30],
}

# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score; since class probabilities are what gets
# submitted, confirm whether a probability-based metric should be used.
random_search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions=param_dist,
    n_iter=20,     # Number of random combinations to try
    cv=3,          # Number of cross-validation folds
    n_jobs=-1,     # Use all available CPU cores
    verbose=2,
    random_state=42,
)

#### Without Randomized Search

In [34]:
# Train the full pipeline (feature extraction + classifier) on the training rows
pipeline.fit(X=df_train, y=y_train)

In [35]:
# Class-probability predictions for the test rows (one row per test paper)
y_pred = pipeline.predict_proba(X=df_test)

In [36]:
# Write predictions to a file
# Row i of y_pred is written next to test_papers[i], so both must have the
# same length; fail loudly instead of writing a truncated or misaligned file.
if len(test_papers) != y_pred.shape[0]:
    raise ValueError(
        f"{len(test_papers)} test papers but {y_pred.shape[0]} prediction rows"
    )

# newline='' lets the csv module control line endings; without it every row
# is followed by an empty line on Windows.
with open('sample_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Header: paper id followed by one column per predicted class
    writer.writerow(['paperID'] + ['class_' + str(i) for i in range(y_pred.shape[1])])
    for test_paper, probabilities in zip(test_papers, y_pred):
        writer.writerow([test_paper] + probabilities.tolist())

#### With Randomized Search

In [230]:
#### Use Random Search
# Runs the cross-validated search over the sampled parameter combinations
random_search.fit(X=df_train, y=y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Report the winning parameter combination and its cross-validated score
for label, value in (
    ("Best parameters found: ", random_search.best_params_),
    ("Best score found: ", random_search.best_score_),
):
    print(label, value)

In [None]:
# Class-probability predictions for the test rows, using the pipeline that
# the search selected as best
best_estimator = random_search.best_estimator_
y_pred = best_estimator.predict_proba(X=df_test)

In [None]:
# Write predictions to a file
# Row i of y_pred is written next to test_papers[i], so both must have the
# same length; fail loudly instead of writing a truncated or misaligned file.
if len(test_papers) != y_pred.shape[0]:
    raise ValueError(
        f"{len(test_papers)} test papers but {y_pred.shape[0]} prediction rows"
    )

# newline='' lets the csv module control line endings; without it every row
# is followed by an empty line on Windows.
with open('sample_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Header: paper id followed by one column per predicted class
    writer.writerow(['paperID'] + ['class_' + str(i) for i in range(y_pred.shape[1])])
    for test_paper, probabilities in zip(test_papers, y_pred):
        writer.writerow([test_paper] + probabilities.tolist())