In [1]:
import csv
import os
from pathlib import Path

import nltk
import pandas as pd
from lightgbm import LGBMClassifier
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Data directory: taken from the DATA_CHALLENGE_DIR environment variable when it
# is set, otherwise falls back to the original absolute path. This lets the
# notebook run on another machine without editing the code.
data_directory = Path(os.environ.get(
    "DATA_CHALLENGE_DIR",
    "E:/panag/Desktop/Ms Data Science/6 Quarter/Data Science Challenge/data_challenge_aueb_2023",
))

# Train data
y_train_file = "y_train.txt"
# Authors data
authors_file = "authors.txt"
# Abstract data
abstract_file = "abstract.txt"
# Test data
test_file = "test.txt"
# Year data
year_file = "year.txt"

# Full paths of the input files inside the data directory
y_train_path  = data_directory / y_train_file
authors_path  = data_directory / authors_file
abstract_path = data_directory / abstract_file
test_path     = data_directory / test_file
year_path     = data_directory / year_file

### Algorithm

#### All data

In [3]:
#### All papers ####

# Import numeric features
df_edgelist = pd.read_csv('features_edgelist.csv')
# NOTE(review): df_authors is loaded but not used in the cells visible here —
# confirm whether it is still needed.
df_authors  = pd.read_csv('features_authors.csv')

# year.txt has no header row; name its two columns explicitly
df_year = pd.read_csv(year_path, header=None)
df_year.columns = ['paper_id','year']

# Numeric features (edgelist and year); the default inner merge keeps only
# papers present in both tables. Missing values in every column (including
# 'year') are replaced by 0.2.
# NOTE(review): 0.2 presumably stands for a uniform 1/5 class weight — confirm
# it is also the intended fill value for 'year'.
df_numeric = pd.merge(df_edgelist, df_year, on='paper_id').fillna(0.2)

# Text features
df_text = pd.read_csv('cleaned_abstracts.csv')
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on a temporary object under pandas
# Copy-on-Write and would leave the NaN abstracts in df_text.
df_text['abstract'] = df_text['abstract'].fillna('')

#### 1. Train data

In [4]:
#####  Train data #####
# Labels file has no header row: read it, then name the two columns
y_train = (
    pd.read_csv(y_train_path, header=None)
    .set_axis(['paper_id', 'class'], axis=1)
)

# Step 1: attach the abstract text to every labelled paper
merged_df = y_train.merge(df_text, on="paper_id", how="inner")
# Step 2: attach the numeric features; inner joins keep only papers found in all tables
df_train = merged_df.merge(df_numeric, on="paper_id", how="inner")

#### 2. Test data

In [5]:
#####  Test data #####
# Paper ids to predict, in the order they appear in the test file
# (first comma-separated field of each non-blank line)
with open(test_path, "r") as f:
    test_papers = [int(line.split(',')[0]) for line in f if line.strip()]

# Merge text / numeric data
df_all = pd.merge(df_text, df_numeric, on="paper_id", how="inner")

# Find test data by matching on the 'paper_id' column. The index of df_all is
# just the row position produced by the merge, so it must not be compared
# against paper ids.
mask_test = df_all['paper_id'].isin(test_papers).to_numpy()

# Fail early if a test paper has no features: otherwise the prediction matrix
# would have fewer rows than there are test papers.
missing_papers = set(test_papers) - set(df_all.loc[mask_test, 'paper_id'])
if missing_papers:
    raise ValueError(
        f"{len(missing_papers)} test papers have no features, e.g. {sorted(missing_papers)[:5]}"
    )

# A left merge keeps the order of the left keys, so row i of df_test (and of
# any prediction made from it) corresponds to test_papers[i].
df_test = pd.DataFrame({'paper_id': test_papers}).merge(df_all, on='paper_id', how='left')

#### Pipeline

In [6]:
class NumericFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects a set of columns and returns their raw values.

    Parameters
    ----------
    columns : column labels to select from the input in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns].values`` (the selected columns without labels)."""
        selected = X[self.columns]
        return selected.values

In [7]:
class TextFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks a single column out of the input.

    Parameters
    ----------
    column : label of the column to return from ``transform``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]`` as-is (no conversion is applied)."""
        picked = X[self.column]
        return picked

In [8]:
# Numeric columns handed to the model: one '<class>_weight' column for each of
# the classes 0-4, followed by 'year'
numeric_columns = [f'class{label}_weight' for label in range(5)] + ['year']

In [9]:
# Text branch: select the 'abstract' column, then vectorize it with TF-IDF
tfidf_branch = Pipeline([
    ('text_selector', TextFeaturesExtractor(column='abstract')),
    ('vectorizer', TfidfVectorizer(analyzer='word', stop_words='english')),
])

# Numeric branch: the raw values of the columns listed in numeric_columns
numeric_branch = NumericFeaturesExtractor(columns=numeric_columns)

# Combine both branches into a single feature matrix
feature_union = FeatureUnion(transformer_list=[
    ('tfidf', tfidf_branch),      # Text features
    ('numeric', numeric_branch),  # Numeric features
])

# Full model: combined features feeding a LightGBM multiclass classifier.
# The step names are part of the grid-search parameter keys, so keep them stable.
lgbm_classifier = LGBMClassifier(objective='multiclass', num_classes=5)
pipeline = Pipeline(steps=[
    ('feature_union', feature_union),
    ('classifier', lgbm_classifier),
])

#### Without Grid Search

In [94]:
# Fit the pipeline to the training data.
# The labels are read from df_train itself: df_train is the result of inner
# merges, so its rows are not guaranteed to match y_train in number or order.
# Only the columns picked by the extractors are used as features, so the
# 'class' column in df_train is not fed to the model.
pipeline.fit(df_train, df_train['class'].values)

In [96]:
# Predict probabilities for each class: one row of y_pred per row of df_test
# NOTE(review): columns presumably follow the fitted classifier's class order — confirm it is 0..4
y_pred = pipeline.predict_proba(df_test)

In [97]:
# Write predictions to a file: a header row, then one row per test paper with
# its id followed by the predicted probability of each class.
header = ['paperID'] + ['class_' + str(i) for i in range(y_pred.shape[1])]

# Take the ids from df_test, the frame the predictions were computed from, so
# every id is written next to its own row of y_pred.
paper_ids = df_test['paper_id'].tolist()

# newline='' lets the csv module control line endings; without it every row is
# followed by an empty line on Windows.
with open('sample_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    for paper_id, probabilities in zip(paper_ids, y_pred):
        writer.writerow([paper_id, *probabilities.tolist()])

#### With Grid Search

In [10]:
# Candidate values for the LightGBM classifier step
classifier_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [30, 50, 100],
    'min_child_samples': [10, 20, 30],
}

# Grid keys follow the '<step>__<parameter>' naming of the pipeline steps
param_grid = {'feature_union__tfidf__vectorizer__max_df': [0.5, 0.75, 1.0]}
param_grid.update(
    {f'classifier__{name}': values for name, values in classifier_grid.items()}
)

# Search settings
# NOTE(review): no `scoring` is passed, so the estimator's default score is used —
# confirm this matches the metric the submission is evaluated on.
search_options = dict(
    cv=2,          # Number of cross-validation folds
    n_jobs=-1,     # Use all available CPU cores
    verbose=10,
)

# Exhaustive search over param_grid, wrapped around the full pipeline
grid_search = GridSearchCV(pipeline, param_grid, **search_options)


In [11]:
#### Use Grid Search
# The labels are read from df_train itself: df_train is the result of inner
# merges, so its rows are not guaranteed to match y_train in number or order.
# Only the columns picked by the extractors are used as features, so the
# 'class' column in df_train is not fed to the model.
grid_search.fit(df_train, df_train['class'].values)

Fitting 2 folds for each of 243 candidates, totalling 486 fits


In [12]:
# Report the winning parameter combination and its cross-validated score
search_summary = (
    ("Best parameters found: ", grid_search.best_params_),
    ("Best score found: ", grid_search.best_score_),
)
for label, value in search_summary:
    print(label, value)

Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__min_child_samples': 20, 'classifier__n_estimators': 200, 'classifier__num_leaves': 50, 'feature_union__tfidf__vectorizer__max_df': 0.5}
Best score found:  0.8974206786805419


In [13]:
# Predict probabilities for each class using the best estimator
# (the best estimator found by the grid search)
best_estimator = grid_search.best_estimator_
# One row of y_pred per row of df_test; this overwrites any earlier y_pred
y_pred = best_estimator.predict_proba(df_test)

In [14]:
# Write predictions to a file: a header row, then one row per test paper with
# its id followed by the predicted probability of each class.
header = ['paperID'] + ['class_' + str(i) for i in range(y_pred.shape[1])]

# Take the ids from df_test, the frame the predictions were computed from, so
# every id is written next to its own row of y_pred.
paper_ids = df_test['paper_id'].tolist()

# newline='' lets the csv module control line endings; without it every row is
# followed by an empty line on Windows.
with open('sample_submission_grid.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    for paper_id, probabilities in zip(paper_ids, y_pred):
        writer.writerow([paper_id, *probabilities.tolist()])