In [1]:
import joblib
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

Load the data and build TF-IDF features

In [5]:
# Read the cleaned corpus; every TF-IDF experiment below draws its labels from `data`.
data = pd.read_csv("cleaned_data.csv")

# Build a TF-IDF representation of the abstracts (vocabulary capped at 5000
# terms, English stop words removed).
# NOTE(review): the vectoriser is fitted on every row here, before any
# train/test split is made, so held-out abstracts contribute to the
# vocabulary and IDF weights used for evaluation further down.
abstracts = data['clean_abstract']
vectoriser = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_features = vectoriser.fit_transform(abstracts)

# Persist both the fitted vectoriser and the resulting feature matrix.
for artefact, filename in (
    (vectoriser, "tfidf_vectorizer.pkl"),
    (tfidf_features, "tfidf_features.pkl"),
):
    joblib.dump(artefact, filename)

print("Data and features loaded")



Data and features loaded


Discipline classification model training

In [6]:
# Target for this section: the "Discipline" column.
y = data["Discipline"]

# Stratified 80/20 hold-out split of the TF-IDF matrix, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_features,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# scikit-learn baselines, both fitted directly on the string labels.
# `lr` stays bound at module level because the next cell saves it.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
for title, clf in (
    ("Discipline: Logistic Regression ", lr),
    ("Discipline: Random Forest", rf),
):
    print(title)
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))

# XGBoost is fitted on integer-encoded labels here; predictions are decoded
# back to the original class names before reporting.
print("Discipline: XGBoost")
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)  # computed but not used further in this cell

xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_enc)
y_pred_enc = xgb.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)
print(classification_report(y_test, y_pred))


Discipline: Logistic Regression 
                        precision    recall  f1-score   support

  Computer Engineering       0.77      0.83      0.80        24
      Computer Science       0.72      0.72      0.72        25
   Information Systems       0.87      0.80      0.83        25
Information Technology       0.78      0.84      0.81        25
  Software Engineering       0.87      0.80      0.83        25

              accuracy                           0.80       124
             macro avg       0.80      0.80      0.80       124
          weighted avg       0.80      0.80      0.80       124

Discipline: Random Forest
                        precision    recall  f1-score   support

  Computer Engineering       0.66      0.79      0.72        24
      Computer Science       0.74      0.56      0.64        25
   Information Systems       0.68      0.76      0.72        25
Information Technology       0.76      0.64      0.70        25
  Software Engineering       0.70      0.

Save the best model

In [7]:
# Persist the classifier currently bound to `lr` (in notebook order, the
# Discipline logistic regression fitted in the previous cell).
discipline_model_path = "discipline_model.joblib"
joblib.dump(lr, discipline_model_path)
print(f"Saved as {discipline_model_path}")


Saved as discipline_model.joblib


Subfield classification model training

In [8]:
# Target for this section: the "Subfield" column.
y = data["Subfield"]

# Stratified 80/20 hold-out split of the TF-IDF matrix, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_features,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# scikit-learn baselines, both fitted directly on the string labels.
# `lr` stays bound at module level because the next cell saves it.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
for title, clf in (
    ("Subfield: Logistic Regression", lr),
    ("Subfield: Random Forest", rf),
):
    print(title)
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))

# XGBoost is fitted on integer-encoded labels here; predictions are decoded
# back to the original class names before reporting.
print("Subfield: XGBoost")
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)  # computed but not used further in this cell

xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_enc)
y_pred_enc = xgb.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)
print(classification_report(y_test, y_pred))


Subfield: Logistic Regression
                                              precision    recall  f1-score   support

              Algorithms and Data Structures       1.00      0.80      0.89         5
Artificial Intelligence and Machine Learning       0.83      1.00      0.91         5
                       Computer Architecture       0.75      0.75      0.75         4
           Computer Systems and Architecture       1.00      0.60      0.75         5
                      Cyber-Physical Systems       0.50      0.60      0.55         5
                               Cybersecurity       0.75      0.60      0.67         5
                             Data Management       0.67      0.80      0.73         5
                    Decision Support Systems       1.00      1.00      1.00         5
                      Digital Systems Design       1.00      1.00      1.00         5
                 E-Government and E-Commerce       1.00      1.00      1.00         5
                       

In [9]:
# Persist the classifier currently bound to `lr` (in notebook order, the
# Subfield logistic regression fitted in the previous cell).
subfield_model_path = "subfield_model.joblib"
joblib.dump(lr, subfield_model_path)
print(f"Saved as {subfield_model_path}")


Saved as subfield_model.joblib


Methodology classification model training

In [10]:
# Target for this section: the "Methodology" column.
y = data["Methodology"]

# Stratified 80/20 hold-out split of the TF-IDF matrix, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_features,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# scikit-learn baselines, both fitted directly on the string labels.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
for title, clf in (
    ("Methodology: Logistic Regression", lr),
    ("Methodology: Random Forest", rf),
):
    print(title)
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))

# XGBoost is fitted on integer-encoded labels here; predictions are decoded
# back to the original class names before reporting.
print("Methodology: XGBoost")
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)  # computed but not used further in this cell

xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_enc)
y_pred_enc = xgb.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)
print(classification_report(y_test, y_pred))


Methodology: Logistic Regression
                          precision    recall  f1-score   support

  Design and Development       0.43      0.36      0.39        25
           Mixed Methods       0.29      0.24      0.26        25
             Qualitative       0.48      0.52      0.50        25
            Quantitative       0.27      0.32      0.29        25
Theoretical / Conceptual       0.52      0.54      0.53        24

                accuracy                           0.40       124
               macro avg       0.40      0.40      0.39       124
            weighted avg       0.40      0.40      0.39       124

Methodology: Random Forest
                          precision    recall  f1-score   support

  Design and Development       0.50      0.60      0.55        25
           Mixed Methods       0.71      0.40      0.51        25
             Qualitative       0.62      0.84      0.71        25
            Quantitative       0.59      0.52      0.55        25
Theoretical 

Embeddings using SBERT

In [12]:
# Re-read the cleaned dataset into its own frame for the embedding experiments.
df = pd.read_csv("cleaned_data.csv")

# Sentence-BERT encoder used to embed each document.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode one text per row of `df`.
# NOTE(review): this reads the "clean_text" column, whereas the TF-IDF
# features above were built from "clean_abstract" - confirm that the
# difference is intentional.
texts = df["clean_text"].tolist()
X_embed = model.encode(texts)

In [13]:
# Target columns to evaluate on the SBERT embeddings.
labels = ["Discipline", "Subfield", "Methodology"]

# Train and report one logistic-regression classifier per target column.
for label in labels:
    print(f"\n SBERT Embeddings: {label}\n")

    # Read the labels from `df`, the frame `X_embed` was encoded from, rather
    # than from the global `data`: `data` is re-bound to a different CSV
    # elsewhere in this notebook, so using it here would make the
    # label/embedding pairing depend on cell execution order.
    y = df[label]

    # Stratified 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_embed, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit logistic regression on the dense embeddings and report on the hold-out set.
    lr_embed = LogisticRegression(max_iter=1000, random_state=42)
    lr_embed.fit(X_train, y_train)
    y_pred = lr_embed.predict(X_test)
    print(classification_report(y_test, y_pred))


 SBERT Embeddings: Discipline

                        precision    recall  f1-score   support

  Computer Engineering       0.85      0.92      0.88        24
      Computer Science       0.81      0.68      0.74        25
   Information Systems       0.83      0.60      0.70        25
Information Technology       0.62      0.84      0.71        25
  Software Engineering       0.84      0.84      0.84        25

              accuracy                           0.77       124
             macro avg       0.79      0.78      0.77       124
          weighted avg       0.79      0.77      0.77       124


 SBERT Embeddings: Subfield

                                              precision    recall  f1-score   support

              Algorithms and Data Structures       1.00      0.80      0.89         5
Artificial Intelligence and Machine Learning       1.00      1.00      1.00         5
                       Computer Architecture       0.67      1.00      0.80         4
           Com

SBERT embeddings with XGBClassifier

In [15]:
# Target columns to evaluate on the SBERT embeddings.
labels = ["Discipline", "Subfield", "Methodology"]

# Train and report one XGBoost classifier per target column.
for label in labels:
    print(f"\n SBERT Embeddings: {label}\n")

    # Read the labels from `df`, the frame `X_embed` was encoded from, rather
    # than from the global `data`: `data` is re-bound to a different CSV
    # elsewhere in this notebook, so using it here would make the
    # label/embedding pairing depend on cell execution order.
    y = df[label]

    # XGBoost is fitted on integer codes here, so encode the class names first.
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    # Stratified 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_embed, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    # Fit XGBoost on the encoded training labels.
    xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
    xgb.fit(X_train, y_train)

    # Decode both predictions and ground truth back to class names so the
    # report is printed with readable labels.
    y_pred_enc = xgb.predict(X_test)
    y_pred = le.inverse_transform(y_pred_enc)
    y_test_labels = le.inverse_transform(y_test)
    print(classification_report(y_test_labels, y_pred))



 SBERT Embeddings: Discipline

                        precision    recall  f1-score   support

  Computer Engineering       0.67      0.83      0.74        24
      Computer Science       0.70      0.56      0.62        25
   Information Systems       0.67      0.64      0.65        25
Information Technology       0.60      0.60      0.60        25
  Software Engineering       0.84      0.84      0.84        25

              accuracy                           0.69       124
             macro avg       0.69      0.69      0.69       124
          weighted avg       0.69      0.69      0.69       124


 SBERT Embeddings: Subfield

                                              precision    recall  f1-score   support

              Algorithms and Data Structures       0.80      0.80      0.80         5
Artificial Intelligence and Machine Learning       0.71      1.00      0.83         5
                       Computer Architecture       0.60      0.75      0.67         4
           Com

Methodology classification boosted with keywords

In [14]:
# Load the dataset variant that carries the keyword-indicator columns.
# Note: this re-binds the global `data` used by the cells that follow.
data = pd.read_csv("data_plus_keywords.csv")

# Reload the TF-IDF matrix persisted by the first cell of this notebook.
# (joblib uses pickle under the hood: only load files this notebook wrote.)
tfidf_features = joblib.load("tfidf_features.pkl")

# The TF-IDF matrix and the keyword columns are stacked row-by-row, so both
# sources must describe the same documents. Fail early with a clear message
# instead of a dimension error from hstack.
# NOTE(review): this checks row counts only; it assumes both files list the
# documents in the same order - confirm.
if tfidf_features.shape[0] != len(data):
    raise ValueError(
        f"Row mismatch: tfidf_features has {tfidf_features.shape[0]} rows "
        f"but data_plus_keywords.csv has {len(data)}"
    )

# Keyword (rule-based) features are the columns whose name starts with "kw_".
keyword_columns = [col for col in data.columns if col.startswith("kw_")]
rule_features = data[keyword_columns]

# Append the keyword columns to the right of the TF-IDF columns, keeping the
# result sparse.
combined_features = hstack([tfidf_features, csr_matrix(rule_features.values)])





In [15]:
# Target for this section: the "Methodology" column.
y = data["Methodology"]

# Stratified 80/20 hold-out split of the combined (TF-IDF + keyword) matrix,
# keeping the labels as strings.
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# scikit-learn baselines, both fitted directly on the string labels.
# `lr` stays bound at module level because a later cell saves it.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
for title, clf in (
    ("Methodology: Logistic Regression", lr),
    ("Methodology: Random Forest", rf),
):
    print(title)
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))

# XGBoost is fitted on integer-encoded labels here; predictions are decoded
# back to the original class names before reporting.
print("Methodology: XGBoost")
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)  # computed but not used further in this cell

xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_enc)
y_pred_enc = xgb.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)
print(classification_report(y_test, y_pred))



Methodology: Logistic Regression
                          precision    recall  f1-score   support

  Design and Development       0.90      0.76      0.83        25
           Mixed Methods       0.88      0.84      0.86        25
             Qualitative       0.92      0.92      0.92        25
            Quantitative       1.00      0.88      0.94        25
Theoretical / Conceptual       0.75      1.00      0.86        24

                accuracy                           0.88       124
               macro avg       0.89      0.88      0.88       124
            weighted avg       0.89      0.88      0.88       124

Methodology: Random Forest
                          precision    recall  f1-score   support

  Design and Development       0.68      0.84      0.75        25
           Mixed Methods       0.71      0.40      0.51        25
             Qualitative       0.70      0.84      0.76        25
            Quantitative       0.73      0.64      0.68        25
Theoretical 

Save the best model

In [16]:
# Persist the classifier currently bound to `lr`. In notebook order that is
# the Methodology logistic regression fitted on the combined TF-IDF + keyword
# features, so inference inputs need the same column layout.
methodology_model_path = "methodology_model.joblib"
joblib.dump(lr, methodology_model_path)
print(f"Saved as {methodology_model_path}")

Saved as methodology_model.joblib
