In [1]:
import json
import time
import string
import re
import requests
import feedparser
import pandas as pd
import joblib
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from typing import List, Dict

Discipline to subfields keywords mapping

In [2]:
# Discipline to subfields keywords mapping

discipline_keywords = {
    "Computer Science": {
        "Algorithms and Data Structures": ["algorithm", "data structure", "graph", "tree", "heap", "sorting", "searching", "complexity", "recursion", "hashing"],
        "Artificial Intelligence and Machine Learning": ["machine learning", "deep learning", "neural network", "supervised", "unsupervised", "reinforcement learning", "AI", "classification", "prediction", "regression"],
        "Computer Systems and Architecture": ["operating system", "kernel", "process", "memory", "cpu", "thread", "architecture", "cache", "multicore", "interrupt"],
        "Human-Computer Interaction": ["user interface", "usability", "HCI", "interaction", "user experience", "UX", "design", "cognitive", "feedback", "accessibility"],
        "Software Engineering Principles": ["software development", "design pattern", "architecture", "refactoring", "agile", "scrum", "version control", "testing", "modularity", "code quality"]
    },
    "Software Engineering": {
        "Software Development Processes": ["agile", "scrum", "waterfall", "lifecycle", "iteration", "process model", "CI/CD", "devops", "release", "deployment"],
        "Software Design and Architecture": ["architecture", "design pattern", "modularity", "system design", "layered", "MVC", "abstraction", "component", "interface", "reuse"],
        "Software Testing and Quality Assurance": ["testing", "unit test", "integration test", "test case", "test suite", "coverage", "assertion", "bug", "defect", "QA"],
        "Requirements Engineering": ["requirement", "elicitation", "stakeholder", "user story", "specification", "goal modeling", "validation", "traceability", "analysis", "needs"],
        "Project Management": ["project", "planning", "scheduling", "cost estimation", "milestone", "risk management", "Gantt", "team", "roles", "budget"]
    },
    "Information Systems": {
        "Enterprise Systems": ["ERP", "CRM", "enterprise", "integration", "SAP", "system deployment", "workflow", "enterprise application", "business logic", "intranet"],
        "Decision Support Systems": ["DSS", "decision making", "analytics", "data warehouse", "BI", "what-if", "dashboards", "support tool", "modelling", "simulation"],
        "Knowledge Management": ["knowledge sharing", "repository", "ontology", "knowledge base", "tacit knowledge", "expert system", "semantic", "taxonomy", "knowledge transfer", "capture"],
        "Information Security": ["access control", "encryption", "security", "authentication", "cybersecurity", "firewall", "data breach", "confidentiality", "risk", "compliance"],
        "E-Government and E-Commerce": ["e-government", "e-service", "e-commerce", "transaction", "digital platform", "public service", "online portal", "citizen", "service quality", "B2C"]
    },
    "Information Technology": {
        "Network and Infrastructure": ["network", "server", "router", "infrastructure", "LAN", "WAN", "firewall", "switch", "IP address", "protocol"],
        "Cybersecurity": ["malware", "vulnerability", "intrusion detection", "zero trust", "TLS", "phishing", "cyber attack", "penetration", "incident", "threat"],
        "IT Service Management": ["ITIL", "helpdesk", "ticketing", "support", "service delivery", "SLA", "incident", "configuration", "availability", "uptime"],
        "Data Management": ["database", "data integrity", "governance", "ETL", "data quality", "metadata", "data warehouse", "big data", "preprocessing", "storage"],
        "User Support": ["help desk", "technical support", "user training", "ticket", "issue resolution", "FAQ", "call centre", "troubleshoot", "remote support", "documentation"]
    },
    "Computer Engineering": {
        "Hardware Systems": ["hardware", "microcontroller", "microprocessor", "circuit", "GPIO", "sensor", "actuator", "register", "peripheral", "driver"],
        "Computer Architecture": ["architecture", "instruction set", "pipeline", "cache", "ALU", "control unit", "memory hierarchy", "fetch", "decode", "execute"],
        "Digital Systems Design": ["VHDL", "Verilog", "FPGA", "logic gate", "flip-flop", "circuit design", "state machine", "clock", "synchronous", "combinational"],
        "Signal Processing": ["FFT", "filter", "signal", "modulation", "frequency", "noise", "DSP", "audio", "image", "waveform"],
        "Cyber-Physical Systems": ["IoT", "real-time", "sensor", "embedded", "CPS", "control", "autonomous", "robot", "actuation", "feedback loop"]
    }
}

# Save to JSON file
with open("discipline_keywords.json", "w") as f:
    json.dump(discipline_keywords, f, indent=4)

print("Saved as discipline_keywords.json")

Saved as discipline_keywords.json


Discipline and subfield mapping

In [3]:
# Create and save discipline subfield mapping
discipline_mapping = {
    discipline: list(subfields.keys())
    for discipline, subfields in discipline_keywords.items()
}

with open("discipline_mapping.json", "w") as f:
    json.dump(discipline_mapping, f, indent=4)

print("Saved as discipline_mapping.json")


Saved as discipline_mapping.json


Methodology keywords mapping

In [31]:
# Define methodology keywords

methodology_keywords = {
    "Quantitative": [
        "quantitative", "statistical", "survey", "questionnaire", "regression", "correlation",
        "t-test", "f-test", "anova", "chi-square", "numerical analysis", "metric",
        "hypothesis testing", "likert scale", "parametric", "nonparametric", "sampling",
        "statistical model", "structural equation modeling", "variance", "data analysis",
        "empirical analysis", "experimental design", "descriptive statistics", "inferential statistics"
    ],
    "Qualitative": [
        "qualitative", "interview", "interviews", "semi-structured", "case study",
        "focus group", "observation", "fieldwork", "ethnography", "thematic analysis",
        "content analysis", "discourse analysis", "grounded theory", "narrative analysis",
        "interpretive", "phenomenology", "manual coding", "participant observation",
        "open-ended", "in-depth interview", "interpretivist"
    ],
    "Mixed Methods": [
        "mixed methods", "mixed methodology", "qualitative and quantitative", "quantitative and qualitative",
        "both qualitative and quantitative", "survey and interview", "interviews and surveys", "combined methods",
        "combining methods", "integration of methods", "multi-method study", "triangulation", 
        "integrated approach", "mixed research design", "complementary methods"
    ],
    "Design and Development": [
        "design science", "prototype", "framework", "architecture", "implementation", "system design",
        "tool development", "software design", "we propose", "we build", "we implement",
        "development process", "model-driven", "platform design", "design approach",
        "developed a tool", "technical artefact", "solution development", "design-oriented"
    ],
    "Theoretical / Conceptual": [
        "theoretical", "conceptual", "taxonomy", "classification scheme", "analytical model",
        "argumentation", "framework development", "literature review", "model proposal",
        "reference model", "theoretical framework", "ontology", "perspective", "review of the literature",
        "conceptual model", "mathematical theory", "historical overview", "evolution of", "principle-based",
        "conceptual discussion", "theoretical lens"
    ]
}

with open("methodology_keywords.json", "w") as f:
    json.dump(methodology_keywords, f, indent=4)
print("Saved as methodology_keywords.json")

Load discipline and methodology kewords mappings

In [2]:
# Load kewords mappings
with open("discipline_keywords.json", "r") as f:
    discipline_keywords = json.load(f)
with open("methodology_keywords.json", "r") as f:
    methodology_keywords = json.load(f)
all_methods = list(methodology_keywords.keys())


Classifier for methodology Mixed methods, keyword and balance rules 

In [6]:
# Classifier 
def classify_methodology(text, keywords):
    text = text.lower()

    quant_score = sum(1 for kw in keywords["Quantitative"] if kw in text)
    qual_score = sum(1 for kw in keywords["Qualitative"] if kw in text)
    mixed_score = sum(1 for kw in keywords["Mixed Methods"] if kw in text)

    # If both quant and qual appear than Mixed
    if quant_score >= 2 and qual_score >= 2:
        if abs(quant_score - qual_score) <= 1:
            return "Mixed Methods"
        # Otherwise return dominant side
        return "Quantitative" if quant_score > qual_score else "Qualitative"

    # Strong Mixed 
    if mixed_score >= 2:
        return "Mixed Methods"

    # Fallback to best single method
    scores = {
        method: sum(1 for kw in kw_list if kw in text)
        for method, kw_list in keywords.items()
        if method not in ["Mixed Methods"]
    }

    top_method, score = max(scores.items(), key=lambda x: x[1])
    return top_method if score >= 2 else "Unknown"


Scrappers functions 

In [4]:
# Scrape data from CrossRef 
def scrape_crossref(query: str, n: int = 1000):
    n = min(n, 1000)  # Limit to 1000 results (max limit)
    try:
        # API query
        url = "https://api.crossref.org/works"
        params = {"query": query, "rows": n, "filter": "type:journal-article"}
        r = requests.get(url, params=params, timeout=10) # Send request
        r.raise_for_status()
        items = r.json()["message"]["items"]
        # Extract relevant fields
        return [
            {
                "title": i["title"][0] if isinstance(i["title"], list) else i["title"],
                "abstract": i.get("abstract", ""),
                "url": i["URL"],
                "source": "CrossRef"
            }
            for i in items if "abstract" in i
        ]
    except Exception as e:
        print("CrossRef error:", e)
        return []

# Scrape data from arXiv
def scrape_arxiv(query: str, n: int = 300):
    n = min(n, 300) # Limit to 300 results 
    try:
         # API query
        url = f"http://export.arxiv.org/api/query?search_query=all:{query.replace(' ', '+')}&start=0&max_results={n}"
        feed = feedparser.parse(url)
        # Extract relevant fields
        return [
            {
                "title": entry.title,
                "abstract": entry.summary,
                "url": entry.link,
                "source": "arXiv"
            }
            for entry in feed.entries
        ]
    except Exception as e:
        print("arXiv error:", e)
        return []

Collect articles

In [5]:
# Scrapers and storage
scrapers = [scrape_crossref, scrape_arxiv]
collected = []
scraper_index = 0

# Loop through discipline,subfield, methodology combinations
for discipline, subfields in discipline_keywords.items():
    print(f"\nDiscipline: {discipline}")
    for subfield in subfields:
        print(f"Subfield: {subfield}")
        for method in all_methods:
            found = []
            # Round scraping with limits
            current_index = scraper_index % len(scrapers)
            attempts = 0
            max_attempts = len(scrapers) * 5

            # Try to get 5 articles or max attempts reached
            while len(found) < 5 and attempts < max_attempts:
                scraper = scrapers[current_index]
                print(f"  Trying {scraper.__name__} for {method}")
                try:
                    results = scraper(subfield)
                    for article in results:
                        if len(found) >= 5:
                            break
                        predicted = classify_methodology(article["abstract"], methodology_keywords)
                        if predicted == method and article not in found:
                            found.append(article)
                except Exception as e:
                    print(f"  Error with {scraper.__name__}: {e}")
                # Move to next scraper
                current_index = (current_index + 1) % len(scrapers)
                attempts += 1
                time.sleep(2)  # Polite pause

            # Update scraper index 
            scraper_index = (scraper_index + 1) % len(scrapers)

            # Store articles with metadata
            for article in found:
                collected.append({
                    "Discipline": discipline,
                    "Subfield": subfield,
                    "Methodology": method,
                    "Title": article["title"],
                    "Abstract": article["abstract"],
                    "URL": article["url"],
                    "Source": article["source"]
                })

            print(f"  {method}: {len(found)} articles found")



Discipline: Computer Science
Subfield: Algorithms and Data Structures
  Trying scrape_crossref for Quantitative
  Quantitative: 5 articles found
  Trying scrape_arxiv for Qualitative
  Trying scrape_crossref for Qualitative
  Qualitative: 5 articles found
  Trying scrape_crossref for Mixed Methods
  Mixed Methods: 5 articles found
  Trying scrape_arxiv for Design and Development
  Design and Development: 5 articles found
  Trying scrape_crossref for Theoretical / Conceptual
  Theoretical / Conceptual: 5 articles found
Subfield: Artificial Intelligence and Machine Learning
  Trying scrape_arxiv for Quantitative
  Trying scrape_crossref for Quantitative
  Quantitative: 5 articles found
  Trying scrape_crossref for Qualitative
  Qualitative: 5 articles found
  Trying scrape_arxiv for Mixed Methods
  Trying scrape_crossref for Mixed Methods
  Trying scrape_arxiv for Mixed Methods
  Trying scrape_crossref for Mixed Methods
  Trying scrape_arxiv for Mixed Methods
  Trying scrape_crossref fo

In [16]:
# Save articles
df = pd.DataFrame(collected)
df.to_csv("collected_articles.csv", index=False)
print("Saved as collected_articles.csv")


Saved as collected_articles.csv


Data preprocessing

In [8]:
# Load stopwords and spaCy model
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

# Load data
data = pd.read_csv("collected_articles.csv")

# Cleaning
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if token.lemma_ not in stop_words
        and not token.is_space
        and token.is_alpha
        and len(token.lemma_) > 2
    ]
    return " ".join(tokens)

# Extract syntactic features
def extract_syntactic_features(text):
    doc = nlp(str(text))
    pos_counts = {"NOUN": 0, "VERB": 0, "ADJ": 0, "ADV": 0}
    sent_lengths = []
    ent_counts = {"ORG": 0, "GPE": 0, "DATE": 0, "PERSON": 0}

    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1

    for sent in doc.sents:
        sent_lengths.append(len(sent))

    for ent in doc.ents:
        if ent.label_ in ent_counts:
            ent_counts[ent.label_] += 1

    total_tokens = len([t for t in doc if t.is_alpha])
    pos_ratios = {k.lower() + "_ratio": (v / total_tokens if total_tokens else 0) for k, v in pos_counts.items()}
    avg_sent_length = sum(sent_lengths) / len(sent_lengths) if sent_lengths else 0

    features = {
        **pos_ratios,
        "avg_sentence_length": avg_sent_length,
        **{f"ner_{k.lower()}": v for k, v in ent_counts.items()}
    }
    return pd.Series(features)

# Apply cleaning and feature extraction
data["clean_title"] = data["Title"].apply(clean_text)
data["clean_abstract"] = data["Abstract"].apply(clean_text)
data["clean_text"] = data["clean_title"] + " " + data["clean_abstract"]

# Extract syntactic features
syntactic_features = data["clean_abstract"].apply(extract_syntactic_features)
data = pd.concat([data, syntactic_features], axis=1)

# TF-IDF vectorisation
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(data["clean_text"])

# Save results
data.to_csv("cleaned_data.csv", index=False)
print("Saved as cleaned_data.csv")

joblib.dump(tfidf_features, "tfidf_features.pkl")
print("Saved as tfidf_features.pkl")

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
print("Saved as tfidf_vectorizer.pkl")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nataliribeiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saved as cleaned_data.csv
Saved as tfidf_features.pkl
Saved as tfidf_vectorizer.pkl


Data plus methodology keywords

In [16]:
# Load the dataset
data = pd.read_csv("cleaned_data.csv")

# Load methodology keywords
with open("methodology_keywords.json", "r") as f:
    methodology_keywords = json.load(f)

# Flatten all keywords across all methods
all_keywords = []
for keywords in methodology_keywords.values():
    all_keywords.extend(keywords)

# Remove duplicates and lowercase
all_keywords = list(set([kw.lower() for kw in all_keywords]))

# Function to generate one binary feature per keyword
def extract_methodology_features(text, keyword_list):
    text_lower = str(text).lower()
    return [1 if kw in text_lower else 0 for kw in keyword_list]

# Apply to all abstracts
keyword_features = data["clean_abstract"].apply(lambda x: extract_methodology_features(x, all_keywords))

# Convert to DataFrame
keyword_df = pd.DataFrame(keyword_features.tolist(), columns=[f"kw_{kw}" for kw in all_keywords])

# Merge with original data
data = pd.concat([data, keyword_df], axis=1)

# Save updated dataset
data.to_csv("data_plus_keywords.csv", index=False)
print("Saved as data_plus_keywords.csv" )



Saved as data_plus_keywords.csv
