
# Scopus Journal Filtering (Same Methods)

This notebook mirrors your original workflow and relies **only** on your LLM-based filter:
- `os.chdir("/content/HRreview/review")`
- `pd.read_csv(..., encoding="latin-1")` then `replace("NaN", np.nan, inplace=True)`
- Drops **Coverage** and **Article Language** from the **working copy** *before* LLM filtering
- Uses `column_name = ['Source Title']`
- Builds `context` from `about.txt` plus your keyword list
- Imports `journal_filter_gpt` from your Scripts folder
- Calls `journal_filter_gpt.filter_journal_dataframe(df, client, column_name, context, "list", 45)`


In [4]:

import os
import pandas as pd
import numpy as np




scopus = pd.read_csv("review/data/Journals/Scopus/ext_list_Jul_2025_Scopus_Sources_Jul.csv")
print(f"Loaded {len(scopus)} rows")
scopus.head(2)


FileNotFoundError: [Errno 2] No such file or directory: 'review/data/Journals/Scopus/ext_list_Jul_2025_Scopus_Sources_Jul.csv'

In [None]:

# Remove date and language columns *from the working copy* before LLM filtering
WORKING_DROP_COLS = [
    'Article Language in Source (Three-Letter ISO Language Codes)',
    'Coverage',
]

scopus_work = scopus.copy()
for col in WORKING_DROP_COLS:
    if col in scopus_work.columns:
        scopus_work.drop(columns=[col], inplace=True)

# Keep the exact column name you used
column_name = ['Source Title']

# Prepare the minimal frame to send to LLM
scopus_for_llm = scopus_work[column_name].dropna().drop_duplicates().reset_index(drop=True)
print(f"Unique titles sent to LLM: {len(scopus_for_llm)}")
scopus_for_llm.head(5)


In [None]:

# Your utility to read context
def read_txt(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

# Build context exactly like your snippet
context = read_txt("../../../Downloads/about.txt")
context = context + """these are keywords refering to the subject we are studying :[Human in the loop",
        "AI-human interaction",
        "Decision Making",
        "AI recommendation"
        ,"Trust in recommendation",
        "Explanable Artificial inteligence",
        'Artificial intelligence Bias',
        'Operational Analytics Decision',
        'AI assisted decision making',
        'Trust in automation',
        'Automation bias',
        'Human-AI Collaboration',
        'HAIC Evaluation']"""
print("Context prepared.")


In [None]:

# Import your LLM filter and OpenAI client, preserving your style
import sys
sys.path.append('/content/HRreview/review/Scripts')

import journal_filter_gpt

# OpenAI client initialization like your pattern
from openai import OpenAI
try:
    import userdata  # if present in your environment
    gpt = userdata.get('OPENAI_API_KEY')
except Exception:
    gpt = os.environ.get('OPENAI_API_KEY')

if not gpt:
    raise ValueError("OpenAI API key not found. Set via userdata.get('OPENAI_API_KEY') or env var OPENAI_API_KEY.")

client = OpenAI(api_key=gpt)
print("OpenAI client initialized.")


In [None]:

# Call your LLM-based journal filter EXACTLY like your snippet
f = journal_filter_gpt.filter_journal_dataframe(
    scopus_for_llm,
    client,
    column_name,
    context ,
    "list",  # can be "dict" or "list"
    45
)

type(f), (len(f) if isinstance(f, list) else None)


In [None]:

# Normalize possible returns (list/dict/DataFrame) but keep logic minimal
if isinstance(f, list):
    accepted_titles = set(map(str, f))
elif isinstance(f, dict):
    # Try common keys; if your function returns dict, adjust here if needed
    for k in ("keep", "accepted", "titles", "result"):
        if k in f and isinstance(f[k], (list, set, tuple)):
            accepted_titles = set(map(str, f[k]))
            break
    else:
        raise ValueError("Unexpected dict structure from LLM filter. Please return a list of titles.")
elif isinstance(f, pd.DataFrame) and 'Source Title' in f.columns:
    accepted_titles = set(map(str, f['Source Title'].dropna().unique().tolist()))
else:
    raise TypeError("Unexpected return type from LLM filter. Expected list/dict/DataFrame with 'Source Title'.")

print(f"Accepted titles: {len(accepted_titles)}")
list(sorted(list(accepted_titles))[:10])


In [None]:

# Filter the original scopus DataFrame by accepted titles and save
scopus_filtered = scopus[scopus['Source Title'].astype(str).isin(accepted_titles)].copy()
scopus_filtered.reset_index(drop=True, inplace=True)

out_path = "data/Journals/Scopus/Scopus_list_sources_llm_filtered.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
scopus_filtered.to_csv(out_path, index=False)
print(f"Saved filtered journals to: {out_path} (rows: {len(scopus_filtered)})")
scopus_filtered.head(5)
