In [7]:
import os
import json
import pandas as pd
from pathlib import Path
import numpy as np
import ast
from tqdm import tqdm

In [2]:
import os
import json
import pandas as pd
import numpy as np
import ast  # For safely parsing string lists

# --- ASJC Code Translation Map ---
# A partial map based on your data and common codes.
ASJC_MAP = {
    '1000': 'Multidisciplinary',
    '1100': 'Agricultural and Biological Sciences',
    '1200': 'Arts and Humanities',
    '1300': 'Biochemistry, Genetics and Molecular Biology',
    '1303': 'Biochemistry',
    '1400': 'Business, Management and Accounting',
    '1500': 'Chemical Engineering',
    '1600': 'Chemistry',
    '1602': 'Analytical Chemistry',
    '1607': 'Spectroscopy',
    '1700': 'Computer Science',
    '1800': 'Decision Sciences',
    '1900': 'Earth and Planetary Sciences',
    '1909': 'Geotechnical Engineering and Engineering Geology',
    '2000': 'Economics, Econometrics and Finance',
    '2100': 'Energy',
    '2200': 'Engineering',
    '2208': 'Electrical and Electronic Engineering',
    '2209': 'Industrial and Manufacturing Engineering',
    '2211': 'Mechanics of Materials',
    '2300': 'Environmental Science',
    '2304': 'Environmental Chemistry',
    '2305': 'Environmental Engineering',
    '2311': 'Waste Management and Disposal',
    '2400': 'Immunology and Microbiology',
    '2500': 'Materials Science',
    '2504': 'Materials Chemistry',
    '2508': 'Surfaces, Coatings and Films',
    '2600': 'Mathematics',
    '2700': 'Medicine',
    '2739': 'Occupational Health',
    '2800': 'Neuroscience',
    '2900': 'Nursing',
    '3000': 'Pharmacology, Toxicology and Pharmaceutics',
    '3100': 'Physics and Astronomy',
    '3104': 'Condensed Matter Physics',
    '3110': 'Surfaces and Interfaces',
    '3200': 'Psychology',
    '3202': 'Applied Psychology',
    '3300': 'Social Sciences',
    '3400': 'Veterinary',
    '3500': 'Dentistry',
    '3600': 'Health Professions'
}

# --- Helper Functions for Data Extraction ---

def get_scopus_id(itemidlist):
    """Safely extracts the Scopus ID."""
    try:
        if not itemidlist or 'itemid' not in itemidlist:
            return None
        for item in itemidlist.get('itemid', []):
            if item.get('@idtype') == 'SCP':
                return item.get('$')
    except Exception:
        return None
    return None

def get_authors(author_groups):
    """Extracts and de-duplicates author names."""
    try:
        authors = {}
        for group in author_groups:
            author_data = group.get('author', [{}])[0]
            auid = author_data.get('@auid')
            name = author_data.get('preferred-name', {}).get('ce:indexed-name')
            if auid and name:
                authors[auid] = name
        return "; ".join(authors.values())
    except Exception:
        return None

def get_affiliations(author_groups):
    """Extracts and de-duplicates unique organization names."""
    try:
        affil_set = set()
        for group in author_groups:
            affil = group.get('affiliation', {})
            org_data = affil.get('organization', {})
            org_name = None
            if isinstance(org_data, list):
                org_name = ", ".join([org.get('$') for org in org_data if org.get('$')])
            elif isinstance(org_data, dict):
                org_name = org_data.get('$')
            if org_name:
                affil_set.add(org_name)
        return "; ".join(affil_set)
    except Exception:
        return None

def get_asjc_raw(classificationgroup):
    """Safely extracts the raw ASJC data (could be a string or a list string)."""
    try:
        if not classificationgroup or 'classifications' not in classificationgroup:
            return np.nan
        
        classifications = classificationgroup.get('classifications', [])
        asjc_codes = []
        
        for classification in classifications:
            if classification.get('@type') == 'ASJC':
                asjc_codes.append(classification.get('classification'))
        
        if not asjc_codes:
            return np.nan
        
        # If only one code, return it as a simple string
        if len(asjc_codes) == 1:
            return asjc_codes[0]
        
        # Re-build the string to look like the messy data
        list_of_dicts_str = str([{"$": code} for code in asjc_codes])
        return list_of_dicts_str

    except Exception:
        return np.nan

def translate_asjc(raw_asjc_value, code_map):
    """
    Translates the raw ASJC value (string or list string) into a
    human-readable, comma-separated string.
    """
    
    # --- THIS IS THE FIX ---
    # Replaced `if pd.isna(raw_asjc_value):`
    # with `if np.all(pd.isna(raw_asjc_value)):`
    # This handles cases where raw_asjc_value is an array.
    if np.all(pd.isna(raw_asjc_value)):
        return np.nan
    
    s_value = str(raw_asjc_value).strip()
    codes_to_translate = []
    
    # Case 1: It's a messy list string like "[{'$': '2208'}, {'$': '2504'}]"
    if s_value.startswith('['):
        try:
            list_data = ast.literal_eval(s_value)
            codes_to_translate = [item['$'] for item in list_data if isinstance(item, dict) and '$' in item]
        except Exception:
            return "Parsing Error"
            
    # Case 2: It's a simple code string like "2700"
    else:
        codes_to_translate = [s_value]
        
    # Translate the codes
    translated_names = [code_map.get(code, f"Unknown Code ({code})") for code in codes_to_translate]
    
    return ", ".join(translated_names)

# --- Main Script ---

def process_folder(folder_path, output_csv):
    """
    Scans a folder for JSON files, extracts publication data,
    and saves it to a CSV file.
    """
    all_files_data = []  # List to store a dict for each file

    print(f"Scanning files in: {folder_path}")
    
    try:
        item_names = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Error: Folder not found at {folder_path}")
        print("Please update the 'folder_path' variable to the correct location.")
        return
    except Exception as e:
        print(f"Error listing files: {e}")
        return

    # Loop through each item in the folder
    for item_name in item_names:
        file_path = os.path.join(folder_path, item_name)
        
        # 1. Skip if it's a directory
        if not os.path.isfile(file_path):
            continue
            
        # 2. Skip the output file itself to prevent reading it
        if item_name == output_csv:
            continue
            
        # 3. Skip the previously created CSV file
        if item_name == "extracted_publication_data.csv":
            continue

        try:
            # Read file content
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Try to convert content to JSON
            data = json.loads(content)
            
            # --- Start Extraction ---
            item = data.get('abstracts-retrieval-response', {}).get('item', {})
            if not item:
                raise ValueError("Not a valid abstracts-retrieval-response file")
                
            bibrecord = item.get('bibrecord', {})
            head = bibrecord.get('head', {})
            item_info = bibrecord.get('item-info', {})
            source = head.get('source', {})

            # Simple extractions
            chapter_title = head.get('citation-title')
            publication_year = source.get('publicationyear', {}).get('@first')
            book_title = source.get('sourcetitle')
            publisher = source.get('publisher', {}).get('publishername')
            doi = item_info.get('itemidlist', {}).get('ce:doi')

            # Complex extractions
            scopus_id = get_scopus_id(item_info.get('itemidlist', {}))
            authors = get_authors(head.get('author-group', []))
            affiliations = get_affiliations(head.get('author-group', []))
            
            # ASJC processing
            asjc_raw = get_asjc_raw(head.get('enhancement', {}).get('classificationgroup'))
            asjc_translation = translate_asjc(asjc_raw, ASJC_MAP)

            # Store in a dict
            row_data = {
                "file_name": item_name,
                "chapter_title": chapter_title,
                "doi": doi,
                "scopus_id": scopus_id,
                "publication_year": publication_year,
                "book_title": book_title,
                "publisher": publisher,
                "authors": authors,
                "affiliation": affiliations,
                "ASJC": asjc_raw,  # The original, messy column
                "ASJC_translation": asjc_translation # The new, translated column
            }
            
            all_files_data.append(row_data)
            print(f"Successfully processed: {item_name}")

        except json.JSONDecodeError:
            # This is now expected! It will skip non-JSON files.
            print(f"Skipping file: {item_name} (Not valid JSON content)")
        except Exception as e:
            # This will catch other errors, like the one you saw
            print(f"Skipping file: {item_name} (Error: {e})")

    # --- Create DataFrame ---
    if all_files_data:
        df = pd.DataFrame(all_files_data)
        
        # Reorder columns
        cols_order = [
            "file_name", "chapter_title", "doi", "scopus_id", 
            "publication_year", "book_title", "publisher", "authors", 
            "affiliation", "ASJC", "ASJC_translation"
        ]
        # Make sure all columns exist before reordering
        df = df[cols_order]
        
        df.to_csv(output_csv, index=False)
        
        print(f"\nSuccessfully processed {len(all_files_data)} file(s).")
        print(f"Data saved to '{output_csv}'")
        print("\nDataFrame Head:")
        print(df.head())
    else:
        print("No valid publication JSON files were found or processed.")

# --- RUN THE SCRIPT ---

# *** IMPORTANT ***
# Set this variable to the path of your folder containing the files.
# Example for Windows: "C:/work/data sci/Project-Data-Sci/ScopusData2018-2023/2018"
# Example for Mac/Linux: "/home/user/my_project/scopus_data/2018"
# Example for the tool's environment (if files are uploaded): "."
folder_to_scan = "C:/work/data sci/Project-Data-Sci/ScopusData2018-2023/2018" 

output_file_name = "extracted_publications_with_translation.csv"

# Run the main function
process_folder(folder_to_scan, output_file_name)

Scanning files in: C:/work/data sci/Project-Data-Sci/ScopusData2018-2023/2018
Error: Folder not found at C:/work/data sci/Project-Data-Sci/ScopusData2018-2023/2018
Please update the 'folder_path' variable to the correct location.


In [3]:
import os
import json
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm

# --- 1. ASJC Code Translation Map ---
ASJC_MAP = {
    '1000': 'Multidisciplinary', '1100': 'Agricultural and Biological Sciences',
    '1200': 'Arts and Humanities', '1300': 'Biochemistry, Genetics and Molecular Biology',
    '1303': 'Biochemistry', '1400': 'Business, Management and Accounting',
    '1500': 'Chemical Engineering', '1600': 'Chemistry', '1602': 'Analytical Chemistry',
    '1607': 'Spectroscopy', '1700': 'Computer Science', '1800': 'Decision Sciences',
    '1900': 'Earth and Planetary Sciences', '1909': 'Geotechnical Engineering',
    '2000': 'Economics, Econometrics and Finance', '2100': 'Energy',
    '2200': 'Engineering', '2208': 'Electrical and Electronic Engineering',
    '2209': 'Industrial and Manufacturing Engineering', '2211': 'Mechanics of Materials',
    '2300': 'Environmental Science', '2304': 'Environmental Chemistry',
    '2305': 'Environmental Engineering', '2311': 'Waste Management and Disposal',
    '2400': 'Immunology and Microbiology', '2500': 'Materials Science',
    '2504': 'Materials Chemistry', '2508': 'Surfaces, Coatings and Films',
    '2600': 'Mathematics', '2700': 'Medicine', '2739': 'Occupational Health',
    '2800': 'Neuroscience', '2900': 'Nursing', '3000': 'Pharmacology',
    '3100': 'Physics and Astronomy', '3104': 'Condensed Matter Physics',
    '3110': 'Surfaces and Interfaces', '3200': 'Psychology', '3202': 'Applied Psychology',
    '3300': 'Social Sciences', '3400': 'Veterinary', '3500': 'Dentistry',
    '3600': 'Health Professions'
}

# --- 2. Helper Functions ---

def get_nested(data, path, default=None):
    """Safely navigates nested dictionaries."""
    try:
        for key in path:
            data = data[key]
        return data
    except (KeyError, TypeError, AttributeError):
        return default

def get_scopus_id(itemidlist):
    try:
        if not itemidlist or 'itemid' not in itemidlist: return None
        for item in itemidlist.get('itemid', []):
            if item.get('@idtype') == 'SCP': return item.get('$')
    except: return None

def get_authors(author_groups):
    try:
        if not author_groups: return None
        if isinstance(author_groups, dict): author_groups = [author_groups]
        authors = {}
        for group in author_groups:
            alist = group.get('author', [])
            if isinstance(alist, dict): alist = [alist]
            for a in alist:
                if a.get('@auid') and a.get('preferred-name', {}).get('ce:indexed-name'):
                    authors[a['@auid']] = a['preferred-name']['ce:indexed-name']
        return "; ".join(authors.values())
    except: return None

def get_affiliations(author_groups):
    try:
        if not author_groups: return None
        if isinstance(author_groups, dict): author_groups = [author_groups]
        affil_set = set()
        for group in author_groups:
            affil = group.get('affiliation', {})
            org = affil.get('organization', {})
            if isinstance(org, list):
                name = ", ".join([o.get('$') for o in org if o.get('$')])
            elif isinstance(org, dict):
                name = org.get('$')
            else: name = None
            if name: affil_set.add(name)
        return "; ".join(affil_set)
    except: return None

def get_asjc_raw(classificationgroup):
    try:
        if not classificationgroup or 'classifications' not in classificationgroup: return np.nan
        clist = classificationgroup.get('classifications', [])
        if isinstance(clist, dict): clist = [clist]
        codes = [c.get('classification') for c in clist if c.get('@type') == 'ASJC']
        if not codes: return np.nan
        if len(codes) == 1: return codes[0]
        return str([{"$": c} for c in codes]) 
    except: return np.nan

def translate_asjc(raw_val, mapper):
    if np.all(pd.isna(raw_val)): return np.nan
    s = str(raw_val).strip()
    try:
        if s.startswith('['):
            data = ast.literal_eval(s)
            codes = [i['$'] for i in data if isinstance(i, dict) and '$' in i]
        else:
            codes = [s]
        return ", ".join([mapper.get(c, f"Unknown({c})") for c in codes])
    except: return "Error"

def get_abstract(head):
    try:
        data = head.get('abstracts')
        if isinstance(data, str): return data
        if isinstance(data, dict): return data.get('$', None)
    except: return None

# --- NEW / MODIFIED HELPERS ---

def get_cover_date(source, coredata, process_info):
    """Tries multiple places to find a valid date."""
    try:
        # 1. Try prism:coverDate in coredata (Easiest)
        if coredata and coredata.get('prism:coverDate'):
            return coredata.get('prism:coverDate')

        # 2. Try detailed publicationdate in source
        pd = source.get('publicationdate', {})
        if pd.get('year') and pd.get('month') and pd.get('day'):
            return f"{pd['year']}-{pd['month']}-{pd['day']}"
        
        # 3. Try Sort Date
        sd = process_info.get('ait:date-sort', {})
        if sd.get('@year') and sd.get('@month') and sd.get('@day'):
            return f"{sd['@year']}-{sd['@month']}-{sd['@day']}"
        
        # 4. Fallback
        if pd.get('year'):
            return f"{pd['year']}-{pd.get('month', '01')}-01"
            
        return None
    except: return None

def get_author_keywords(root_data):
    """Extracts keywords from the root authkeywords block."""
    try:
        keywords = root_data.get('authkeywords', {}).get('author-keyword', [])
        if isinstance(keywords, dict): keywords = [keywords]
        return "; ".join([k.get('$', '') for k in keywords if '$' in k])
    except: return None

# --- 3. Main Processing Function ---

def process_folder_full_columns(root_path, output_csv):
    all_data = []
    files_processed = 0
    
    print(f"üöÄ Scanning all files in: {root_path}")

    if not os.path.exists(root_path):
        print("‚ùå Error: Folder path does not exist.")
        return

    for root, dirs, files in os.walk(root_path):
        for fname in tqdm(files, desc=f"Reading {os.path.basename(root)}"):
            if fname.startswith('.') or fname.endswith('.csv'): continue
            
            fpath = os.path.join(root, fname)
            
            try:
                with open(fpath, 'r', encoding='utf-8') as f:
                    content = f.read()
                
                data = json.loads(content)
                
                # --- NAVIGATE JSON STRUCTURE ---
                root_response = data.get('abstracts-retrieval-response', {})
                item = root_response.get('item', {})
                coredata = root_response.get('coredata', {}) # New: Need this for description/aggType
                
                if not item: continue
                
                bib = item.get('bibrecord', {})
                head = bib.get('head', {})
                source = head.get('source', {})
                info = bib.get('item-info', {})
                
                # --- EXTRACTION ---
                # 1. Standard Fields
                ct = head.get('citation-title')
                chapter_title = ct.get('titletext') if isinstance(ct, dict) else ct
                
                doi = info.get('itemidlist', {}).get('ce:doi')
                scopus_id = get_scopus_id(info.get('itemidlist', {}))
                
                pub_year = source.get('publicationyear', {}).get('@first')
                book_title = source.get('sourcetitle')
                publisher = source.get('publisher', {}).get('publishername')
                
                authors = get_authors(head.get('author-group', []))
                affiliation = get_affiliations(head.get('author-group', []))
                
                asjc_raw = get_asjc_raw(head.get('enhancement', {}).get('classificationgroup'))
                asjc_trans = translate_asjc(asjc_raw, ASJC_MAP)
                
                abstract = get_abstract(head)
                
                # 2. New / Renamed Fields
                cover_date = get_cover_date(source, coredata, item.get('ait:process-info', {}))
                description = coredata.get('dc:description')
                agg_type = coredata.get('prism:aggregationType')
                auth_keywords = get_author_keywords(root_response)
                
                # Get refcount safely
                ref_count = bib.get('tail', {}).get('bibliography', {}).get('@refcount')

                # Build Row
                row = {
                    "file_name": fname,
                    "chapter_title": chapter_title,
                    "doi": doi,
                    "scopus_id": scopus_id,
                    "publication_year": pub_year,
                    "book_title": book_title,
                    "publisher": publisher,
                    "authors": authors,
                    "affiliation": affiliation,
                    "ASJC": asjc_raw,
                    "ASJC_translation": asjc_trans,
                    "abstract": abstract,
                    
                    # --- REQUESTED ADDITIONS ---
                    "cover_date": cover_date,        # Renamed from full_date
                    "description": description,      # New
                    "aggregation_type": agg_type,    # New
                    "author_keywords": auth_keywords,# New
                    "reference_count": ref_count     # New
                }
                all_data.append(row)
                files_processed += 1

            except json.JSONDecodeError:
                pass 
            except Exception:
                pass 

    # --- Save Output ---
    if all_data:
        df = pd.DataFrame(all_data)
        
        # Columns ordered logically
        cols = [
            "file_name", "chapter_title", "doi", "scopus_id", 
            "publication_year", "cover_date", 
            "book_title", "publisher", "aggregation_type",
            "authors", "affiliation", 
            "abstract", "description", "author_keywords",
            "ASJC", "ASJC_translation", "reference_count"
        ]
        # Only keep columns that actually exist
        cols = [c for c in cols if c in df.columns]
        df = df[cols]
        
        df.to_csv(output_csv, index=False)
        print(f"\n‚úÖ Done! Processed {files_processed} valid papers.")
        print(f"üíæ Data saved to: {output_csv}")
        print(df.head(2))
    else:
        print("\n‚ùå No valid Scopus JSON data found.")

# --- 4. RUN IT ---
target_folder = "C:/work/data sci/Project-Data-Sci/ScopusData2018-2023/"
output_filename = "scopus_full_data.csv"

process_folder_full_columns(target_folder, output_filename)

üöÄ Scanning all files in: C:/work/data sci/Project-Data-Sci/ScopusData2018-2023/
‚ùå Error: Folder path does not exist.


In [25]:
df = pd.read_csv("scopus_full_data_v2.csv")
df

Unnamed: 0,file_name,chapter_title,doi,scopus_id,publication_year,cover_date,book_title,publisher,aggregation_type,authors,affiliation,abstract,description,author_keywords,ASJC,ASJC_translation,reference_count
0,201800000,Public health and international epidemiology f...,10.1007/978-3-319-98485-8_15,85077976956,2018,2018-12-31,"Radiology in Global Health: Strategies, Implem...",Springer International Publishing,Book,Pongpirul K.; Lungren M.P.,"Department of Radiology, Stanford University S...",,,,2700,Medicine,76
1,201800001,Flexible Printed Active Antenna for Digital Te...,10.23919/PIERS.2018.8597669,85060936020,2018,2018-12-31,Progress in Electromagnetics Research Symposium,Institute of Electrical and Electronics Engine...,Conference Proceeding,Pratumsiri T.; Janpugdee P.,"Department of Electrical Engineering, Wireless...","¬© 2018 The Institute of Electronics, Informati...",This paper presents the development of a flexi...,,"[{'$': '2208'}, {'$': '2504'}]","Electrical and Electronic Engineering, Materia...",4
2,201800002,Parametric study of hydrogen production via so...,10.1016/j.ces.2018.08.042,85052201238,2018,2018-12-31,Chemical Engineering Science,Elsevier Ltd,Journal,Phuakpunk K.; Assabumrungrat S.; Chalermsinsuw...,"Fuels Research Center, Department of Chemical ...",¬© 2018 Elsevier LtdComputational fluid dynamic...,Computational fluid dynamics was applied for s...,Circulating fluidized bed; Computational fluid...,"[{'$': '1600'}, {'$': '1500'}, {'$': '2209'}]","Chemistry, Chemical Engineering, Industrial an...",42
3,201800003,Superhydrophobic coating from fluoroalkylsilan...,10.1016/j.apsusc.2018.08.059,85051498032,2018,2018-12-31,Applied Surface Science,Elsevier B.V.,Journal,Saengkaew J.; Le D.; Samart C.; Kongparakul S....,"FRST, Academy of Science, Office of the Royal ...",¬© 2018 Elsevier B.V. A superhydrophobic/supero...,A superhydrophobic/superoleophilic mesh was su...,Encapsulation; Fluoroalkylsilane; Natural rubb...,"[{'$': '1600'}, {'$': '3104'}, {'$': '3100'}, ...","Chemistry, Condensed Matter Physics, Physics a...",45
4,201800004,Electrochemical impedance-based DNA sensor usi...,10.1016/j.aca.2018.07.045,85050678366,2018,2018-12-31,Analytica Chimica Acta,Elsevier B.V.,Journal,Teengam P.; Siangproh W.; Tuantranont A.; Vila...,"Organic Synthesis Research Unit, Department of...",¬© 2018 Elsevier B.V. A label-free electrochemi...,A label-free electrochemical DNA sensor based ...,acpcPNA; Electrochemical impedance spectroscop...,"[{'$': '1602'}, {'$': '1303'}, {'$': '2304'}, ...","Analytical Chemistry, Biochemistry, Environmen...",55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19800,202302885,Long-chain bio-olefins production via oxidativ...,10.1016/j.cattod.2021.07.034,85111945558,2023,2023-01-01,Catalysis Today,Elsevier B.V.,Journal,Le D.; Hinchiranan N.; Chaidherasuwet N.; Ruea...,"Program in Petrochemistry and Polymer Science,...",¬© 2021 Elsevier B.V.Long-chain Œ±-olefins (‚â• C1...,Long-chain Œ±-olefins (‚â• C10) are normally appl...,Long-chain olefins; Mesoporous KIT-6; Oleic ac...,"[{'$': '1503'}, {'$': '1600'}]","Unknown(1503), Chemistry",63
19801,202302886,Recent Developments and Applications of Microf...,10.1080/10408347.2021.1949695,85111408415,2023,2023-01-01,Critical Reviews in Analytical Chemistry,Taylor and Francis Ltd.,Journal,Alahmad W.; Varanusupakul P.; Varanusupakul P.,"Department of Chemistry, Faculty of Science, C...","¬© 2021 Taylor & Francis Group, LLC.Nowadays, f...","Nowadays, food safety has become a major conce...",Biological hazards; chemical hazards; food con...,1602,Analytical Chemistry,115
19802,202302887,"Social justice, education and peacebuilding: c...",10.1080/03057925.2021.1951666,85110903700,2023,2023-01-01,Compare,Routledge,Journal,Pherali T.,Centre for Education and International Develop...,¬© 2021 The Author(s). Published by Informa UK ...,Education is increasingly becoming central to ...,conflict; Education; peacebuilding; social jus...,3304,Unknown(3304),76
19803,202302888,Effects of black soldier fly (Hermetia illucen...,10.1080/10454438.2021.1923609,85106740832,2023,2023-01-01,Journal of Applied Aquaculture,Taylor and Francis Ltd.,Journal,Mapanao R.; Jiwyam W.; Nithikulworawong N.; We...,"Program of Fisheries Science, Faculty of Inter...",¬© 2021 Taylor & Francis.The effects of replaci...,The effects of replacing fish meal protein wit...,Anabas testudineus; Black soldier fly; fish me...,"[{'$': '2303'}, {'$': '1104'}]","Unknown(2303), Unknown(1104)",44


In [26]:
cols = [
    "chapter_title",
    "abstract",
    "author_keywords",
    "description",
    "book_title",
    "publication_year",
    "ASJC_translation",
]

df_clean = df[cols].copy()

text_cols = [
    "chapter_title",
    "abstract",
    "author_keywords",
    "description",
    "book_title"
]

# Fill missing text
df_clean[text_cols] = df_clean[text_cols].fillna("")

# Fill numbers
df_clean["publication_year"] = df_clean["publication_year"].fillna(-1)

# Remove rows with all empty text
df_clean = df_clean[
    (df_clean["chapter_title"] != "") |
    (df_clean["abstract"] != "") |
    (df_clean["author_keywords"] != "")
]

# Remove duplicates
df_clean = df_clean.drop_duplicates(subset=["chapter_title", "abstract"])

import re

def remove_copyright_year(text):
    if not isinstance(text, str):
        return text
    
    # Remove pattern like: "¬© 2021", "¬©2021", "¬© 1999"
    return re.sub(r'^¬©\s*\d{4}\s*', '', text)

df_clean["abstract_clean"] = df_clean["abstract"].apply(remove_copyright_year)
df_clean.drop(columns=["abstract"], inplace=True)

df_clean.dropna(columns=["description"],inplace=True)

TypeError: DataFrame.dropna() got an unexpected keyword argument 'columns'

In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import re

class TrendingTopicModel:
    def __init__(self, dataframe):
        self.df = dataframe
        # Fill NaN values in text columns to avoid errors
        self.df['abstract'] = self.df['abstract'].fillna('')
        self.df['chapter_title'] = self.df['chapter_title'].fillna('')
        self.df['author_keywords'] = self.df['author_keywords'].fillna('')
        self.df['ASJC_translation'] = self.df['ASJC_translation'].fillna('')
        
    def clean_text(self, text):
        """Basic text cleaning: lowercase, remove special chars"""
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text) # Keep only letters and spaces
        return text

    def get_trending_topics(self, category_keyword, start_year, end_year, n_topics=5):
        """
        Identifies trending topics for a specific category and time period.
        
        Args:
            category_keyword (str): e.g., "Medicine", "Engineering"
            start_year (int): e.g., 2019
            end_year (int): e.g., 2021
            n_topics (int): Number of topics to discover
            
        Returns:
            List of dictionaries containing topic keywords and their 'strength'
        """
        # 1. Filter Data by Year and Category
        mask_year = (self.df['publication_year'] >= start_year) & (self.df['publication_year'] <= end_year)
        mask_category = self.df['ASJC_translation'].str.contains(category_keyword, case=False, na=False)
        
        subset_df = self.df[mask_year & mask_category].copy()
        
        if len(subset_df) < n_topics:
            return f"Not enough data found for {category_keyword} in {start_year}-{end_year}. Found {len(subset_df)} records."

        # 2. Prepare Text Data (Combine Title + Abstract + Keywords)
        # Weighting: Keywords * 3, Title * 2, Abstract * 1
        subset_df['combined_text'] = (
            (subset_df['chapter_title'] + " ") * 2 + 
            (subset_df['author_keywords'] + " ") * 3 + 
            subset_df['abstract']
        )
        
        # Clean text
        subset_df['clean_text'] = subset_df['combined_text'].apply(self.clean_text)

        # 3. Vectorization (TF-IDF)
        # Ignore terms that appear in >95% of docs (max_df) or <2 docs (min_df)
        tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        tfidf = tfidf_vectorizer.fit_transform(subset_df['clean_text'])
        
        if tfidf.shape[1] == 0:
            return "No valid keywords found after cleaning."

        # 4. Run NMF Model (Topic Modeling)
        nmf_model = NMF(n_components=n_topics, random_state=42, init='nndsvd')
        nmf_model.fit(tfidf)

        # 5. Extract Top Words for Each Topic
        feature_names = tfidf_vectorizer.get_feature_names_out()
        topics = []
        
        for topic_idx, topic in enumerate(nmf_model.components_):
            # Get top 10 words for this topic
            top_words_idx = topic.argsort()[:-11:-1]
            top_words = [feature_names[i] for i in top_words_idx]
            
            # Calculate a 'strength' score (sum of weights for these top words)
            strength = topic[top_words_idx].sum()
            
            topics.append({
                "Topic ID": topic_idx + 1,
                "Keywords": ", ".join(top_words),
                "Score": round(strength, 2)
            })
            
        return topics

# --- üöÄ USAGE EXAMPLE ---

# 1. Load your dataset (assuming it's loaded as 'df')
# df = pd.read_csv("your_full_data_2018_2023.csv")

# NOTE: For this demo, I'll use a dummy DataFrame structure based on your request
# In your real code, just use your loaded 'df'
if __name__ == "__main__":
    # creating a dummy dataframe to demonstrate
    data = {
        'publication_year': [2019, 2020, 2021, 2021, 2020],
        'ASJC_translation': ['Medicine', 'Medicine', 'Computer Science', 'Medicine', 'Medicine'],
        'chapter_title': ['Deep learning in radiology', 'AI for cancer detection', 'Neural networks', 'Covid-19 lung imaging', 'Machine learning for health'],
        'abstract': ['Using CNN for X-rays', 'Detecting tumors with AI', 'Deep layers', 'Pandemic virus imaging', 'Health data analysis'],
        'author_keywords': ['AI, Radiology', 'Cancer, AI', 'Deep Learning', 'Covid-19, Virus', 'ML, Health']
    }
    df_demo = pd.DataFrame(data)

    # 2. Initialize the Model
    topic_model = TrendingTopicModel(df_demo)

    # 3. User Input (Simulated)
    user_category = "Medicine"
    user_start_year = 2019
    user_end_year = 2021

    # 4. Get Predictions
    print(f"--- Predicting Trending Topics for '{user_category}' ({user_start_year}-{user_end_year}) ---")
    results = topic_model.get_trending_topics(user_category, user_start_year, user_end_year, n_topics=2)

    if isinstance(results, list):
        for t in results:
            print(f"Topic {t['Topic ID']}: {t['Keywords']}")
    else:
        print(results)

--- Predicting Trending Topics for 'Medicine' (2019-2021) ---
Topic 1: ai, learning
Topic 2: learning, ai


In [12]:
!pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.187.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.43.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core->google-generativeai)
  Downloading googleapis_common_protos-1.72.0-py3-none-any.whl.met

In [16]:
import pandas as pd
import numpy as np
import re
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

class TrendingTopicModel:
    def __init__(self, dataframe, api_key=None):
        self.df = dataframe
        # Fill NaN values
        self.df['abstract'] = self.df['abstract'].fillna('')
        self.df['chapter_title'] = self.df['chapter_title'].fillna('')
        self.df['author_keywords'] = self.df['author_keywords'].fillna('')
        self.df['ASJC_translation'] = self.df['ASJC_translation'].fillna('')
        
        # Configure Google API if key is provided
        if api_key:
            genai.configure(api_key=api_key)
            self.model = genai.GenerativeModel('gemini-pro')
            self.has_api = True
        else:
            self.has_api = False
            print("Warning: No Google API Key provided. Topics will only show keywords.")

    def clean_text(self, text):
        if not isinstance(text, str): return ""
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        return text

    def generate_topic_label(self, keywords):
        """
        Uses Google Gemini to convert keywords into a descriptive sentence.
        """
        if not self.has_api:
            return keywords  # Fallback if no API key

        prompt = (
            f"I have a list of keywords extracted from research papers: '{keywords}'. "
            "Generate a short, specific, and professional topic title (one sentence) "
            "that summarizes what these papers are about. Do not use quotes."
        )
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating label: {e}"

    def get_trending_topics(self, category_keyword, start_year, end_year, n_topics=5):
        # 1. Filter Data
        mask_year = (self.df['publication_year'] >= start_year) & (self.df['publication_year'] <= end_year)
        mask_category = self.df['ASJC_translation'].str.contains(category_keyword, case=False, na=False)
        subset_df = self.df[mask_year & mask_category].copy()
        
        if len(subset_df) < n_topics:
            return []

        # 2. Prepare Text
        subset_df['combined_text'] = (
            (subset_df['chapter_title'] + " ") * 2 + 
            (subset_df['author_keywords'] + " ") * 3 + 
            subset_df['abstract']
        )
        subset_df['clean_text'] = subset_df['combined_text'].apply(self.clean_text)

        # 3. Vectorization & NMF
        tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        tfidf = tfidf_vectorizer.fit_transform(subset_df['clean_text'])
        
        if tfidf.shape[1] == 0: return []

        nmf_model = NMF(n_components=n_topics, random_state=42, init='nndsvd')
        nmf_model.fit(tfidf)

        # 4. Extract Keywords & Generate Sentences
        feature_names = tfidf_vectorizer.get_feature_names_out()
        topics = []
        
        for topic_idx, topic in enumerate(nmf_model.components_):
            top_words_idx = topic.argsort()[:-11:-1]
            top_words = [feature_names[i] for i in top_words_idx]
            keywords_str = ", ".join(top_words)
            
            # --- CALL GOOGLE API HERE ---
            descriptive_label = self.generate_topic_label(keywords_str)
            
            topics.append({
                "Topic ID": topic_idx + 1,
                "Keywords": keywords_str,
                "Label": descriptive_label, # This is the generated sentence
                "Score": round(topic[top_words_idx].sum(), 2)
            })
            
        return topics

# --- üöÄ USAGE EXAMPLE ---
if __name__ == "__main__":
    # 1. Load Data
    # df = pd.read_csv("extracted_publications_with_translation.csv")
    
    # 2. YOUR GOOGLE API KEY
    my_api_key = "AIzaSyAkTLSSVNW3LwQFUJIzYdRQ4sXX0adl4dI"

    # 3. Run Model
    model = TrendingTopicModel(df, api_key=my_api_key)
    results = model.get_trending_topics("Medicine", 2019, 2021, n_topics=3)
    
    for t in results:
       print(f"TOPIC: {t['Label']}")
       print(f"KEYWORDS: {t['Keywords']}\n")

TOPIC: Error generating label: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
KEYWORDS: patients, cancer, disease, kidney, treatment, health, care, sleep, study, group

TOPIC: Error generating label: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
KEYWORDS: anesthesia, adverse, perioperative, events, incident, anesthetic, paad, incidents, intubation, reports

TOPIC: Error generating label: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
KEYWORDS: heart, failure, hfct, council, guideline, atrial, fibrillation, thailand, ischaemic, hf



In [24]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

class TrendGeneratorAI:
    def __init__(self, dataframe):
        self.df = dataframe.copy()
        # Pre-fill NaNs to prevent errors
        text_cols = ['chapter_title', 'abstract', 'author_keywords', 'ASJC_translation']
        for col in text_cols:
            if col in self.df.columns:
                self.df[col] = self.df[col].fillna('')

    def clean_text(self, text):
        """Basic text cleaning to help the AI understand."""
        if not isinstance(text, str): 
            return ""
        # Lowercase and remove special characters
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text) 
        return text

    def generate_trends(self, category_input, start_year, end_year):
        """
        AI Logic:
        1. Filters data by user input.
        2. Vectorizes text (TF-IDF).
        3. Uses NMF (Generative Topic Modeling) to extract top 3 themes.
        """
        print(f"\n--- ü§ñ AI Status: Initializing scan for '{category_input}' ({start_year}-{end_year})... ---")

        # 1. FILTER DATA
        # We use str.contains because a paper might belong to multiple categories
        mask_category = self.df['ASJC_translation'].str.contains(category_input, case=False, na=False)
        mask_year = (self.df['publication_year'] >= start_year) & (self.df['publication_year'] <= end_year)
        
        subset = self.df[mask_category & mask_year].copy()
        
        # Check if we have enough data
        if len(subset) < 5:
            return [f"‚ö†Ô∏è Not enough data ({len(subset)} papers) to generate trends. Try a wider year range."]

        print(f"--- üìö AI Status: Analyzing {len(subset)} publications... ---")

        # 2. PREPARE TEXT FOR AI
        # We combine Title (x3 weight), Keywords (x2 weight), and Abstract
        # This teaches the AI that Titles are the most important part of the trend.
        subset['ai_text'] = (
            (subset['chapter_title'] + " ") * 3 + 
            (subset['author_keywords'] + " ") * 2 + 
            subset['abstract']
        ).apply(self.clean_text)

        # 3. TF-IDF VECTORIZATION (Math translation of words)
        # ignore words that appear in >95% of docs (too common) or <2 docs (too rare)
        tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        
        try:
            tfidf = tfidf_vectorizer.fit_transform(subset['ai_text'])
        except ValueError:
            return ["‚ö†Ô∏è Text data is too sparse to generate trends."]

        # 4. NMF MODEL (The 'Generative' Engine)
        # We ask the model to generate exactly 3 latent topics
        n_topics = 3
        nmf_model = NMF(n_components=n_topics, random_state=42, init='nndsvd')
        nmf_model.fit(tfidf)

        # 5. DECODE OUTPUT
        feature_names = tfidf_vectorizer.get_feature_names_out()
        trends = []

        for topic_idx, topic in enumerate(nmf_model.components_):
            # Get the top 4 words that define this trend
            top_indices = topic.argsort()[:-5:-1]
            top_words = [feature_names[i] for i in top_indices]
            
            # Format nicely as a "Title Trend"
            trend_title = " ".join([word.capitalize() for word in top_words])
            trends.append(f"Trend {topic_idx + 1}: {trend_title}")

        return trends

# ==========================================
# üöÄ MAIN EXECUTION BLOCK
# ==========================================

# 1. Load your DataFrame (assuming 'df' is already loaded in your environment)
# If not, uncomment the line below:
# df = pd.read_csv('extracted_publications_with_translation.csv')

# 2. Initialize the AI Engine
ai_engine = TrendGeneratorAI(df)

# 3. USER INPUT SIMULATION
# In a real app, these would come from input boxes
user_category = input("Enter Category (e.g., Medicine, Engineering, Chemistry): ")
try:
    user_start_year = int(input("Enter Start Year (e.g., 2018): "))
    user_end_year = int(input("Enter End Year (e.g., 2021): "))
except ValueError:
    print("Please enter valid numbers for years.")
    user_start_year = 2018
    user_end_year = 2023

# 4. RUN GENERATION
generated_results = ai_engine.generate_trends(user_category, user_start_year, user_end_year)

# 5. DISPLAY RESULTS
print("\n" + "="*40)
print(f"üî• GENERATED TITLE TRENDS ({user_category})")
print("="*40)
for result in generated_results:
    print(result)
print("="*40)


--- ü§ñ AI Status: Initializing scan for 'Medicine' (2016-2018)... ---
--- üìö AI Status: Analyzing 84 publications... ---

üî• GENERATED TITLE TRENDS (Medicine)
Trend 1: Adverse Events Incidents Perioperative
Trend 2: Patients Mortality Clinical Study
Trend 3: Pain Version Thai Reliability


In [None]:
!pip install sentence-transformers


^C


Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.9.1-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-1.1.7-py3-none-any.whl.metadata (13 kB)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Using cached transfo

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

df['search_text'] = (
    df['chapter_title'] * 3 + " " +
    df['author_keywords'] * 2 + " " +
    df['abstract']
)

embeddings = model.encode(df['search_text'].tolist(), show_progress_bar=True)
np.save("paper_embeddings.npy", embeddings)


ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search_papers(query, start_year, end_year, top_k=5):

    # filter by year
    df_filtered = df[
        (df['publication_year'] >= start_year) &
        (df['publication_year'] <= end_year)
    ].reset_index(drop=True)

    # embed query
    query_vec = model.encode(query)

    # compute similarity
    filtered_embeddings = embeddings[df_filtered.index]
    scores = cosine_similarity([query_vec], filtered_embeddings)[0]

    # top 20 (before LLM refinement)
    top_idx = scores.argsort()[-20:][::-1]

    results = df_filtered.iloc[top_idx].copy()
    results['score'] = scores[top_idx]

    return results
