In [15]:
import pandas as pd
import numpy as np
import re
import os
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer

# Configuration
INPUT_FILE = 'scopus_full_data_v2.csv'  # Change this to your actual file path
OUTPUT_DIR = 'output/'
MODEL_NAME = 'all-MiniLM-L6-v2' # Excellent balance of speed/performance for semantic search

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"‚úÖ Setup complete. Target device for AI: {'GPU' if pd.Series([1]).dtype == 'int' else 'CPU'}")
# Note: sentence_transformers automatically detects CUDA (GPU) if available.

‚úÖ Setup complete. Target device for AI: GPU


In [16]:
# Load the dataset
try:
    df = pd.read_csv(INPUT_FILE, on_bad_lines='skip')
    print(f"‚úÖ Data loaded successfully. Shape: {df.shape}")
except FileNotFoundError:
    print("‚ùå File not found. Please check the INPUT_FILE path.")

# Display structure
df.head(3)

‚úÖ Data loaded successfully. Shape: (19805, 17)


Unnamed: 0,file_name,chapter_title,doi,scopus_id,publication_year,cover_date,book_title,publisher,aggregation_type,authors,affiliation,abstract,description,author_keywords,ASJC,ASJC_translation,reference_count
0,201800000,Public health and international epidemiology for radiology,10.1007/978-3-319-98485-8_15,85077976956,2018,2018-12-31,"Radiology in Global Health: Strategies, Implementation, and Applications",Springer International Publishing,Book,Pongpirul K.; Lungren M.P.,"Department of Radiology, Stanford University School of Medicine; Bumrungrad International Hospit...",,,,2700,Medicine,76
1,201800001,Flexible Printed Active Antenna for Digital Television Reception,10.23919/PIERS.2018.8597669,85060936020,2018,2018-12-31,Progress in Electromagnetics Research Symposium,Institute of Electrical and Electronics Engineers Inc.,Conference Proceeding,Pratumsiri T.; Janpugdee P.,"Department of Electrical Engineering, Wireless Network and Future Internet Research Unit, Chulal...","¬© 2018 The Institute of Electronics, Information and Communication Engineers (IEICE).This paper ...",This paper presents the development of a flexible printed active antenna for the digital televis...,,"[{'$': '2208'}, {'$': '2504'}]","Electrical and Electronic Engineering, Materials Chemistry",4
2,201800002,Parametric study of hydrogen production via sorption enhanced steam methane reforming in a circu...,10.1016/j.ces.2018.08.042,85052201238,2018,2018-12-31,Chemical Engineering Science,Elsevier Ltd,Journal,Phuakpunk K.; Assabumrungrat S.; Chalermsinsuwan B.; Putivisutisak S.,"Fuels Research Center, Department of Chemical Technology, Faculty of Science, Chulalongkorn Univ...",¬© 2018 Elsevier LtdComputational fluid dynamics was applied for sorption enhanced steam methane ...,Computational fluid dynamics was applied for sorption enhanced steam methane reforming (SESMR) o...,Circulating fluidized bed; Computational fluid dynamics; Multiphase flow models; Riser; Sorption...,"[{'$': '1600'}, {'$': '1500'}, {'$': '2209'}]","Chemistry, Chemical Engineering, Industrial and Manufacturing Engineering",42


In [17]:
# Inspect columns and missing values to plan our cleaning strategy
print("--- Missing Values ---")
print(df.isnull().sum())

print("\n--- Column Types ---")
print(df.dtypes)

--- Missing Values ---
file_name              0
chapter_title          1
doi                 1135
scopus_id              0
publication_year       0
cover_date             0
book_title             0
publisher              5
aggregation_type       0
authors                0
affiliation           12
abstract             529
description          529
author_keywords     3467
ASJC                   0
ASJC_translation       0
reference_count        0
dtype: int64

--- Column Types ---
file_name            int64
chapter_title       object
doi                 object
scopus_id            int64
publication_year     int64
cover_date          object
book_title          object
publisher           object
aggregation_type    object
authors             object
affiliation         object
abstract            object
description         object
author_keywords     object
ASJC                object
ASJC_translation    object
reference_count      int64
dtype: object


In [18]:
def clean_abstract(text):
    """
    Removes copyright headers and common artifacts from Scopus abstracts.
    Example: '¬© 2018 IEEE. This paper presents...' -> 'This paper presents...'
    """
    if pd.isna(text) or text == "":
        return ""
    
    text = str(text)
    
    # Pattern 1: Remove ¬© [Year] [Publisher]. (e.g., ¬© 2018 Elsevier B.V.)
    # We look for the copyright symbol, followed by chars, until a period that isn't a decimal.
    text = re.sub(r'^¬© \d{4}.*?\.(?=\s*[A-Z])', '', text)
    
    # Pattern 2: Remove explicit "All rights reserved."
    text = re.sub(r'(?i)all rights reserved\.?', '', text)
    
    # Normalize whitespace (replace multiple spaces/newlines with single space)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def clean_authors(authors):
    """Clean author lists to be readable lists."""
    if pd.isna(authors):
        return "Unknown"
    return str(authors).replace(';', ',')

# --- Apply Cleaning ---

# 1. Create a stable ID (prefer Scopus ID, fallback to DOI or Index)
df['doc_id'] = df['scopus_id'].fillna(df['doi']).fillna(df.index.to_series().astype(str))

# 2. Clean Text Fields
print("üßπ Cleaning abstracts and metadata...")
# ... (Previous cleaning functions remain the same) ...

# 1. Inspect Abstract vs Description
# Scopus sometimes puts the text in 'abstract' and sometimes in 'description'.
# We should coalesce them: take abstract; if missing, take description.
df['final_abstract'] = df['abstract'].fillna(df['description']).fillna('')

# 2. Apply cleaning to this new combined field
df['clean_abstract'] = df['final_abstract'].apply(clean_abstract)
df['clean_title'] = df['chapter_title'].fillna("Untitled Document")

# --- STRICT FILTERING ---
# We calculate the length of the cleaned abstract
df['abstract_len'] = df['clean_abstract'].str.len()

# Define a threshold. 
# If an abstract is less than 50 characters, it's likely "No abstract available" or garbage text.
MIN_ABSTRACT_LENGTH = 50 

print(f"üìâ Initial Row Count: {len(df)}")

# Filter: Keep only rows where Abstract is long enough
df_clean = df[df['abstract_len'] > MIN_ABSTRACT_LENGTH].copy()

print(f"‚úÖ Final Row Count (Strict Cleaning): {len(df_clean)}")
print(f"üóëÔ∏è Dropped {len(df) - len(df_clean)} rows due to missing/short descriptions.")

# Update the main dataframe variable
df = df_clean

üßπ Cleaning abstracts and metadata...
üìâ Initial Row Count: 19805
‚úÖ Final Row Count (Strict Cleaning): 19276
üóëÔ∏è Dropped 529 rows due to missing/short descriptions.


In [19]:
def create_combined_text(row):
    # Extract keywords, handling NaNs
    keywords = str(row['author_keywords']) if not pd.isna(row['author_keywords']) else ""
    
    # Construct the semantic blob
    # We put the title twice explicitly if the abstract is missing to give it weight, 
    # but standard practice is just Title + Abstract.
    text_blob = f"Title: {row['clean_title']}. Abstract: {row['clean_abstract']}"
    
    if keywords:
        text_blob += f" Keywords: {keywords}"
        
    return text_blob

df['combined_text'] = df.apply(create_combined_text, axis=1)

print(f"üìù Combined text field created. Average length: {df['combined_text'].str.len().mean():.0f} characters.")

üìù Combined text field created. Average length: 1619 characters.


In [24]:
import os
import numpy as np
import glob
# --- CORRECTION ICI : On utilise tqdm standard pour √©viter l'erreur IProgress ---
from tqdm import tqdm 

# --- Configuration ---
CHECKPOINT_DIR = "output/temp_embeddings"  
FINAL_EMBEDDING_FILE = "output/articles_embeddings.npy"
BATCH_SIZE = 32

# Cr√©er le dossier de checkpoint s'il n'existe pas
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# --- 1. Initialisation ---
print(f"ü§ñ Loading model: {MODEL_NAME}...")
model = SentenceTransformer(MODEL_NAME)

sentences = df['combined_text'].tolist()
total_sentences = len(sentences)

# Calcul du nombre total de batches
num_batches = int(np.ceil(total_sentences / BATCH_SIZE))

print(f"üîÑ Pr√©paration : {total_sentences} documents r√©partis en {num_batches} batches.")

# --- 2. Boucle de g√©n√©ration avec reprise ---
print("üöÄ D√©but/Reprise de la g√©n√©ration des embeddings...")

# On utilise tqdm() standard ici
for i in tqdm(range(num_batches), desc="Processing Batches"):
    # D√©finir les indices de d√©but et de fin pour ce lot
    start_idx = i * BATCH_SIZE
    end_idx = min((i + 1) * BATCH_SIZE, total_sentences)
    
    # Nom du fichier pour ce batch
    batch_file = os.path.join(CHECKPOINT_DIR, f"batch_{i:05d}.npy")
    
    # V√âRIFICATION : Si le fichier existe d√©j√†, on le saute
    if os.path.exists(batch_file):
        continue
        
    # Sinon, on calcule
    batch_sentences = sentences[start_idx:end_idx]
    
    # Encodage du lot
    batch_embeddings = model.encode(batch_sentences, show_progress_bar=False)
    
    # Sauvegarde imm√©diate
    np.save(batch_file, batch_embeddings)

print("‚úÖ Tous les batches sont calcul√©s et sauvegard√©s sur le disque.")

# --- 3. Assemblage final ---
print("üì¶ Assemblage du fichier final...")

batch_files = sorted(glob.glob(os.path.join(CHECKPOINT_DIR, "batch_*.npy")))
all_embeddings_list = [np.load(f) for f in batch_files]
final_embeddings = np.vstack(all_embeddings_list)

np.save(FINAL_EMBEDDING_FILE, final_embeddings)

print(f"üéâ Termin√© ! Embeddings complets sauvegard√©s dans : {FINAL_EMBEDDING_FILE}")
print(f"Shape finale : {final_embeddings.shape}")

ü§ñ Loading model: all-MiniLM-L6-v2...
üîÑ Pr√©paration : 19276 documents r√©partis en 603 batches.
üöÄ D√©but/Reprise de la g√©n√©ration des embeddings...



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

‚úÖ Tous les batches sont calcul√©s et sauvegard√©s sur le disque.
üì¶ Assemblage du fichier final...
üéâ Termin√© ! Embeddings complets sauvegard√©s dans : output/articles_embeddings.npy
Shape finale : (19276, 384)


In [26]:
import os

# 1. Prepare Metadata DataFrame
# Select only fields useful for the UI/Filtering
meta_cols = [
    'doc_id', 'clean_title', 'clean_abstract', 'clean_authors', 
    'publication_year', 'doi', 'affiliation', 'ASJC_translation', 'cover_date'
]

# Ensure columns exist before selecting (handle potential missing cols in source)
existing_cols = [c for c in meta_cols if c in df.columns]
metadata_df = df[existing_cols].copy()

# Rename for clarity in the UI app later
metadata_df.rename(columns={
    'clean_title': 'title',
    'clean_abstract': 'abstract',
    'clean_authors': 'authors',
    'ASJC_translation': 'category'
}, inplace=True)

# 2. Save Files
meta_path = os.path.join(OUTPUT_DIR, 'articles_metadata.parquet')
embed_path = os.path.join(OUTPUT_DIR, 'articles_embeddings.npy')

# Save Metadata (Critical step for the UI)
metadata_df.to_parquet(meta_path, index=False)

# Save Embeddings
# NOTE: La cellule pr√©c√©dente l'a peut-√™tre d√©j√† fait, mais on s'assure ici 
# que tout est synchro. On utilise 'final_embeddings' qui est le r√©sultat de ton calcul.
if 'final_embeddings' in locals():
    np.save(embed_path, final_embeddings)
else:
    print("‚ö†Ô∏è Attention : La variable 'final_embeddings' n'est pas en m√©moire.")
    print("Si tu as d√©j√† le fichier .npy g√©n√©r√© par la cellule pr√©c√©dente, c'est bon.")

print("üíæ Output Saved Successfully:")
print(f"   1. Metadata:   {meta_path}")
print(f"   2. Embeddings: {embed_path}")

üíæ Output Saved Successfully:
   1. Metadata:   output/articles_metadata.parquet
   2. Embeddings: output/articles_embeddings.npy


In [29]:
# --- CELLULE DE R√âPARATION DES AUTEURS ---

print("üîß Diagnostic des colonnes...")

# 1. V√©rifions si la colonne brute existe dans le DataFrame d'origine
if 'authors' in df.columns:
    print(f"‚úÖ La colonne source 'authors' existe (Ex: {str(df['authors'].iloc[0])[:30]}...)")
else:
    print("‚ùå La colonne 'authors' est introuvable dans df. V√©rifie le nom exact (maj/min).")
    # Tentative de retrouver la colonne (parfois 'Authors' avec majuscule)
    possible_cols = [c for c in df.columns if 'author' in c.lower()]
    print(f"   Colonnes similaires trouv√©es : {possible_cols}")

# 2. On force la recr√©ation de la colonne propre
def force_clean_authors(val):
    if pd.isna(val) or val == "":
        return "Unknown Author"
    return str(val).replace(';', ',')

# On applique le nettoyage
if 'authors' in df.columns:
    df['clean_authors'] = df['authors'].apply(force_clean_authors)
    print("‚úÖ Colonne 'clean_authors' r√©g√©n√©r√©e.")

# 3. On met √† jour metadata_df
# On s'assure que 'clean_authors' est bien copi√©e
metadata_df['authors'] = df['clean_authors']

# 4. On sauvegarde √† nouveau pour √™tre s√ªr
meta_path = os.path.join(OUTPUT_DIR, 'articles_metadata.parquet')
metadata_df.to_parquet(meta_path, index=False)
print(f"üíæ M√©tadonn√©es mises √† jour et sauvegard√©es dans : {meta_path}")

# --- 5. RETEST IMM√âDIAT ---
print("\nüîé Retest avec affichage des auteurs :")
search_local_test("generative ai energy efficiency")

üîß Diagnostic des colonnes...
‚úÖ La colonne source 'authors' existe (Ex: Pratumsiri T.; Janpugdee P....)
‚úÖ Colonne 'clean_authors' r√©g√©n√©r√©e.
üíæ M√©tadonn√©es mises √† jour et sauvegard√©es dans : output/articles_metadata.parquet

üîé Retest avec affichage des auteurs :

üîé Query: 'generative ai energy efficiency'
--------------------------------------------------
Score: 0.5035 | Year: 2023
Title: Future Distribution Power Flow Scenario Generation Method Using Generative Adversarial Network Considering Correlation Between DERs
Authors: Ichinomiya H., Kawabe K., Chaitusaney S....

Score: 0.4728 | Year: 2019
Title: Generating images with desired properties using the discogan model enhanced with repeated property construction
Authors: Angsarawanee T., Kijsirikul B....

Score: 0.4373 | Year: 2023
Title: A systematic and critical review on effective utilization of artificial intelligence for bio-diesel production techniques
Authors: Ahmad J., Ngamcharussrivichai C., Awais M., 

In [28]:
def search_local_test(query, top_k=3):
    """
    Fonction simple pour tester la recherche dans le notebook.
    Utilise 'final_embeddings' et g√®re les noms de colonnes de mani√®re robuste.
    """
    # 1. Encode the query
    # Le mod√®le est d√©j√† charg√© en m√©moire (model)
    query_vec = model.encode([query])
    
    # 2. Compute Cosine Similarity
    # CORRECTION ICI : On utilise 'final_embeddings' qui contient tous nos vecteurs assembl√©s
    scores = np.dot(final_embeddings, query_vec.T).flatten()
    
    # 3. Get Top K indices
    top_indices = np.argsort(scores)[::-1][:top_k]
    
    # 4. Display results
    print(f"\nüîé Query: '{query}'")
    print("-" * 50)
    
    for idx in top_indices:
        row = metadata_df.iloc[idx]
        score = scores[idx]
        
        # --- GESTION ROBUSTE DES COLONNES ---
        # On essaie de r√©cup√©rer 'title', sinon on cherche 'clean_title', sinon 'chapter_title'
        title = row.get('title', row.get('clean_title', row.get('chapter_title', 'Untitled')))
        
        # On essaie de r√©cup√©rer 'authors', sinon 'clean_authors', sinon 'Unknown'
        authors = row.get('authors', row.get('clean_authors', 'Unknown'))
        
        # On g√®re l'ann√©e
        year = row.get('publication_year', 'N/A')

        print(f"Score: {score:.4f} | Year: {year}")
        print(f"Title: {title}")
        # On coupe la liste des auteurs si elle est trop longue pour l'affichage
        print(f"Authors: {str(authors)[:80]}...") 
        print("")

# --- Run Test Cases ---
# Note : Assure-toi que 'final_embeddings' et 'metadata_df' sont bien en m√©moire.
if 'final_embeddings' in locals() and 'metadata_df' in locals():
    search_local_test("generative ai energy efficiency")
    search_local_test("sustainable materials for construction")
else:
    print("‚ö†Ô∏è Erreur : Les variables 'final_embeddings' ou 'metadata_df' ne sont pas d√©finies.")
    print("Assure-toi d'avoir ex√©cut√© les cellules pr√©c√©dentes (Assemblage et Sauvegarde).")


üîé Query: 'generative ai energy efficiency'
--------------------------------------------------
Score: 0.5035 | Year: 2023
Title: Future Distribution Power Flow Scenario Generation Method Using Generative Adversarial Network Considering Correlation Between DERs
Authors: Unknown...

Score: 0.4728 | Year: 2019
Title: Generating images with desired properties using the discogan model enhanced with repeated property construction
Authors: Unknown...

Score: 0.4373 | Year: 2023
Title: A systematic and critical review on effective utilization of artificial intelligence for bio-diesel production techniques
Authors: Unknown...


üîé Query: 'sustainable materials for construction'
--------------------------------------------------
Score: 0.6194 | Year: 2020
Title: EMBODIED CARBON EMISSIONS OF CONSTRUCTION MATERIALS: A CASE STUDY OF BUILDINGS IN THAILAND
Authors: Unknown...

Score: 0.5860 | Year: 2018
Title: Precast industry contributed toward green construction
Authors: Unknown...

Score: 0.5