In [None]:
# Simple false-negative extractor (extended)
# Reads combined and filtered CSVs, finds molecules in Combined that match any synonym
# but whose SMILES are not present in Filtered. Outputs:
# - a full CSV with requested columns plus matched_term

import pandas as pd
from pathlib import Path

# ---- Inputs and output folder ----
combined_file = 'Combined_models_with_ChemBL.csv'
filtered_file = 'Final_Filtered_models_with_ChemBL.csv'
output_dir = Path('False_Negative')
output_dir.mkdir(exist_ok=True)  # no-op when the folder is already there

# Load both tables (Combined first, then Filtered).
df_combined, df_filtered = (pd.read_csv(csv_path) for csv_path in (combined_file, filtered_file))

# Target synonyms searched for in all_known_targets.
# Order matters: the first synonym that matches is the one reported.
synonyms = [
    '5HT2A', 'HTR2A', '5-HT2A',
    '5-HT2A receptor',
    'Serotonin receptor 2A',
    '5 Hydroxytryptamine receptor 2A',
]

# Columns copied into the full output; any that Combined lacks are skipped later.
wanted_cols = (
    # confidence / pTM / pLDDT / PDE columns
    ['confidence_score', 'ptm', 'iptm', 'ligand_iptm', 'protein_iptm',
     'complex_plddt', 'complex_iplddt', 'complex_pde', 'complex_ipde',
     'chains_ptm', 'pair_chains_iptm']
    # affinity_* columns
    + ['affinity_pred_value', 'affinity_probability_binary',
       'affinity_pred_value1', 'affinity_probability_binary1',
       'affinity_pred_value2', 'affinity_probability_binary2']
    # SMILES, IC50 and target columns
    + ['smiles', 'IC50', 'target_name', 'target_id', 'all_known_targets']
)

def find_matched_term(text, terms=None):
    """Return the first synonym contained in ``text``, ignoring case.

    Parameters
    ----------
    text : object
        Cell value to search (normally the ``all_known_targets`` string).
        Missing values (``None`` / ``NaN``) never match.
    terms : iterable of str, optional
        Synonyms to look for, in priority order. When omitted, the
        module-level ``synonyms`` list is used.

    Returns
    -------
    str or None
        The matching synonym exactly as spelled in ``terms``, or ``None``
        when ``text`` is missing or contains none of them.
    """
    if pd.isna(text):
        return None
    candidates = synonyms if terms is None else terms
    haystack = str(text).lower()
    # Plain substring test: a term matches wherever it appears inside the text.
    return next((term for term in candidates if term.lower() in haystack), None)

# Fail early, with a readable message, if a column this step relies on is absent.
missing_required = [c for c in ('all_known_targets', 'smiles') if c not in df_combined.columns]
if missing_required:
    raise KeyError(f'{combined_file} is missing required column(s): {missing_required}')
if 'smiles' not in df_filtered.columns:
    raise KeyError(f"{filtered_file} is missing required column: 'smiles'")

# Tag every Combined row with the first synonym found in all_known_targets.
combined = df_combined.copy()
combined['matched_term'] = combined['all_known_targets'].apply(find_matched_term)
matched = combined[combined['matched_term'].notna()]

# A row without a SMILES cannot be compared against Filtered. Left in, astype(str)
# would turn the missing value into the literal 'nan', which is never in the
# Filtered set, so the row would always be reported as lost. Drop such rows from
# the hit list instead and say how many were skipped.
has_smiles = matched['smiles'].notna()
n_no_smiles = int((~has_smiles).sum())
if n_no_smiles:
    print(f'Skipped {n_no_smiles} matched rows with missing SMILES')
combined_hits = matched[has_smiles].copy()

# SMILES present in Filtered (exact text comparison, no canonicalisation).
filtered_smiles = set(df_filtered['smiles'].dropna().astype(str))

# Lost = matched in Combined but SMILES absent from Filtered.
combined_hits['smiles_str'] = combined_hits['smiles'].astype(str)
lost = combined_hits[~combined_hits['smiles_str'].isin(filtered_smiles)].copy()

# Full output: first Combined column, then whichever wanted_cols exist, then matched_term.
first_col_name = combined.columns[0]
available_cols = [c for c in wanted_cols if c in combined.columns]
full_cols = [first_col_name] + available_cols + ['matched_term']

# The first column is renamed to path_or_name for clarity.
out_full = lost[full_cols].rename(columns={first_col_name: 'path_or_name'})

out_file_full = output_dir / 'lost_5HT2A_full.csv'
out_full.to_csv(out_file_full, index=False)

print(f'Wrote full output with {len(out_full)} rows to {out_file_full}')


In [None]:
# Plot histograms for selected columns from lost_5HT2A_full.csv
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

output_dir = Path('False_Negative')
file_full = output_dir / 'lost_5HT2A_full.csv'

if file_full.exists():
    df = pd.read_csv(file_full)
    cols_to_plot = ['confidence_score', 'affinity_pred_value', 'affinity_probability_binary']
    for col in cols_to_plot:
        # Skip columns the CSV does not have.
        if col not in df.columns:
            print(f'Column {col} not found in {file_full}')
            continue

        # Non-numeric entries become NaN and are dropped before plotting.
        series = pd.to_numeric(df[col], errors='coerce').dropna()
        if series.empty:
            print(f'Column {col} exists but has no numeric data.')
            continue

        # One figure per column, written to disk and closed straight away.
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.hist(series, bins=50, edgecolor='black', alpha=0.7)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.set_title(f'Histogram of {col} for lost 5HT2A molecules')
        fig.tight_layout()
        out_png = output_dir / f'{col}_hist_lost_5HT2A.png'
        fig.savefig(out_png)
        plt.close(fig)
        print(f'Saved histogram for {col} to {out_png}')
else:
    print(f'{file_full} not found. Run the extractor cell first.')


In [None]:
# Build the false-negative-rate (FNR) summary table
import pandas as pd
from pathlib import Path

# Counts come from variables created by the extractor cell, which must be run first.
known_actives = len(combined_hits)  # rows kept in combined_hits by the extractor cell
missed_actives = len(out_full)      # rows written to the lost_5HT2A_full.csv output
retrieved_actives = known_actives - missed_actives

# With no known actives the rate is undefined; report N/A instead of dividing by zero.
if known_actives:
    fnr = (missed_actives / known_actives) * 100
    fnr_text = f"{fnr:.2f}%"
else:
    fnr = float('nan')
    fnr_text = 'N/A'

# Single-row summary table
fnr_data = {
    'Known Actives': [known_actives],
    'Retrieved Actives': [retrieved_actives],
    'Missed Actives (FN)': [missed_actives],
    'False Negative Rate (FNR)': [fnr_text]
}

fnr_df = pd.DataFrame(fnr_data)
print(fnr_df.to_string(index=False))

# Save the table next to the other false-negative outputs
output_dir = Path('False_Negative')
output_file = output_dir / 'FNR%.csv'
fnr_df.to_csv(output_file, index=False)
print(f"\n已保存到: {output_file}")