## This notebook identifies **carnitines in the GNPS Library by annotation**
(original notebook content: v_carnitines_M+H_name_df_for_manuscript.ipynb)

---
### Section 1: Read clean GNPS Library data
- spectrum_ids associated with SUSPECT LIST data have already been removed
- only spectrum_ids associated with the M+H adduct are included

### Section 2: Identify all carnitines by annotation
- search the 'Compound_Name' metadata column for entries containing the substring 'carnitine' (case-insensitive)

## Input files needed for the Notebook
1. **Cleaned** GNPS Library metadata from clean_GNPS_Library_data.ipynb

In [1]:
import pandas as pd

### Section 1: Read clean GNPS Library data

In [2]:
# Load the cleaned GNPS Library metadata, using spectrum_id as the row index.
# Source per the original note: shape_GNPS_Library_data.ipynb
# NOTE(review): the "Input files" section of this notebook names
# clean_GNPS_Library_data.ipynb instead -- confirm which notebook writes this CSV.
cleaned_library_csv = '/home/jovyan/work/notebooks/outputs/CLEANED_GNPS_input_library.csv'
input_library_cleaned = pd.read_csv(
    cleaned_library_csv,
    sep=',',
    index_col='spectrum_id',
    low_memory=False,
)

In [3]:
# Number of rows (one per spectrum_id) in the cleaned library table.
input_library_cleaned.shape[0]

245648

### Section 2: Identify carnitines by annotation

In [4]:
# Keep only the library rows whose 'Compound_Name' mentions 'carnitine'.
# case=False makes the match case-insensitive; na=False treats a missing
# compound name as "no match", so those rows are dropped.
carnitine_name_mask = input_library_cleaned['Compound_Name'].str.contains(
    'carnitine',
    case=False,
    na=False,
)
input_library_carnitines_case_insen = input_library_cleaned[carnitine_name_mask]

In [5]:
# Show the carnitine subset as the cell output (bare last expression -> notebook table display).
input_library_carnitines_case_insen

Unnamed: 0_level_0,index,source_file,task,scan,ms_level,library_membership,spectrum_status,peaks_json,splash,submit_user,...,Ion_Mode,create_time,task_id,user_id,InChIKey_smiles,InChIKey_inchi,Formula_smiles,Formula_inchi,url,annotation_history
spectrum_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCMSLIB00004684226,4046,f.lfnothias/TEMP/201809_TEMP_REFERENCE_N_ACYL_...,10bf09df91cd4bc49f11f9d042619262,1145,2,GNPS-LIBRARY,1,"[[78.831253,48.000000],[81.134575,89.000000],[...",null-null-null-null,lfnothias,...,Positive,2018-09-18 16:17:01.0,fc87812c8b0f4d85811fcb06172d3d60,,LZOSYCMHQXPBFU-UHFFFAOYSA-N,,C17H33NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': ' ', 'Charge'..."
CCMSLIB00004684227,4047,f.lfnothias/TEMP/201809_TEMP_REFERENCE_N_ACYL_...,bf7f0b1faf244732b25797393a0c8bfe,1306,2,GNPS-LIBRARY,1,"[[81.067673,58.000000],[81.868629,39.000000],[...",null-null-null-null,lfnothias,...,Positive,2018-09-18 16:16:43.0,84a4551c00454d7dadf16212716efcfa,,XOMRRQXKHMYMOC-UHFFFAOYSA-O,,C23H46NO4+,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': ' ', 'Charge'..."
CCMSLIB00004684228,4048,f.lfnothias/TEMP/201809_TEMP_REFERENCE_N_ACYL_...,a1c8ea94dcb2408aacf28b0a6b499e70,1604,2,GNPS-LIBRARY,1,"[[76.025177,28.000000],[76.907547,37.000000],[...",null-null-null-null,lfnothias,...,Positive,2018-09-18 16:16:04.0,348e88fbeac0477dabad132e9bee2e67,,XOMRRQXKHMYMOC-UHFFFAOYSA-N,,C23H45NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': ' ', 'Charge'..."
CCMSLIB00004684229,4049,f.lfnothias/TEMP/201809_TEMP_REFERENCE_N_ACYL_...,1d3b324f1cf24c119a520d1633e83122,1548,2,GNPS-LIBRARY,1,"[[77.038208,52.000000],[79.039864,68.000000],[...",null-null-null-null,lfnothias,...,Positive,2018-09-18 16:15:35.0,0c9c258babc44e4d9354d1ceee93b376,,SEXHTZQULWPHBX-UHFFFAOYSA-N,,C27H45NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': ' ', 'Charge'..."
CCMSLIB00004684230,4050,f.lfnothias/TEMP/201809_TEMP_REFERENCE_N_ACYL_...,d83a0ab3796a47db81fa17fcccb28f64,1538,2,GNPS-LIBRARY,1,"[[82.021034,18.000000],[82.307793,40.000000],[...",null-null-null-null,lfnothias,...,Positive,2018-09-18 16:36:46.0,778bf693a1024b7e9d249443e967aeb5,,MJLXQSQYKZWZCB-UHFFFAOYSA-N,,C25H45NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': ' ', 'Charge'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCMSLIB00006121167,577880,birmingham_pos.mgf,04aea32d6ccf4240b8b5131584ae6413,5049,2,BIRMINGHAM-UHPLC-MS-POS,1,"[[60.080601,62910528.000000],[85.028198,655002...",null-null-null-null,mwang87,...,Positive,2021-04-15 16:26:46.0,04aea32d6ccf4240b8b5131584ae6413,,RDHQFKQIGNGIED-MRVPVSSYSA-N,,C9H17NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': '', 'Charge':..."
CCMSLIB00006121169,577881,birmingham_pos.mgf,04aea32d6ccf4240b8b5131584ae6413,5050,2,BIRMINGHAM-UHPLC-MS-POS,1,"[[57.792702,2138579.000000],[60.077400,2803314...",null-null-null-null,mwang87,...,Positive,2021-04-15 16:26:46.0,04aea32d6ccf4240b8b5131584ae6413,,RDHQFKQIGNGIED-MRVPVSSYSA-N,,C9H17NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': '', 'Charge':..."
CCMSLIB00006121170,577882,birmingham_pos.mgf,04aea32d6ccf4240b8b5131584ae6413,5051,2,BIRMINGHAM-UHPLC-MS-POS,1,"[[60.080601,50613920.000000],[73.382797,201758...",null-null-null-null,mwang87,...,Positive,2021-04-15 16:26:46.0,04aea32d6ccf4240b8b5131584ae6413,,RDHQFKQIGNGIED-MRVPVSSYSA-N,,C9H17NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': '', 'Charge':..."
CCMSLIB00006121172,577883,birmingham_pos.mgf,04aea32d6ccf4240b8b5131584ae6413,5052,2,BIRMINGHAM-UHPLC-MS-POS,1,"[[52.450199,2231351.000000],[59.031700,2142775...",null-null-null-null,mwang87,...,Positive,2021-04-15 16:26:46.0,04aea32d6ccf4240b8b5131584ae6413,,RDHQFKQIGNGIED-MRVPVSSYSA-N,,C9H17NO4,,https://gnps.ucsd.edu/ProteoSAFe/gnpslibrarysp...,"[{'Adduct': 'M+H', 'CAS_Number': '', 'Charge':..."


### Save file

In [11]:
# Write the carnitine subset to CSV.
# reset_index() moves the spectrum_id index into an ordinary column so it is
# saved with the data; index=False then omits the fresh integer index.
carnitine_output_csv = '/home/jovyan/work/notebooks/outputs/library_df_carnitine_case_insen_M+H.csv'
carnitines_with_id_column = input_library_carnitines_case_insen.reset_index()
carnitines_with_id_column.to_csv(carnitine_output_csv, sep=',', index=False)