In [591]:
import pandas as pd
import re

### Find Current Duplicate Cocktails

In [623]:
#read in the csv
ref_cocks = pd.read_csv("Reference Phage Cocktail.csv")

#drop all rows that don't have a single phage in the cocktail (what am I supposed to do with this)
ref_cocks = ref_cocks.dropna(subset='Phage 1')

#inconsequential check that everything in the registry ID column is just "r" + Name

# #remove the "r"
# ref_cocks['Registry ID'] = ref_cocks['Registry ID'].str[1:]
# #does registry id == name
# mismatches = []
# for index, row in ref_cocks.iterrows():
#     if row['Registry ID'] != row['Name']:
#         mismatches.append(row['Registry ID'])
# print(mismatches)

#get the phage columns
phage_cols = ref_cocks.columns[6:]

ref_cocks[phage_cols] = ref_cocks[phage_cols].apply(lambda x: x.astype(str) if x.notna().all() else x)

#check for which phage are capital
capital_phage = []
for col in phage_cols:
    # Use apply to apply the check to each element in the column
    contains_capital = ref_cocks[col].apply(lambda x: any(char.isupper() for char in str(x)))
    
    # Use the boolean mask to filter the values with capital letters
    capital_values = ref_cocks[col][contains_capital]
    
    # Extend the list with the values containing capital letters
    capital_phage.extend(capital_values)
#print(list(set(capital_phage)))

#make all the phage contents lowercase
#ref_cocks[phage_cols] = ref_cocks[phage_cols].apply(lambda x: x.str.lower())

#remove all the specific registration phages
def drop_specific_entity(phage):
    if type(phage) == str:
        if '.' in phage:
            return phage.split('.')[0]
    return phage

ref_cocks[phage_cols] = ref_cocks[phage_cols].applymap(drop_specific_entity)

#drop the 'pdp' from phages
ref_cocks[phage_cols] = ref_cocks[phage_cols].applymap(lambda x: x[:-3] if str(x).endswith('pdp') else x)

#find all phages that end in p
p_df= ref_cocks[phage_cols].applymap(lambda x: x if str(x).endswith('p') else None)
concat_p = pd.concat([p_df[col] for col in p_df.columns])
phages_end_in_p = concat_p.unique()

#drop the p from purified phages that end in 'p' but not the phages whose name ends in 'p'
ref_cocks[phage_cols] = ref_cocks[phage_cols].applymap(lambda x: x[:-1] if str(x).endswith('p') and len(str(x)) > 5 else x)

ref_cocks.to_csv("cocktail_only_reference.csv", index=False)


######### make sorted phages tuple and check for duplicate cocktails ############


#make a 'sorted row' column that is a sorted tuple of all the phages, converted to lowercase first
ref_cocks['sorted_row'] = ref_cocks[phage_cols].apply(lambda row : tuple(sorted(str(x).lower() for x in row.tolist() if pd.notna(x))), axis=1)

#make everything in the 'sorted row' lowercase
#ref_cocks['sorted_row'] = ref_cocks['sorted_row'].apply(lambda x: x.lower())

# Identify duplicate rows based on the 'sorted_row' column
duplicates = ref_cocks[ref_cocks.duplicated(subset='sorted_row', keep=False)]
duplicates = duplicates.dropna(axis=1, how='all')
duplicates.to_csv("cocktail_duplicates.csv", index=False)

# ref_cocks_duplicate_dictionary = duplicates.groupby('sorted_row')['Name'].apply(lambda group: group.values.tolist()).to_dict()
# print(ref_cocks_duplicate_dictionary)

# Get the indices of the second instance of each duplicate row
duplicate_instance_indices = duplicates[duplicates.duplicated(subset='sorted_row', keep='first')].index

#drop the second instance of each duplicate
ref_cocks=ref_cocks.drop(duplicate_instance_indices)

#drop the 'Registry Validation Status' and 'Registry Validation Errors' columns
ref_cocks=ref_cocks.drop(columns=['Registry Validation Status', 'Registry Validation Errors'])

#add a non-tuple column for better upload into Labguru
def my_tuple_func(row):
    tuple_elements = []
    for item in row:
        tuple_elements.append(item)
    return tuple_elements

def convert_to_string(row):
    items_string = ', '.join(row)
    # Remove the brackets
    items_string_without_brackets = items_string.replace('[', '').replace(']', '')
    return items_string_without_brackets

#separate out the reference phage from the reference engineered phage for parental links
def split_parent_phage_types(row):
    pattern = r"e\d{3}$"
    ephage = []
    phage = []
    row_list = row.split(', ')
    for item in row_list:
        if re.search(pattern, item):
            ephage.append(item)
        else:
            phage.append(item)
    return phage, ephage

ref_cocks['phage_for_upload'] = ref_cocks.sorted_row.apply(my_tuple_func)
ref_cocks['phage_for_upload'] = ref_cocks['phage_for_upload'].apply(convert_to_string)
ref_cocks[['ref_phage', 'ref_eng_phage']] = ref_cocks['phage_for_upload'].apply(split_parent_phage_types).apply(pd.Series)
ref_cocks['ref_phage'] = ref_cocks['ref_phage'].apply(convert_to_string)
ref_cocks['ref_eng_phage'] = ref_cocks['ref_eng_phage'].apply(convert_to_string)

#Make the names Labguru compatible
ref_cocks["Author"].replace('htuson', 'Hannah Tuson', inplace=True)
ref_cocks["Author"].replace('lana', 'Lana McMillan', inplace=True)
ref_cocks["Author"].replace('ecbaker', 'Ethan Baker', inplace=True)
ref_cocks["Author"].replace('JoeKenny', 'Joe Kenny', inplace=True)
ref_cocks["Author"].replace('rwmckee', 'Robert McKee', inplace=True)
ref_cocks["Author"].replace('sldorofi', 'Prof. Sidra Dorofi', inplace=True)
ref_cocks["Author"].replace('naomichavez', 'Naomi Chavez', inplace=True)
ref_cocks["Author"].replace('boyermaggie', 'Maggie Boyer', inplace=True)
ref_cocks["Author"].replace('cameron.prybol', 'Cameron Prybol', inplace=True)
ref_cocks["Author"].replace('mabster6', 'Michelle Baldassare', inplace=True)
ref_cocks["Author"].replace('aghobadian', 'Ava Ghobadian', inplace=True)

for index, value in ref_cocks['Author'].items():
    if value in ['dana.elmore', 'eyra.dordi', 'delorend', 'pearlwilcock', 'AStepanek', 'katez', 'julianam', 'Malika123', 'BrookeSauder', 'megan.frisbee', 'sara.siegel', 
                 'Dmun', 'Cbarlogio', 'Souweli', 'sserrano', 'Zhanafy', 'emccabe',  'sara.woessner', 'paige.mason', 'mawelch', 'barberbe', 'jaden.skelly', 'Nfranz' ]:
        ref_cocks.loc[index, 'Author'] = 'Luke Sobolewski'

#Get the unique cocktail dataframe that I'll upload to Labguru
ref_cocks.to_csv("Reference_Phage_Cocktails_forUpload.csv", index=False)

## Getting all Cocktails (not just reference)

In [606]:
cocks = pd.read_csv('Phage_Cocktails.csv')

#drop Enzymes and 'Titers' Columns (bc they're empty and I don't care
cocks.drop(columns=['Enzyme', 'Enzyme Concentration', 'Titers'], inplace=True)

#remove the antibiotic cocktails because they're stupid and/or duplicates
cocks = cocks[~cocks.Antibiotic.notna()]

#remove the antibiotic columns, for obvious reasons and resgistry validation erros + status bc I don't care
cocks.drop(columns=['Registry Validation Errors', 'Registry Validation Status', 'Antibiotic', 'Antibiotic Concentration (ug/mL)'], inplace=True)

#fill in a couple missing reference cocktails
cocks['Reference Cocktail'] = cocks.apply(lambda row: 'CK000007' if row['Name'] == 'CK000007.001' else row['Reference Cocktail'], axis=1)
cocks['Reference Cocktail'] = cocks.apply(lambda row: 'CK000029' if row['Name'] == 'CK000029.001' else row['Reference Cocktail'], axis=1)

#remove Nolan's recent cocktail that is bad. (not registered correctly)
cocks = cocks[cocks.Author != 'SykesNC']

#sort the dataframe by the 'Name" column
cocks = cocks.sort_values(by='Name').reset_index(drop=True)

#Get fields for all phage upload, phage, engineered phage, purified phage.
def split_3parent_types(row):
    eng_pattern = r"e\d{3}\.\d{3}$"
    phage_pattern = r"(?<!p)\.\d{2,3}$"
    ephage = []
    phage = []
    pur_phage = []
    row_list = row.split(', ')
    for item in row_list:
        if re.search(eng_pattern, item):
            ephage.append(item)
        elif re.search(phage_pattern, item):
            phage.append(item)
        else:
            pur_phage.append(item)
    return pur_phage, phage, ephage

cocks[['purified_phage', 'phage', 'eng_phage']] = cocks['Input Phages (NOT REFERENCE PHAGE)'].apply(split_3parent_types).apply(pd.Series)
cocks['purified_phage'] = cocks['purified_phage'].apply(convert_to_string)
cocks['phage'] = cocks['phage'].apply(convert_to_string)
cocks['eng_phage'] = cocks['eng_phage'].apply(convert_to_string)

#Add Genus Species to dataframe
ref_cocks_name_genusSpecies = ref_cocks[["Name", "Genus species"]]
ref_cocks_name_genusSpecies.rename(columns={"Name" : "Reference Cocktail"}, inplace=True)
cocks = pd.merge(cocks, ref_cocks_name_genusSpecies, on="Reference Cocktail", how="left")

#Make the names Labguru compatible
cocks["Author"].replace('htuson', 'Hannah Tuson', inplace=True)
cocks["Author"].replace('lana', 'Lana McMillan', inplace=True)
cocks["Author"].replace('ecbaker', 'Ethan Baker', inplace=True)
cocks["Author"].replace('JoeKenny', 'Joe Kenny', inplace=True)
cocks["Author"].replace('rwmckee', 'Robert McKee', inplace=True)
cocks["Author"].replace('sldorofi', 'Prof. Sidra Dorofi', inplace=True)
cocks["Author"].replace('naomichavez', 'Naomi Chavez', inplace=True)
cocks["Author"].replace('boyermaggie', 'Maggie Boyer', inplace=True)
cocks["Author"].replace('cameron.prybol', 'Cameron Prybol', inplace=True)
cocks["Author"].replace('mabster6', 'Michelle Baldassare', inplace=True)

for index, value in cocks['Author'].items():
    if value in ['dana.elmore', 'eyra.dordi', 'delorend', 'pearlwilcock', 'AStepanek', 'katez', 'julianam', 'Malika123', 'BrookeSauder', 'megan.frisbee', 'sara.siegel', 
                 'Dmun', 'Cbarlogio', 'Souweli', 'sserrano', 'Zhanafy', 'emccabe',  'sara.woessner', 'paige.mason', 'mawelch', 'barberbe']:
        cocks.loc[index, 'Author'] = 'Luke Sobolewski'

# Drop the initial titer column
cocks.drop(columns={"Titer"}, inplace=True)

cocks.to_csv("Sorted_Phage_Cocktails.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ref_cocks_name_genusSpecies.rename(columns={"Name" : "Reference Cocktail"}, inplace=True)


### Practice 'I want to register a cocktail like this'

In [577]:
#Pretend user input is a list of phages
#input = ['p1772.001', 'p1835pdp.013', 'p1695e001.01', 'p00drp.b001']
#input = ['p1240p.b004', 'p2396p.b003', 'p006437p.b001', 'p2606e005.001', 'p006695p.b003', 'p5918p.b002', 'p006319p.b002', 'p006412p.b001', 'p006148p.b002', 'p006175p.b002']
#input = ['p1772.001', 'PB1.001']
#input = ['p1772e005.001', 'PB1e002.001']
#input = ['p1772.001','DMS3.004', 'PB1.001', 'p1106.003']
#input = ['p3769.002', 'p006967.005','p006921.006','p5762.001''p5682.004','p007177.002', 'p007047.001', 'p007037.001']
#input = ['p1106e003.001', 'p1772e005.001', 'p1835e002.001', 'p2131e002.001'] # works at line 53
#input = ['p0031.002']  # works at line 100
#input = ['p1229.001', 'p1234.001', 'p2273.001'] # works at line 500	sorted_row
#input = ['p004k.001', 'p007225.001', 'p00ke.001', 'p5516.001'] # fails at line 1000
#input = ['p00kee066.001','p00jce084.001','p00jce086.001','p00exe014.001','p004Ke009.001','p00c0e103.001','p00exe299.001','p004Ke127.001','p5516.001']
#input = ('p004ke124.001', 'p006921.001', 'p007225.001', 'p00c0e103.001', 'p00exe296.001', 'p00jce098.001', 'p00kee072.001', 'p5516.001') # works?? line 1256
#input = ('p004ke045.001', 'p00c0e093.001', 'p00c0e094.001', 'p00c0e095.001', 'p00exe058.001', 'p00jce048.001', 'p00jce051.001', 'p00kee039.001') # fails
#input =	('p004ke124.001', 'p004ke127.001', 'p006977.001', 'p006984.001', 'p00exe291.001', 'p00exe296.001', 'p00exe299.001', 'p00exe300.001', 'p00jce098.001', 'p00jce172.001', 'p00kee072.001', 'p5516.001') # works at line 1300
#input =	('p004ke045.001', 'p00c0e093.001', 'p00c0e094.001', 'p00c0e095.001', 'p00exe058.001', 'p00jce048.001', 'p00jce051.001', 'p00kee039.001')
#input = ('p004ke133.001', 'p004ke134.001', 'p00c0.001', 'p00exe312.001', 'p00exe314.001', 'p00exe315.001', 'p00jce184.001', 'p00jce185.001', 'p00jce186.001', 'p00ke.001', 'p5516.001','p6921.001', 'p6984e007.001') # fails??? at line 1387
#input =	('p004ke009.001', 'p00c0e103.001', 'p00exe014.001', 'p00ke.001') # fails at line 1345
#input =	('p00exe296.001', 'p00jce098.001') # works line 1304
#input =	('p004ke009.001', 'p006921.001', 'p006977.001', 'p006984.001', 'p00c0e103.001', 'p00exe014.001', 'p00jc.001', 'p00ke.001', 'p5516.001') # works line 1310
#input = ('p004ke009.001', 'p006921.001', 'p006977.001', 'p006984.001', 'p007050.001', 'p00c0e103.001', 'p00exe014.001', 'p00jc.001', 'p00ke.001', 'p5516.001') #works at line 1312
input =	('p004k.001', 'p00ex.001', 'p00jc.001', 'p00ke.001') # fails at line 1313

#this next input should give nothing
#input =	('p004ke045.001', 'p00c0e093.001', 'p00c0e094.001', 'p00c0e095.001', 'p00exe058.001', 'p00jce048.001', 'p00jce051.001', 'p00kee039.001', 'p00notreal.001')

#These phages must be phage entities!!!

# most end in .XXX but some are just .XX and some .bXXX
no_ref_phage = any('.' not in phage for phage in input)

if no_ref_phage:
    raise Exception('Input must contain phage entities, no reference phage')

#then concatenate the input reference phages and check to see if this cocktail exists
def drop_entity(phages):
    cleaned_phages = []
    for phage in phages:
        cleaned_phages.append(phage.split('.')[0])
    return cleaned_phages

cleaned_phages = drop_entity(input)

#eliminate the pdp's and the p's from those pesky purified phage entities
# for phage in cleaned_phages:
#     if phage 

def ref_phages(phages):
    ref_phages = []
    for phage in phages:
        if phage.endswith('pdp'):
            ref_phages.append(phage[:-3])
        elif phage.endswith('p'):
            ref_phages.append(phage[:-1])
        else:
            ref_phages.append(phage)
    return ref_phages

reference_phages = ref_phages(cleaned_phages)

lowercase_phage = [phage.lower() for phage in reference_phages]
#print(lowercase_phage)

sorted_ref_phages = tuple(sorted(lowercase_phage))
print(sorted_ref_phages)

#1st attempt: times out somewhere in iterating between rows 500 and 1000

#check if the sorted tuple exists in the reference cocktail sheet:
def next_cockName_existingRef(ref_cock_df, sorted_phages):
    for i in range(len(ref_cock_df)):
        if sorted_phages == ref_cock_df.sorted_row.iloc[i]:
 
            #get where the existing cocktail is in the reference cocktail sheet
            ref_cocktail_row = ref_cock_df[ref_cock_df.sorted_row == sorted_phages]
            existing_ref_cocktail = ref_cocktail_row.Name.item()
        
            #get the last cocktail entity that uses that reference cocktail
            cock_entity_rows = cocks[cocks['Reference Cocktail'] == existing_ref_cocktail]
            
            #if there is not an existing entity cocktail (but a reference exists) create the first 
            if cock_entity_rows.empty:
                return (existing_ref_cocktail + '.001')
            
            #find the last instance of the cocktail if it exists
            last_cock_entity_row = cock_entity_rows.iloc[-1]
            actual_last_entity = last_cock_entity_row.Name

            #increase that value by 1
            cock_num = int(actual_last_entity.split('.')[1])
            cock_num_plus_1 = cock_num + 1
            return existing_ref_cocktail + '.' + str(cock_num_plus_1).zfill(3)

def create_ref_cock(sorted_phages):
    last_ref_cock = ref_cocks.Name.iloc[-1]
    last_ref_cock_plus_1 = int(last_ref_cock[2:]) + 1
    new_ref_cock = 'CK' + str(last_ref_cock_plus_1).zfill(6)
    return new_ref_cock

next_cocktail_existingRef = next_cockName_existingRef(ref_cocks, sorted_ref_phages)
print(next_cocktail_existingRef)

new_ref_cocktail = create_ref_cock(sorted_ref_phages)
#print(new_ref_cocktail)

('p004k', 'p00ex', 'p00jc', 'p00ke')
CK001403.001


In [578]:
#testing dataframe with just the last row of the current reference cocktail dataframe
test_new_refCocks = ref_cocks.iloc[-1:]

#phages input (after sorting occurs)
user_input = ('p004ke045', 'p00c0e093', 'p00c0e094', 'p00c0e095', 'p00exe058', 'p00jce048', 'p00jce051', 'p00kee039', 'p00notreal')

def create_ref_cock_and_append(sorted_phages):
    last_ref_cock = ref_cocks.Name.iloc[-1]
    last_ref_cock_plus_1 = int(last_ref_cock[2:]) + 1
    new_ref_cock = 'CK' + str(last_ref_cock_plus_1).zfill(6)
    
    #new dataframe row with the new data
    new_ref_row = pd.DataFrame({'Registry ID': ('r' + new_ref_cock),
                                'Name': new_ref_cock, 
                                'sorted_row' : [sorted_phages]})
    
    for i in range(len(sorted_phages)):
        new_ref_row['Phage ' + str(i+1)] =  sorted_phages[i]
    
    return new_ref_row

#if there isn't a reference cocktail, make one and then give the .001 cocktail
if (next_cockName_existingRef(ref_cocks, user_input)) is None:
    updated_ref_cocks = pd.concat([test_new_refCocks, create_ref_cock_and_append(user_input)], ignore_index=True)
    new_cocktail_new_ref = next_cockName_existingRef(updated_ref_cocks, user_input)
print(new_cocktail_new_ref)

#Final Step would be to concat this to the actual cocktail dataframe, but I think it's time to move this to Labguru

CK001599.001
