In [None]:
import pandas as pd
import numpy as np

In [None]:
# This script reads in metadata files that already have clade info added.
# It both creates new columns and merges in data from supporting files for:
#   species, flyways and regions, domestic status, and improved order/species groupings.
# The script outputs a new version of these metadata files for each gene.

In [None]:
#import metadata files to modify
df = pd.read_csv('h5nx_cattle_update/metadata-with-clade_h5nx.tsv', sep='\t')


In [None]:
#check df
df

In [None]:
# Create the 'Animal' column as an empty *object* column (all NaN).
# It is filled with host-name strings in the next cell; starting from a float64
# NaN column would make each string assignment an incompatible-dtype write
# (a FutureWarning in pandas >= 2.1).
df['Animal'] = pd.Series(np.nan, index=df.index, dtype='object')
list(df.columns)

In [None]:
# Fill 'Animal' with the second '/'-separated field of each strain name.
# Strains with no second field, or with a missing strain name, get NaN.
# The whole column is assigned at once: this avoids the row-by-row loop, does not
# fail on missing strain values, and does not depend on the dtype the column was
# created with.
strain_fields = df['strain'].str.split('/')
df['Animal'] = strain_fields.str[1]

In [None]:

# Lower-case the animal names (the animal lists later in the notebook are all
# lower case; presumably the species lookup is too -- verify against species.csv).
df = df.assign(Animal=df['Animal'].str.lower())
df

In [None]:
#importing species file
species = pd.read_csv('species.csv')
species

In [None]:
#rename column to match metafiles 
species = species.rename(columns={'annotated': 'Animal'})
list(species.columns)

In [None]:
df = pd.merge(df, species, how='left', on=['Animal'])
df

In [None]:
df=df.drop_duplicates(subset=['strain'])
df.info()

In [None]:
##now we're gonna do the same thing to add flyways and regions
fly_reg = pd.read_csv('flyway_regions.csv')
fly_reg

In [None]:
df = pd.merge(df, fly_reg, how='left', on=['location'])
df

In [None]:
df.loc[df['location'] == 'Minnesota']
#test to see that this worked bc can't see NorthAm strains is preview

In [None]:
#drop duplicates again and check
df=df.drop_duplicates(subset=['strain'])
df

In [None]:
#next merging- for domestic status, first need to read in file with this data
#importing Lambo's tsv with the metadata columns of interest
LDmeta = pd.read_csv('NA-H5Nx-2021-2023-seqmerge.tsv', sep='\t')
LDmeta.head(10)

In [None]:
#need to modify the id column to remove dashes so it will match the other metadata file
LDmeta['ID'] = LDmeta['ID'].str.replace('-','')
LDmeta

In [None]:
# Discard unnecessary/redundant columns before the join so the merged table stays manageable.
unneeded_columns = [
    'seq',
    'ID-22rem',
    'Location',
    'Note',
    'State_Province',
    'Clade',
    'city_county',
    'Host',
    'Collection_Date',
    'Unnamed: 0',
]
LDmeta = LDmeta.drop(unneeded_columns, axis='columns')
LDmeta.columns.tolist()

In [None]:
#clean up the namings for domestic status
LDmeta["Domestic_Status"] = LDmeta["Domestic_Status"].apply(lambda x: x.replace("nonhuman_mammal", "Nonhuman Mammal"))
LDmeta.loc[LDmeta['Domestic_Status'] == 'Nonhuman Mammal']

In [None]:
#clean up the namings for domestic status
LDmeta["Domestic_Status"] = LDmeta["Domestic_Status"].apply(lambda x: x.replace("Backyard bird", "Backyard Bird"))
LDmeta.loc[LDmeta['Domestic_Status'] == 'Backyard Bird']

In [None]:
LDmeta["Domestic_Status"] = LDmeta["Domestic_Status"].apply(lambda x: x.replace("U", "Unknown"))
LDmeta.loc[LDmeta['Domestic_Status'] == 'Unknown']

In [None]:
df['ID'] = np.nan
list(df.columns)

In [None]:
#make ID column from pulling out ID from strain name to match on 

split_values = df['strain'].str.split('/')  # 'strain' is the column to split
df['ID'] = split_values.str[3]
    
#for df in df_list:
    #for index, row in df.iterrows():
       # parts = row['strain'].split('/')
       # if len(parts) > 1:
          #  df.at[index, 'ID'] = parts[2]
df

In [None]:
# Left-join the domestic-status table (LDmeta) onto the metadata by 'ID'.
# pandas matches null join keys against each other, so LDmeta rows with a missing
# ID would be attached to every metadata row whose strain yielded no ID. Rows
# without an ID cannot identify a strain, so they are excluded from the right side.
df = pd.merge(df, LDmeta.dropna(subset=['ID']), how='left', on=['ID'])


In [None]:
print(df.loc[df['region'] == 'North America'])

In [None]:
list(df.columns)

In [None]:
    
df.drop(columns=['state2','broad','correction','Isolate_Id', 
                'Isolate_Name'], inplace=True)

list(df.columns)

In [None]:
df.Animal.unique()

In [None]:
#create list for the missing orders

carnivore_list = ['skunk', 'redfox', 'fox', 'bobcat', 'harborseal', 'raccoon', 
                  'blackbear', 'stripedskunk', 'cat', 'vulpesvulpes', 'coyote',
                  'greyseal', 'wildmink']
marsup_list = ['virginiaopossum']
artiodactyl_list = ['bottlenosedolphin','dolphin', 'dairycattle', 'goat']
anseriformes_list = ['lesserscaup']
passeriformes_list = ['greattailedgrackle', 'americanraven', 'commongrackle']
pelican_list = ['brownpelican', 'snowyegret']
accipitriformes_list = ['osprey', 'turkeyvulture', 'coopershawk']


df.loc[df['Animal'].isin(carnivore_list), 'order'] = 'carnivora'
df.loc[df['Animal'].isin(marsup_list), 'order'] = 'didelphimorphia'
df.loc[df['Animal'].isin(artiodactyl_list), 'order'] = 'artiodactyl'
df.loc[df['Animal'].isin(anseriformes_list), 'order'] = 'anseriformes'
df.loc[df['Animal'].isin(passeriformes_list), 'order'] = 'passeriformes'
df.loc[df['Animal'].isin(pelican_list), 'order'] = 'pelecaniformes'
df.loc[df['Animal'].isin(accipitriformes_list), 'order'] = 'accipitriformes'

In [None]:
#confirm
artis = df.loc[df['order'] == 'artiodactyl']

print(artis[['Animal','order']].head(10))

In [None]:
# Set Domestic_Status for a few animals by name, using the same list approach:
# animals in wild_list become 'Wild', animals in dom_list become 'Domestic'.
wild_list = ['blackvulture', 'commontern']
dom_list = ['chicken']

status_assignments = [
    (wild_list, 'Wild'),
    (dom_list, 'Domestic'),
]
for animal_names, status in status_assignments:
    is_listed = df['Animal'].isin(animal_names)
    df.loc[is_listed, 'Domestic_Status'] = status

# Print the chicken rows to check the result.
print(df[df['Animal'].eq('chicken')])

In [None]:
#create new empty column now named species group for grouping some orders together

df['species_group'] = "unknown"
list(df.columns)

In [None]:
wild_terrest_list = ['skunk', 'redfox', 'fox', 'bobcat', 'raccoon', 
                'blackbear', 'stripedskunk', 'cat', 'domesticcat', 'feline', 'vulpesvulpes', 'coyote',
                'wildmink']
rum_list = ['dairycattle', 'goat']
marine_list = ['harborseal', 'greyseal','bottlenosedolphin','dolphin']
humans = ['Human']

df.loc[df['Animal'].isin(wild_terrest_list), 'species_group'] = 'Mammal- Terrestrial'
df.loc[df['Animal'].isin(marine_list), 'species_group'] = 'Mammal- Marine'
df.loc[df['Animal'].isin(rum_list), 'species_group'] = 'Ruminant'
df.loc[df['host'].isin(humans), 'species_group'] = 'Human'

In [None]:
anser_list = ['anseriformes']
gall_list = ['galliformes']
raptor_list = ['accipitriformes', 'falconiformes', 'strigiformes']
waterbird_list = ['charadriiformes', 'pelecaniformes', 'suliformes', 'podicipediformes']
passer_list = ['passeriformes']
other_avian_list = ['casuariiformes', 'rheiformes', 'avian']


df.loc[df['order'].isin(anser_list), 'species_group'] = 'Anseriformes'
df.loc[df['order'].isin(gall_list), 'species_group'] = 'Galliformes'
df.loc[df['order'].isin(raptor_list), 'species_group'] = 'Raptor'
df.loc[df['order'].isin(waterbird_list), 'species_group'] = 'Other- Waterbird'
df.loc[df['order'].isin(passer_list), 'species_group'] = 'Passerine'
df.loc[df['order'].isin(other_avian_list), 'species_group'] = 'Other- Avian'

In [None]:
test = df.loc[df['species_group'] == 'Passerine']

print(test[['order','species_group', 'strain']].head(15))

In [None]:
# De-duplicate on strain one last time (first row per strain is kept), then preview.
df = df.drop_duplicates(subset='strain', keep='first')

df

In [None]:
# Export the merged metadata as a tab-separated file in the separate output folder.
# index=False: after the joins and de-duplication the row index is only a leftover
# positional label; writing it would add an unnamed first column to the TSV (it
# comes back as 'Unnamed: 0' when the file is re-read with pandas).
df.to_csv('output_tsvs/merged_meta_05-16.tsv', sep="\t", index=False)
