In [49]:
import pandas as pd
from EcoNameTranslator import to_scientific, to_common, to_species
import os
import csv
from pygbif import species

In [50]:
# Notebook-wide pandas display limits: at most 100 rows and 50 columns are
# rendered, and each cell is truncated to 20 characters.
for _option, _limit in (
    ('display.max_rows', 100),
    ('display.max_columns', 50),
    ('display.max_colwidth', 20),
):
    pd.set_option(_option, _limit)


In [51]:
# PreStart - pull the coordinate pair out of the original (semicolon-delimited)
# Vancouver inventory and split it into two columns on the comma.
# NOTE(review): the pieces are kept as the raw strings produced by the split --
# nothing is rounded or whitespace-stripped here despite the column names.
vcCoords = (
    pd.read_csv(
        "G:/Shared drives/host_tree_cnn/cleaning_species_names/og_inventories/vancouver_inventory.csv",
        delimiter=";",
    )["geo_point_2d"]
    .str.split(",", expand=True)
)
vcCoords.columns = ["rounded_lat", "rounded_lng"]


In [52]:
# Load the USDA code table and lower-case both name columns so later lookups
# can compare against lower-cased inventory names.
usdaDF = pd.read_csv('G:/Shared drives/host_tree_cnn/cleaning_species_names/addl_data/usda_code.txt')
print(usdaDF.columns.tolist())
for _name_col in ('Scientific Name with Author', 'Common Name'):
    usdaDF[_name_col] = usdaDF[_name_col].str.lower()

['Symbol', 'Synonym Symbol', 'Scientific Name with Author', 'Common Name', 'Family']


In [53]:
prevFindings = {}
# ^ cache: raw scientific-name string -> [genus, species] (or ["NA", "NA"] when
#   the lookup failed), so each distinct name is resolved only once.


# clean up the scientific names
def cleanScientific(scientificName):
    """Resolve a scientific name to a ``[genus, species]`` pair.

    The name is passed to ``to_species``; the first entry returned for it is
    split on whitespace and its last two tokens are taken as genus and species.
    Results (including failures) are memoised in ``prevFindings``.

    Parameters
    ----------
    scientificName : str
        Name to resolve. The sentinel ``'NA'`` short-circuits without a lookup.

    Returns
    -------
    list of str
        ``[genus, species]``, or ``["NA", "NA"]`` when the input is ``'NA'`` or
        the lookup raised / returned nothing usable. Always a fresh list, so
        callers cannot mutate the cache by accident.
    """
    if scientificName == 'NA':
        return ["NA", "NA"]
    if scientificName in prevFindings:
        return list(prevFindings[scientificName])
    try:
        index = to_species([scientificName])
        values = index[scientificName][0].split()
        # more than two tokens means the match is not a plain binomial; log it
        if len(values) > 2:
            print(values)
        # take the genus and species (last two tokens)
        result = [values[-2], values[-1]]
    except Exception:
        # Best-effort lookup: any library/lookup/indexing error means "unknown".
        # `Exception` (not a bare except) so Ctrl-C can still stop a long run.
        result = ["NA", "NA"]
    prevFindings[scientificName] = result
    return list(result)


In [54]:
# scientific name to common name
noneCount = 0               # running count of lookups that produced no match
sNaValues = set()           # scientific names already known to have no match
scientificNamesDict = {}    # cache: lower-cased scientific name -> common name(s)


def getCommonNames(scientificName):
    """Look up the common name(s) for a scientific name.

    Lookup order: the two caches, then an exact match of the lower-cased name
    against ``usdaDF['Scientific Name with Author']``, then ``to_common``.

    Parameters
    ----------
    scientificName : str
        Scientific name; compared case-insensitively (lower-cased first).

    Returns
    -------
    str, the ``to_common`` names value, or None
        The USDA ``'Common Name'`` string of the first matching row; otherwise
        element ``[1]`` of the ``to_common`` entry for the name (presumably a
        list of names -- confirm against EcoNameTranslator); ``None`` when no
        source has a match. Every ``None`` result increments ``noneCount``.
    """
    global noneCount
    scientificName = scientificName.lower()

    # check values that were already found
    if scientificName in sNaValues:
        noneCount += 1
        return None
    if scientificName in scientificNamesDict:
        return scientificNamesDict[scientificName]

    # check USDA for a match
    result = usdaDF.loc[usdaDF['Scientific Name with Author'] == scientificName, 'Common Name'].values
    if len(result) > 0:
        scientificNamesDict[scientificName] = result[0]
        return result[0]

    # check EcoNameTranslator for a match
    try:
        common_names = to_common([scientificName])[scientificName][1]
        if len(common_names) == 0:
            raise ValueError()
    except Exception:
        # Best-effort: any lookup failure is recorded as "no match" so the
        # name is not queried again.
        noneCount += 1
        sNaValues.add(scientificName)
        return None

    scientificNamesDict[scientificName] = common_names
    return common_names

In [55]:
# common name to scientific name
# Sanity check: show the USDA table's column names before they are used below.
# Expected: ['Symbol', 'Synonym Symbol', 'Scientific Name with Author', 'Common Name', 'Family']
print(list(usdaDF.columns))

cNaValues = set()        # common names already known to have no match
common_namesDict = {}    # cache: lower-cased common name -> scientific name


def getScientificNames(common_name):
    """Look up the scientific name for a common name.

    Lookup order: the two caches, then an exact match of the lower-cased name
    against ``usdaDF['Common Name']``, then ``to_scientific``.

    Relies on the module-level counter ``noneCount`` (defined alongside
    ``getCommonNames``) already existing; every miss increments it.

    Parameters
    ----------
    common_name : str
        Common name; compared case-insensitively (lower-cased first).

    Returns
    -------
    str
        The USDA ``'Scientific Name with Author'`` of the first matching row,
        else the first name in element ``[1]`` of the ``to_scientific`` entry,
        else the sentinel ``"NA"``. Misses return ``"NA"`` on the first call
        and on every cached repeat.
    """
    global noneCount

    common_name = common_name.lower()

    # check values that were already found
    if common_name in cNaValues:
        noneCount += 1
        return "NA"
    if common_name in common_namesDict:
        return common_namesDict[common_name]

    # check USDA for a match
    result = usdaDF.loc[usdaDF['Common Name'] == common_name, 'Scientific Name with Author'].values
    if len(result) > 0:
        common_namesDict[common_name] = result[0]
        return result[0]

    # check EcoNameTranslator for a match
    try:
        sci_name = to_scientific([common_name])[common_name][1]
        if len(sci_name) == 0:
            raise ValueError()
        best = sci_name[0]
    except Exception:
        # Best-effort: any lookup failure is recorded as "no match" so the
        # name is not queried again.
        noneCount += 1
        cNaValues.add(common_name)
        return "NA"

    common_namesDict[common_name] = best
    return best

['Symbol', 'Synonym Symbol', 'Scientific Name with Author', 'Common Name', 'Family']


In [56]:
# Two views of the USDA table, keyed by accepted symbol and by synonym symbol.
symbolDF = usdaDF.set_index('Symbol')
synonymDF = usdaDF.set_index('Synonym Symbol')

# code -> common name; on a clash the synonym-symbol entry overrides the
# accepted-symbol entry.
commonDict = {
    **symbolDF['Common Name'].to_dict(),
    **synonymDF['Common Name'].to_dict(),
}

# code -> scientific name (with author), merged the same way.
symbolDict = symbolDF['Scientific Name with Author'].to_dict()
synonymDict = synonymDF['Scientific Name with Author'].to_dict()
sciDict = {**symbolDict, **synonymDict}
# Hand-curated species code -> scientific name table.
# Codes with a trailing digit (e.g. ACSA1 / ACSA2) separate species whose
# four-letter abbreviation would otherwise collide.
# NOTE(review): the codes CHOB, EUAT and JUSC do not follow the
# genus-initials + species-initials pattern of the other entries -- confirm
# these mappings against the inventory's own code list.
validCodes = {
    "ABBA": "Abies balsamea",
    "JUVI": "Juniperus virginiana",
    "FRAM": "Fraxinus americana",
    "QUCO": "Quercus coccinea",
    "QUMA1": "Quercus macrocarpa",
    "POTR1": "Populus tremuloides",
    "ULWI": "Ulmus wilsoniana",
    "ACCA": "Acer campestre",
    "ACNI": "Acer nigrum",
    "ACPA": "Acer palmatum",
    "ACPL": "Acer platanoides",
    "ACRU": "Acer rubrum",
    "ACSA1": "Acer saccharinum",
    "ACSA2": "Acer saccharum",
    "AECA": "Aesculus californica",
    "AMLA": "Amelanchier laevis",
    "BENI": "Betula nigra",
    "CABE": "Carpinus betulus",
    "CAGL": "Carya glabra",
    "CAOV": "Carya ovata",
    "CEOC": "Celtis occidentalis",
    "CHLA": "Chamaecyparis lawsoniana",
    "CHOB": "Chionanthus virginicus",
    "CRMO": "Crataegus monogyna",
    "EUAT": "Eucommia ulmoides",
    "FASY": "Fagus sylvatica",
    "FREX": "Fraxinus excelsior",
    "FRNI": "Fraxinus nigra",
    "FROR": "Fraxinus ornus",
    "FRPE": "Fraxinus pennsylvanica",
    "GIBI": "Ginkgo biloba",
    "GLTR": "Gleditsia triacanthos",
    "GYDI": "Gymnocladus dioicus",
    "HAVI": "Hamamelis virginiana",
    "ILCO": "Ilex coriacea",  # spelling corrected from "coriaceae"
    "JUSC": "Juglans cinerea",
    "LIST": "Liquidambar styraciflua",
    "NYSY": "Nyssa sylvatica",
    "PONI": "Populus nigra",
    "POTR": "Populus tremuloides",
    "PRSE": "Prunus serotina",
    "PRSU": "Prunus subhirtella",
    "PRVI": "Prunus virginiana",
    "PYCA": "Pyrus calleryana",
    "QUMA": "Quercus macrocarpa",
    "QUPA": "Quercus palustris",
    "QURO": "Quercus robur",
    "ROPS": "Robinia pseudoacacia",
    "SEGI": "Sequoiadendron giganteum",
    "SOJA": "Sophora japonica",
    "SYRE": "Syringa reticulata",
    "TADI": "Taxodium distichum",
    "THOC": "Thuja occidentalis",
    "THPL": "Thuja plicata",
    "TIAM": "Tilia americana",
    "TICO": "Tilia cordata",
    "TIPL": "Tilia platyphyllos",
    "ULAM": "Ulmus americana",
    "ULPA": "Ulmus parvifolia",
    "ZESE": "Zelkova serrata",
}

# Overlay the hand-curated codes on the USDA-derived mapping; where a code
# exists in both, the validCodes entry replaces the USDA one.
sciDict.update(validCodes)

def getSpeciesName(code):
    """Translate a species code into a scientific name via ``sciDict``.

    The full code is tried first; if it is unknown, its first four characters
    are tried as a fallback.

    Parameters
    ----------
    code : str or missing value
        Species code. Non-string input (e.g. the NaN that ``Series.map``
        passes for an empty cell, or ``None``) is treated as "no code".

    Returns
    -------
    str or None
        The mapped scientific name, or ``None`` when neither the code nor its
        four-character prefix is known (or the code is missing).
    """
    # A missing cell arrives as float NaN: slicing it would raise TypeError,
    # and it must not be matched against a NaN key in the mapping.
    if not isinstance(code, str):
        return None
    if code in sciDict:
        return sciDict[code]
    return sciDict.get(code[:4], None)


In [57]:
# reverse comma ordering
def reorganizeComma(name):
    """Swap the text around the first comma: ``"maple, red"`` -> ``"red maple"``.

    Both halves are whitespace-stripped and re-joined with a single space.
    Values that are not strings, or strings without a comma, come back
    unchanged.
    """
    if not isinstance(name, str) or ',' not in name:
        return name
    before, _, after = name.partition(',')
    return after.strip() + ' ' + before.strip()

In [58]:
# Handle NA Values
def handleNA(value):
    """Normalise a name string, mapping placeholder values to ``'NA'``.

    Steps:
      1. Remove every ``.`` from the value.
      2. Return ``'NA'`` if the stripped value is one of the exact placeholder
         strings, or if the value contains any placeholder fragment (this
         includes any digit, ``-``, ``_``, ``*`` and ``#``).
      3. Otherwise drop everything from the earliest of ``'``, ``(``, ``:``,
         a backtick or a left curly quote onwards, and strip whitespace.
      4. Return ``'NA'`` if nothing meaningful is left.

    Parameters
    ----------
    value : str
        Name to normalise. Callers in this notebook strip and lower-case it
        first. Non-string input is treated as missing.

    Returns
    -------
    str
        The cleaned name, or ``'NA'``.
    """
    if not isinstance(value, str):
        return 'NA'

    value = value.replace('.', '')
    # questions --- common name as array, or string?
    naValues = {'', 'a', 'nan', 'other', 'n/a', ' ', 'not specified', 'na', 'none', 'p', 'f'}
    containsNaValues = {'not specified', 'unidentified', 'unsuitable', 'vacant', '*', '_', '-', 'proposed', 'unknown', '#', 'other ', 'no ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}

    if value.strip() in naValues or any(word in value for word in containsNaValues):
        return 'NA'

    # Keep only the text before the first of these characters. Cutting at the
    # earliest one (rather than by a fixed priority) guarantees none of them
    # survives in the result.
    cutPoints = [value.index(mark) for mark in ("'", "(", ":", "`", "‘") if mark in value]
    if cutPoints:
        value = value[:min(cutPoints)]

    # The cut usually leaves a trailing space; an empty or placeholder
    # remainder carries no name at all.
    value = value.strip()
    if value in naValues:
        return 'NA'
    return value

In [59]:





    


# ==================================================================================================================================
# Append cleaned name columns to every city inventory
#
# For each inventory CSV in `folder_path`:
#   1. read it as strings and reverse "last, first" comma ordering in every cell
#   2. fill in per-city special cases (Vancouver coordinates, Columbus species
#      codes, Bloomington common-name-only data)
#   3. derive normalised `unique_common_name` / `unique_sciname` columns
#   4. resolve `unique_sciname` into `genus_name` / `species_name`
#   5. write the result, with the four name columns last, to `output_folder`

folder_path = "G:/Shared drives/host_tree_cnn/cleaning_species_names/og_inventories_modified_labels"
output_folder = "G:/Shared drives/host_tree_cnn/cleaning_species_names/og_inventories_w_names_appended"

# Fail (or create the folder) up front rather than after the first file has
# already been fully processed.
os.makedirs(output_folder, exist_ok=True)

# Columns produced by this cell; they are moved to the end of every output file.
name_columns = ['unique_common_name', 'unique_sciname', 'genus_name', 'species_name']

# Iterate through the files
for i, filename in enumerate(os.listdir(folder_path)):
    # Skip the USDA table and anything that is not a CSV *before* logging, so
    # the progress output only lists files that are actually processed.
    if filename == 'usda_code.csv' or not filename.endswith('.csv'):
        continue
    print(filename)
    print("FILE #", i)
    file_path = os.path.join(folder_path, filename)

    # city in to dataframe
    cityDF = pd.read_csv(file_path, dtype=str)
    # NOTE(review): this rewrites every cell that contains a comma, not only
    # the name columns -- confirm that is intended for address-like fields.
    cityDF = cityDF.map(reorganizeComma)

    # Vancouver - special case: coordinates come from the original inventory.
    # NOTE(review): the assignment aligns on the row index, so it assumes both
    # files list the same trees in the same order -- confirm.
    if filename == 'vancouver_inventory.csv':
        cityDF["rounded_lat"] = vcCoords["rounded_lat"]
        cityDF["rounded_lng"] = vcCoords["rounded_lng"]

    # Columbus - special case: names are derived from the species code
    if filename == 'columbus_inventory.csv':
        cityDF['common_name'] = cityDF['SP_CODE'].map(commonDict)
        cityDF['species_name'] = cityDF['SP_CODE'].map(getSpeciesName)

    # Work on string copies of the name columns (missing values become the
    # text 'nan', which handleNA later maps to 'NA'; no rows are dropped).
    cityDF['unique_common_name'] = cityDF['common_name'].astype(str)

    # bloomington - special case: scientific names are looked up from common names
    if filename == 'bloomington_inventory.csv':
        cityDF['species_name'] = cityDF['unique_common_name'].apply(getScientificNames)
    cityDF['unique_sciname'] = cityDF['species_name'].astype(str)

    # handle space removal, lowercase conversion and NA placeholders
    for column in ('unique_sciname', 'unique_common_name'):
        cityDF[column] = cityDF[column].str.strip().str.lower().map(handleNA)

    # clean the scientific name: one (genus, species) pair per row.
    # Mapping the single column avoids building a pd.Series for every row,
    # which is what made the row-wise DataFrame.apply slow on large files.
    namePairs = cityDF['unique_sciname'].map(cleanScientific)
    cityDF['genus_name'] = namePairs.map(lambda pair: pair[0])
    cityDF['species_name'] = namePairs.map(lambda pair: pair[1])

    # move the derived name columns to the end, keeping all others in place
    otherColumns = [col for col in cityDF.columns if col not in name_columns]
    cityDF = cityDF[otherColumns + name_columns]

    # save to a csv
    cityDF.to_csv(os.path.join(output_folder, filename), index=False)



vancouver_inventory.csv
FILE # 0
washingtondc_inventory.csv
FILE # 1
bloomington_inventory.csv
FILE # 2
boulder_inventory.csv
FILE # 3
buffalo_inventory.csv
FILE # 4
calgary_inventory.csv
FILE # 5
cambridge_inventory.csv
FILE # 6
charlottesville_inventory.csv
FILE # 7
columbus_inventory.csv
FILE # 8
cupertino_inventory.csv
FILE # 9
denver_inventory.csv
FILE # 10
edmonton_inventory.csv
FILE # 11
kitchener_inventory.csv
FILE # 12
losangeles_inventory.csv
FILE # 13
montreal_inventory.csv
FILE # 14
newyorkcity_inventory.csv
FILE # 15
pittsburgh_inventory.csv
FILE # 16
sanfrancisco_inventory.csv
FILE # 17
sanjose_inventory.csv
FILE # 18
santamonica_inventory.csv
FILE # 19
['tree', 'site', 'deleted']
seattle_inventory.csv
FILE # 20
siouxfalls_inventory.csv
FILE # 21
surrey_inventory.csv
FILE # 22
