In [2]:
import os
import pandas as pd
import io
import unicodedata
import traceback

root_dir = 'coral_and_sponges_all/data/pub/data/paleo/coral'

In [4]:
def create_coral_file_df(filename):
    # print("reading")
    try:
        # Using a weird encoding to solve file read issue; see https://stackoverflow.com/questions/46000191/utf-8-codec-cant-decode-byte-0x92-in-position-18-invalid-start-byte
        coral_file = open(filename,'r', encoding="ascii", errors="ignore")
        file_in = coral_file.readlines()
        if file_in[0][0] != "#":
            print(f"'{filename}' does not begin with '#'; COMPLETELY IGNORING IT")
            return None

        for i in range(len(file_in)):
            line = file_in[i]
            if "Location:" in line:
                location = line.split(": ")[1].strip()

            if "Northernmost_Latitude:" in line:
                northernmost_lat = line.split(": ")[1].strip()
            if "Northmost Latitude" in line: # other variation of north lat
                northernmost_lat = line.split(": ")[1].strip().split(' ')[0]

            if "Southernmost_Latitude:" in line:
                southernmost_lat = line.split(": ")[1].strip()
            if "Southmost Latitude:" in line: # other variation of south lat
                southernmost_lat = line.split(": ")[1].strip().split(' ')[0]

            if "Easternmost_Longitude:" in line:
                easternmost_lat = line.split(": ")[1].strip()
            if "Eastmost Longitude:" in line:
                easternmost_lat = line.split(": ")[1].strip().split(' ')[0]

            if "Westernmost_Longitude:" in line:
                westernmost_lat = line.split(": ")[1].strip()
            if "Westmost Longitude:" in line:
                westernmost_lat = line.split(": ")[1].strip().split(' ')[0]

            if "Elevation:" in line:
                elevation = line.split(": ")[1].split(' ')[0].strip()
            if "Earliest_Year:" in line:
                earliest_year = line.split(": ")[1].strip()
            if "Most_Recent_Year:" in line:
                most_recent_year = line.split(": ")[1].strip()
            if "Species_Name:" in line:
                species = line.split(": ")[1].strip('\n').strip()
            if not line.startswith('#'):
                if not line[0].isprintable():
                    continue # Don't break on weird, non-printable characters
                elif len(line.strip()) == 0:
                    continue
                else:
                    age_line = i
                    #print('exited at line: ' + str(i))
                    extracted = ' '.join(file_in[age_line:])
                    break #exit the for loop, we (theorerically) have obtained the data!

        lat = (float(northernmost_lat) + float(southernmost_lat)) / 2
        lon = (float(easternmost_lat) + float(westernmost_lat)) / 2

        data = io.StringIO(extracted)
        df = pd.read_csv(data, sep="\t", error_bad_lines=False)

        df.columns = (map(lambda x: x.lower(), df.columns))
        # print(df.columns)
        # print(type(df.columns))
        # if len(list(df.columns)) == 1:
        #     # print('in if')
        #     # print(list(df.columns))
        #     # print(list(df.columns)[0].split(' '))
        #     df.columns = list(df.columns)[0].split(' ')
        #     for i in range(len(df.columns)):
        #         if df.columns[i] == '':
        #             df.columns.pop(i)
        print(df.columns)

        # find the correct 'age' column
        if 'age' not in df.columns:
            if 'age_ad' in df.columns:
                df['age'] = df['age_ad']
            elif 'age_calad' in df.columns:
                df['age'] = df['age_calad']
            elif 'age_int' in df.columns:
                df['age'] = df['age_int']
            elif 'age_ce' in df.columns:
                df['age'] = df['age_ce']
            elif 'age_ce_d180' in df.columns:
                df['age'] = df['age_ce_d180']
            elif 'time' in df.columns:
                df['age'] = df['time']
            elif 'date' in df.columns:
                df['age'] = df['date']
            elif 'year' in df.columns:
                df['age'] = df['year']
            elif 'year ' in df.columns:
                df['age'] = df['year ']
            elif 'year_ce' in df.columns:
                df['age'] = df['year_ce']
            else:
                print("No 'age' column header detected.")

        #print(df['age'])
        print(df.columns)

        #find the correct d18O column
        if 'd18o' not in df.columns:
            count = 0
            for col in df.columns:
                # we are assuming that the first col with d18O is the one we want
                if 'd18o' in col:
                    if 'recon' in col:
                        continue
                    df['d18o'] = df[col]
                    break
        
        #print(df['d18o'])

        df = df[['age', 'd18o']]
        df = df[df['d18o'] != 'NAN']

        # average the d18O values across the years
        df['year']  = df['age'].apply(lambda x: str(x).strip()[0:4])
        df = df.groupby(by="year")["d18o"].mean().reset_index('year')

        df['lat'] = lat
        df['lon'] = lon
        df['elevation'] = elevation
        
        try:
            species
            df['species'] = species
        except NameError:
            df['species'] = 'Other'

        return(df)

        # except Exception as e:
        #     print("exception2")
        #     print("ERROR : "+str(e))
        #     # print("Error on line {}".format(sys.exc_info()[-1].tb_lineno))
            # print('hi')
    except Exception as e:
        print(filename)
        print("ERROR : " +str(e))
        traceback.print_exception(type(e), e, e.__traceback__)

In [10]:
filename = f'{root_dir}/west_pacific/Murty.MakassarStrait.2017-1.txt'
#'/atlantic/kuhnert2005-fixed.txt'
create_coral_file_df(filename)

Index(['age            d18oporites', 'unnamed: 1'], dtype='object')
No 'age' column header detected.
Index(['age            d18oporites', 'unnamed: 1'], dtype='object')
coral_and_sponges_all/data/pub/data/paleo/coral/west_pacific/Murty.MakassarStrait.2017-1.txt
ERROR : "['age'] not in index"




  create_coral_file_df(filename)
Traceback (most recent call last):
  File "/var/folders/ws/d7dc4bz53q3_26hs2vbmg8600000gn/T/ipykernel_49260/3759158150.py", line 115, in create_coral_file_df
    df = df[['age', 'd18o']]
  File "/Users/hannahmandell/Desktop/Pai/p-climate/.venv/lib/python3.8/site-packages/pandas/core/frame.py", line 3464, in __getitem__
    indexer = self.loc._get_listlike_indexer(key, axis=1)[1]
  File "/Users/hannahmandell/Desktop/Pai/p-climate/.venv/lib/python3.8/site-packages/pandas/core/indexing.py", line 1314, in _get_listlike_indexer
    self._validate_read_indexer(keyarr, indexer, axis)
  File "/Users/hannahmandell/Desktop/Pai/p-climate/.venv/lib/python3.8/site-packages/pandas/core/indexing.py", line 1377, in _validate_read_indexer
    raise KeyError(f"{not_found} not in index")
KeyError: "['age'] not in index"
