In [97]:
# packages
import pandas as pd


# functions
def text_to_dataframe(text, col_name):
    """Build a one-column DataFrame from ``text``.

    Args:
        text: Data handed straight to the ``pd.DataFrame`` constructor.
        col_name: Label given to the single column.

    Returns:
        The newly constructed ``pd.DataFrame``.
    """
    column_labels = [col_name]
    frame = pd.DataFrame(data=text, columns=column_labels)
    return frame

def remove_extra_chars(df):
    """Remove every asterisk and newline character from the ``species`` column.

    The input frame is not modified; a new frame is returned so the function
    can be re-run or piped without altering the caller's data.

    Args:
        df: DataFrame with a string ``species`` column.

    Returns:
        A new DataFrame identical to ``df`` except that ``species`` has all
        ``*`` and newline characters stripped out.
    """
    cleaned = (
        df['species']
        # regex=False: '*' must be treated as a literal, not a regex quantifier.
        .str.replace('*', '', regex=False)
        .str.replace('\n', '', regex=False)
    )
    return df.assign(species=cleaned)

def remove_subspecies(df):
    """Drop every row whose ``species`` text contains a literal ``+``.

    Args:
        df: DataFrame with a string ``species`` column.

    Returns:
        An independent copy of the rows of ``df`` whose ``species`` value has
        no ``+`` character; the original index labels are kept.
    """
    # Literal match instead of the pattern '\+', which is an invalid escape
    # sequence in a normal (non-raw) string literal.
    has_plus = df['species'].str.contains('+', regex=False)
    return df[~has_plus].copy()

def split_text(df):
    """Split each ``species`` string on single spaces and drop the last token.

    The input frame is not modified; a new frame is returned.

    Args:
        df: DataFrame with a string ``species`` column.

    Returns:
        A new DataFrame whose ``species`` column holds, per row, the list of
        space-separated tokens with the final token removed.
    """
    # split(' ') (not split()) is kept on purpose: consecutive spaces still
    # produce empty-string tokens, exactly as before.
    tokens = df['species'].map(lambda line: line.split(' ')[:-1])
    return df.assign(species=tokens)

def classify_naming_convention(df, max_name_class=8):
    """Record each row's ``species`` length and keep only the short rows.

    The input frame is not modified. The result is an independent copy, so
    later stages can add columns to it without pandas flagging the write as
    an assignment on a slice.

    Args:
        df: DataFrame whose ``species`` column holds sized values (token
            lists in the notebook's pipeline).
        max_name_class: Exclusive upper bound on ``name_class``; rows with
            ``name_class >= max_name_class`` are dropped. Defaults to 8.

    Returns:
        A new DataFrame with an added integer ``name_class`` column
        (``len`` of each ``species`` value), restricted to rows where
        ``name_class < max_name_class``.
    """
    lengths = df['species'].map(len)
    classified = df.assign(name_class=lengths)
    keep = classified['name_class'] < max_name_class
    return classified[keep].copy()

def specify_position(name_class_value, adjustment):
    """Return ``name_class_value`` reduced by ``adjustment``.

    Args:
        name_class_value: Starting value.
        adjustment: Amount to subtract from it.

    Returns:
        ``name_class_value - adjustment``.
    """
    position = name_class_value - adjustment
    return position

def extract_fourletter_code(df):
    """Add a ``fourletter_code`` column taken from each row's token list.

    For every row the token at index ``name_class - 3`` of ``species`` is
    selected (the third-from-last token when ``name_class`` equals the
    list length). The input frame is not modified.

    Args:
        df: DataFrame with a ``species`` column of token lists and an
            integer ``name_class`` column.

    Returns:
        A new DataFrame with the extra ``fourletter_code`` column.

    Raises:
        IndexError: If ``name_class - 3`` falls outside a row's token list.
    """
    offset_from_end = 3
    # A comprehension is used instead of row-wise DataFrame.apply so that an
    # empty frame yields an empty column rather than failing on assignment.
    codes = [
        tokens[n_tokens - offset_from_end]
        for tokens, n_tokens in zip(df['species'], df['name_class'])
    ]
    return df.assign(fourletter_code=codes)

def extract_english_name(df):
    """Add an ``english_name`` column built from the leading tokens of each row.

    For every row the tokens of ``species`` before index ``name_class - 3``
    are joined with single spaces. The input frame is not modified.

    Args:
        df: DataFrame with a ``species`` column of string-token lists and an
            integer ``name_class`` column.

    Returns:
        A new DataFrame with the extra ``english_name`` column.
    """
    offset_from_end = 3
    # A comprehension is used instead of row-wise DataFrame.apply so that an
    # empty frame yields an empty column rather than failing on assignment.
    names = [
        ' '.join(tokens[: n_tokens - offset_from_end])
        for tokens, n_tokens in zip(df['species'], df['name_class'])
    ]
    return df.assign(english_name=names)

def extract_scientific_name(df):
    """Add a ``scientific_name`` column from the last two tokens of each row.

    The final two entries of each ``species`` token list are joined with a
    single space (fewer if the list is shorter). The input frame is not
    modified.

    Args:
        df: DataFrame with a ``species`` column of string-token lists.

    Returns:
        A new DataFrame with the extra ``scientific_name`` column.
    """
    scientific = df['species'].map(lambda tokens: ' '.join(tokens[-2:]))
    return df.assign(scientific_name=scientific)


# Read species_codes.txt: every line of the file becomes one row of `df`.
with open('C:/Users/Work/Documents/Repos/avian_monitoring/raw_data/species_codes.txt', 'r') as t:
    text = t.readlines()
df = text_to_dataframe(text, 'species')

# Apply the cleaning and extraction stages in order, then discard the two
# working columns so only the extracted fields remain.
bird_codes_df = (
    df
    .pipe(remove_extra_chars)
    .pipe(remove_subspecies)
    .pipe(split_text)
    .pipe(classify_naming_convention)
    .pipe(extract_fourletter_code)
    .pipe(extract_english_name)
    .pipe(extract_scientific_name)
    .drop(columns=['species', 'name_class'])
)


# Write the result to CSV without the index column.
bird_codes_df.to_csv(
    'C:/Users/Work/Documents/Repos/avian_monitoring/species_codes.csv',
    index=False,
)

In [None]:
# TODO: export bird_codes_df as JSON instead of CSV