# Setup

Todo: 
- [ ] Check whether gopher frog files are clustered - do calls occur in many consecutive files? On many consecutive days? 
- [ ] Clean up EDA
- [ ] Clean parameter selection for use in methods 
- [ ] Still losing some files when merging - why??

#*# - indicates locations where you may want to edit (e.g. file paths, parameter values, etc.)

## Import packages and define settings 

In [3]:
# suppress warnings
# NOTE: 'ignore' silences every warning category for the rest of the session,
# which includes pandas warnings such as SettingWithCopyWarning.
import warnings
warnings.simplefilter('ignore')

#import packages
import numpy as np
from glob import glob
import pandas as pd
from matplotlib import pyplot as plt
import subprocess
from datetime import datetime
from plotnine import *
import os

# allow up to 200 characters per column when dataframes are displayed
pd.options.display.max_colwidth = 200


#local imports from opensoundscape
from opensoundscape.audio import Audio
from opensoundscape.spectrogram import Spectrogram
from opensoundscape.ribbit import ribbit

# create big visuals
# default matplotlib figure size: 15 wide by 8 high
plt.rcParams['figure.figsize']=[15,8]
# display floats with 2 decimal places
pd.set_option('display.precision', 2)

## Useful functions 

In [19]:
def combine_csvs(folder_path, new_csv_name = "combined_data.csv", delete_files = False):
    """Combine every csv file in a folder into one csv file.

    Useful when the data was broken into several groups so the model could run
    on those groups simultaneously, and the ribbit scores then need to be
    recombined into one dataframe.

    Inputs:
    folder_path - folder with all of the csv files that should be combined.
        csv files must have the exact same columns; the first column of each
        file is read as the index.
        e.g. folder_path = "./ribbit_scores_flshe/" (the trailing slash is optional)
    new_csv_name - file name of the combined csv file, written inside folder_path
    delete_files - if True, delete the old csv files and only keep the combined file

    Results:
    Writes a new csv file containing the rows of all of the csv files.
    If delete_files is True the individual files are deleted, but only after
    the combined file has been written, and the combined file itself is never
    deleted.
    Returns the concatenated dataframe (empty if no csv files were found).

    Note: a combined file left in the folder by an earlier run also matches
    "*.csv", so it is read as an input like any other csv file.
    """
    # Accept the folder with or without a trailing separator. Anything that is
    # not an existing directory is used as a plain string prefix, as before.
    if os.path.isdir(folder_path):
        prefix = os.path.join(folder_path, "")
    else:
        prefix = folder_path
    out_path = prefix + new_csv_name

    frames = []      # one dataframe per csv file that was read
    read_files = []  # paths that were read, used for the optional clean-up
    for fp in glob(prefix + "*.csv"):
        if os.path.isfile(fp):
            frames.append(pd.read_csv(fp, index_col = 0))
            read_files.append(fp)
            print(fp + " concatenated")
        else:
            print(fp + " not found")

    # concatenate once instead of growing the dataframe inside the loop
    df = pd.concat(frames) if frames else pd.DataFrame()
    df.to_csv(out_path)

    # Delete the originals only after the combined file is safely on disk, and
    # never delete the combined file itself if it was one of the inputs.
    if delete_files:
        out_abs = os.path.abspath(out_path)
        for fp in read_files:
            if os.path.abspath(fp) != out_abs:
                os.remove(fp)
                print("file deleted")

    return df


def calc_accuracy(df, cutoffs, score_col = "score", label_col = "Lcapito"):
    """Calculate precision and recall of the ribbit scores at several cutoffs.

    A file counts as a predicted positive when its score is strictly greater
    than the cutoff, and as a true positive when label_col equals 1.

    Inputs:
    df - dataframe with one row per file, holding the ribbit score and the
        manually verified label
    cutoffs - list of cutoff values
    score_col - name of column with RIBBIT score
    label_col - name of column with the verified label (1 = species present)

    Outputs:
    accuracy - dataframe indexed by cutoff with integer columns 'precision'
        and 'recall', in percent, truncated to whole numbers.
        Precision is reported as 0 when no file scores above the cutoff, and
        recall as 0 when df has no positive files (both are otherwise undefined).
    """
    cutoffs = list(cutoffs)
    is_positive = df[label_col] == 1
    n_positive = int(is_positive.sum())

    precision = []
    recall = []
    for c in cutoffs:
        above = df[score_col] > c
        n_above = int(above.sum())
        n_hits = int((above & is_positive).sum())
        # guard the denominators: nothing above the cutoff / no positives at all
        precision.append(int(n_hits / n_above * 100) if n_above else 0)
        recall.append(int(n_hits / n_positive * 100) if n_positive else 0)

    # build the dataframe in one go instead of filling cells through chained
    # indexing, which does not write through under pandas Copy-on-Write
    accuracy = pd.DataFrame({'precision': precision, 'recall': recall}, index = cutoffs)
    accuracy = accuracy.astype(int)

    return accuracy




def get_top_rs(df, n = 5, min_score = 0.0, group_col = 'no_groups', groups = range(398, 403), t_unit = "Y", score_col = "score", time_stamp_col = "time_stamp", fp_prefix_len = 61):
    """Get the audio files with the top ribbit scores per group and time period.

    input:
    df - data frame with ribbit scores. Must contain score_col, a datetime
        'date' column and a 'file_path' column
    n - number of files per group (e.g. n = 5 gets top 5 ribbit scores per group)
    min_score - minimum ribbit score needed for file to be included (strictly greater than)
        (e.g. if you want all files above a ribbit score of 50, min_score = 50 and n = 999999999999)
    group_col - the name of the column with the labels grouping our files
        (e.g. "pond" for sandhills or "Site" for ichaway wetlands)
        if group_col = 'no_groups' (the default), just takes overall top scores
    groups - list of labels of the different categories we want to split the data into
        (e.g. range(398, 403) for sandhills ponds). Ignored when group_col = 'no_groups'
    t_unit - unit for how often we want the top scores (options: D, W, M, Y - day, week, month, year)
    score_col - name of column with RIBBIT score
    time_stamp_col - name of a column to show right after the score, if it exists
    fp_prefix_len - number of leading characters removed from every file path
        to shorten it for display (61 by default, as before)

    out:
    dataframe with top `n` files with ribbit score over `min_score` for each
    `groups` for every `t_unit`. The input dataframe is not modified.
    """
    # .copy() so the helper columns added below never touch the caller's dataframe
    rs_df = df[df[score_col] > min_score].copy()

    if group_col == 'no_groups': # no grouping wanted: put every file in one dummy group
        rs_df['no_groups'] = "0"
        groups = ["0"]

    rs_df = rs_df[rs_df[group_col].isin(groups)].copy()
    rs_df['date_group'] = rs_df['date'].dt.to_period(t_unit)

    # 'year' is used as a sort key only when the dataframe has that column
    if 'year' in rs_df.columns:
        order_cols = [group_col, 'year', 'date_group']
    else:
        order_cols = [group_col, 'date_group']

    best_first = rs_df.sort_values(by = score_col, ascending = False)
    out_df = best_first.groupby(by = [group_col, 'date_group']).head(n).sort_values(by = order_cols)

    # shorten file paths by removing the first fp_prefix_len characters
    out_df['file_path'] = out_df['file_path'].str.slice(fp_prefix_len)

    # most useful columns first; skip any that are absent and avoid listing a
    # column twice (e.g. when group_col is 'Lcapito')
    wanted = ['Lcapito', group_col, 'date_group', score_col, time_stamp_col, 'file_path']
    first_cols = [col for col in dict.fromkeys(wanted) if col in out_df.columns]
    last_cols = [col for col in out_df.columns if (col not in first_cols)]
    out_df = out_df[first_cols + last_cols]

    # drop the leftover csv index column and the dummy group column, when present
    out_df = out_df.drop(columns = ["Unnamed: 0", "no_groups"], errors = "ignore")

    return out_df






# FLSHE data 


## Data cleaning

### Combine multiple ribbit score csv files 

In [448]:
# Only need to run if you need to combine ribbit scores from multiple csv files  
# Useful if you broke up a model run into sections to run it faster 
# WARNING: if delete_files = True this will delete individual files after combining them
# keep next 2 lines commented out unless running this chunk to avoid deleting files unintentionally 

# folder_path = "./ribbit_scores_flshe_20221206/" #*# path to folder containing the csv files you want to combine 
# rs_flshe = combine_csvs(folder_path, new_csv_name = "ribbit_scores_combined.csv", delete_files = True)


### Set up file and folder paths for data import and cleaning 

In [None]:
# file path to csv file with ribbit scores 
ribbit_scores_fp = "./ribbit_scores_flshe_20221206/ribbit_scores_combined.csv" #*#

# file path to csv file with manually verified data 
verified_data_fp = "../manually_verified_data/FLSHE_pond400.csv" #*#

# path to folder containing audio files 
# (keep the trailing "/": the full file paths below are built by string concatenation)
audio_files_fp = '/Volumes/Expansion/Frog Call Project/Calling Data/FLSHE/' #*#
# Note: if the folders within this folder are structured differently, you may need to edit the full file paths in the 
#       data cleaning section below (indicated with #*#)



### Import and clean csv file with ribbit scores 

In [409]:
# Load the ribbit scores from ribbit_scores_fp; the first csv column (the file path) becomes the index
rs_flshe = pd.read_csv(ribbit_scores_fp, index_col=0)

# The 15 characters in front of the last 4 characters of each file path hold the
# recording time stamp (YYYYMMDD_HHMMSS); values that do not parse become NaT
rs_flshe['date'] = pd.to_datetime(
    rs_flshe.index.str.slice(-19, -4),
    format='%Y%m%d_%H%M%S',
    errors='coerce',
)


### Import and clean manually verified data 

In [443]:
# import manually verified data 
verified_flshe = pd.read_csv(verified_data_fp)[["File name", "Pond #", "L. capito", "gopher call time", "Date"]] # keeps only listed columns 

# rename columns for convenience
verified_flshe = verified_flshe.rename(columns = {"File name":"file_name", "Pond #":"logger", "L. capito":"Lcapito", "gopher call time":"call_time", "Date":"date"})

# make Lcapito categorical
verified_flshe.Lcapito = verified_flshe.Lcapito.astype("category")

# create year column based on the first 4 characters of the date string
verified_flshe['year'] = verified_flshe.date.str[0:4]
# astype returns a new dataframe, so the result has to be assigned back
verified_flshe = verified_flshe.astype({"year":"int"})

# add .wav to file name if it is not included with the file name 
# (one vectorised .loc update instead of a row-by-row chained assignment)
missing_ext = ~verified_flshe["file_name"].str.endswith(".wav")
verified_flshe.loc[missing_ext, "file_name"] = verified_flshe.loc[missing_ext, "file_name"] + ".wav"

#*# create full file path from file names, year, and logger numbers #*# 
# layout: <audio_files_fp>FLSHE_<year>/FLSHE_<year>_<logger>/<file_name>
verified_flshe['file_path'] = audio_files_fp + 'FLSHE_' + \
    verified_flshe['year'].astype('string') + \
    '/FLSHE_' + verified_flshe['year'].astype('string') + '_' + verified_flshe['logger'].astype('string') + '/' + \
    verified_flshe['file_name'] #*#

# set file path as index 
verified_flshe = verified_flshe.set_index('file_path')


### Merge ribbit scores to manually verified data 

In [None]:
# The merge below is an inner join on file path, so verified files whose path
# has no exact match in the ribbit scores are dropped. List them first so the
# loss is visible instead of silent.
unmatched_flshe = verified_flshe.index.difference(rs_flshe.index)
if len(unmatched_flshe) > 0:
    print(str(len(unmatched_flshe)) + " verified files have no matching ribbit score and will be dropped:")
    for fp in unmatched_flshe:
        print("  " + str(fp))

# merge with ribbit scores data file 
# (year, date and logger are dropped from the verified data first; 'date' then comes from rs_flshe)
verified_flshe = verified_flshe.drop(columns = ["year", "date", "logger"]).merge(rs_flshe, left_index = True, right_index = True)
verified_flshe = verified_flshe.dropna(subset=['Lcapito']) # drop any rows with "NaN" for Lcapito - if left empty, etc. 


# Ichaway data 

## Data cleaning

### Combine multiple ribbit score csv files 

In [5]:
# Only need to run if you need to combine ribbit scores from multiple csv files  
# Useful if you broke up a model run into sections to run it faster 
# WARNING: if delete_files = True this will delete individual files after combining them
# keep next 2 lines commented out unless running this chunk to avoid deleting files unintentionally 

# folder_path = "./ribbit_scores_ich_20221206/" #*# path to folder containing the csv files you want to combine 
# rs_ich = combine_csvs(folder_path, new_csv_name = "ribbit_scores_combined.csv", delete_files = True)


### Combine multiple csv files with manually verified data 

In [None]:
# Only run this once, and only if you need to combine manually verified data files 
# Data must have the following columns: "Site", "Logger", "Sample Date", "Species", "NAAMP", "File ID"
# folder_path = "../manually_verified_data/ichaway_verified_data/"
# raw_ich = combine_csvs(folder_path, new_csv_name = "ichaway_verified_data.csv")

### Set up file and folder paths for data import and cleaning 

In [114]:
# file path to csv file with ribbit scores 
ribbit_scores_fp = "./ribbit_scores_ich_20221206/ribbit_scores_combined.csv" #*#

# file path to csv file with manually verified data 
verified_data_fp = "../manually_verified_data/ichaway_verified_data/ichaway_verified_data.csv" #*#

# path to folder containing audio files 
# (keep the trailing "/": the full file paths below are built by string concatenation)
audio_files_fp = '/Volumes/Expansion/Frog Call Project/Calling Data/ichaway/' #*#
# Note: if the folders within this folder are structured differently, you may need to edit the full file paths in the 
#       data cleaning section below (indicated with #*#)


### Import and clean csv files with ribbit scores 

In [115]:
# Load the ribbit scores from ribbit_scores_fp; the first csv column (the file path) becomes the index
rs_ich = pd.read_csv(ribbit_scores_fp, index_col=0)

# The 15 characters in front of the last 4 characters of each file path hold the
# recording time stamp (YYYYMMDD_HHMMSS); values that do not parse become NaT
rs_ich['date'] = pd.to_datetime(
    rs_ich.index.str.slice(-19, -4),
    format='%Y%m%d_%H%M%S',
    errors='coerce',
)


### Import and clean manually verified files data 

In [123]:
# import manually verified data
raw_ich = pd.read_csv(verified_data_fp)[["Site", "Logger", "Sample Date", "Species", "NAAMP", "File ID", "Start Date"]]

# rename columns for convenience
raw_ich = raw_ich.rename(columns = {"Site":"site", "Logger":"logger", "Sample Date":"date", "Species":"species", "File ID":"file_name", "Start Date":"folder_date"})

# create year column based on the last 4 characters of the date string
raw_ich['year'] = raw_ich.date.astype(str).str[-4:]
# astype returns a new dataframe, so the result has to be assigned back
raw_ich = raw_ich.astype({"year":"int"})


# create full file path from file names and logger numbers 
# layout: <audio_files_fp>ichaway_<year>/<logger>a/<folder_date>/<file_name>.wav
# NOTE: the '%-m' / '%-d' (no zero padding) strftime codes are platform specific and are not supported on Windows
raw_ich['folder_date'] = pd.to_datetime(raw_ich['folder_date'], format='%m/%d/%Y').dt.strftime('%-m-%-d-%y')
raw_ich['file_path'] = audio_files_fp + 'ichaway_' + raw_ich['year'].astype('string') + '/' + raw_ich['logger'].astype('string') + 'a/' + raw_ich['folder_date'] + '/' + raw_ich['file_name'] + '.wav' #*#
# set file path as index 
raw_ich = raw_ich.set_index('file_path')

# identify which rows are Lcapito observations 
# both categories are declared explicitly so that True can always be assigned
# later, even if the data contains no 'LICAP' rows
raw_ich['Lcapito'] = raw_ich['species'] == 'LICAP'
raw_ich['Lcapito'] = raw_ich['Lcapito'].astype(pd.CategoricalDtype([False, True]))

# create "verified_ich" dataframe with one row per file with a column (Lcapito) that is True if the file has a Lcapito and False if it does not
# (sorting descending puts a file's True row first, so head(1) keeps it; .copy() makes the result safe to edit below)
verified_ich = raw_ich.sort_values(["file_path", "Lcapito"], ascending = False).groupby('file_path').head(1).copy()

# these files were labeled incorrectly in the ichaway data - there are gopher frogs in them 
# logger 5a: 
#20150205_194700
#20150205_204700
#20150205_214700

# fix these mistakes
temp = audio_files_fp + 'ichaway_2015/5a/2-2-15/20150205_'
incorrect_files =  [temp + '194700.wav', temp + '204700.wav', temp + '214700.wav']
verified_ich.loc[verified_ich.index.isin(incorrect_files),'Lcapito'] = True




### Merge ribbit scores and verified files  

In [70]:
# merge option 1 - use this if option 2 is causing issues 

# merge with ribbit scores based on file path
# this drops some files where the file path minutes don't match between the rs_ich and verified_ich
#verified_ich = verified_ich.drop(columns = ["year", "date", "logger"]).merge(rs_ich, left_index = True, right_index = True)
#verified_ich = verified_ich.dropna(subset=['Lcapito']) # drop any rows with "NaN" for Lcapito - if data was entered incorrectly, empty, etc. 


In [124]:
# merge option 2

# merge with ribbit scores based on hour of file path (ignore minutes - these sometimes don't match for some reason)
# still drops some files but not as many 
# warning: potential to match incorrect files (e.g. if one file is labeled 10:01 and another 10:58)
# drop the last 8 characters of each file path (minutes, seconds and ".wav") so files are matched on the hour
rs_ich["fp_shortened"] = rs_ich.index.str[:-8]
verified_ich["fp_shortened"] = verified_ich.index.str[:-8]

# The merge below is an inner join, so verified files without any ribbit score
# file in the same hour are dropped. List them first so the loss is visible.
unmatched_ich = verified_ich.index[~verified_ich["fp_shortened"].isin(rs_ich["fp_shortened"])]
if len(unmatched_ich) > 0:
    print(str(len(unmatched_ich)) + " verified files have no matching ribbit score and will be dropped:")
    for fp in unmatched_ich:
        print("  " + str(fp))

# Several ribbit score files in the same hour all match the same verified file,
# which duplicates that verified row in the merged data. Report how many are affected.
ambiguous_ich = rs_ich["fp_shortened"].duplicated(keep = False) & rs_ich["fp_shortened"].isin(verified_ich["fp_shortened"])
if ambiguous_ich.any():
    print(str(int(ambiguous_ich.sum())) + " ribbit score files share an hour with another file and match the same verified file")

# NOTE(review): merging on a column ignores both file-path indexes, so the merged
# dataframe no longer carries the file path - confirm that this is intended.
verified_ich = verified_ich.drop(columns = ["year", "date", "logger"]).merge(rs_ich, left_on = "fp_shortened", right_on = "fp_shortened")
verified_ich = verified_ich.dropna(subset=['Lcapito']).drop(columns = ["fp_shortened"]) # drop any rows with "NaN" for Lcapito - if data was entered incorrectly, empty, etc. 

### TODO:  still losing some files after merging - why?