# Set up packages and functions
This notebook imports all of the packages needed and defines basic functions that are used by the model and for cleaning the RIBBIT scores after the model has been run on the data.

## Import packages and define settings 

In [2]:
# suppress every warning category for the rest of the session
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment / FutureWarning) raised by the functions below
import warnings
warnings.simplefilter('ignore')

# import third-party and standard-library packages
import numpy as np
from glob import glob
import pandas as pd
from matplotlib import pyplot as plt
import subprocess
from datetime import datetime
from plotnine import *
import os
import math 

# let pandas display up to 200 characters per cell before truncating
pd.options.display.max_colwidth = 200


# imports from opensoundscape (audio loading, spectrograms and the RIBBIT model)
from opensoundscape.audio import Audio
from opensoundscape.spectrogram import Spectrogram
from opensoundscape.ribbit import ribbit

# use a large default size (15 x 8) for matplotlib figures
plt.rcParams['figure.figsize']=[15,8]
# display numbers in dataframes with 2 decimal places (display only; stored values are unchanged)
pd.set_option('display.precision', 2)

## Data cleaning functions

In [3]:
def combine_csvs(folder_path, new_csv_name = "combined_data.csv", delete_files = False):
    """Combine every ribbit score csv file in a folder into one csv file.

    Useful when the data was split into several groups so the model could be
    run on them simultaneously, and the results now need to be recombined
    into one dataframe.

    Inputs:
    folder_path - folder with all of the ribbit score csv files to combine.
        The csv files must have the exact same columns; the first column of
        each file is read as the index.
        e.g. folder_path = "./ribbit_scores_flshe/" (the trailing slash is
        optional when the folder exists)
    new_csv_name - file name of the combined csv, written into folder_path
    delete_files - if True, delete the individual csv files once the combined
        file has been written, keeping only the combined file

    Results:
    writes the combined csv file and returns the concatenated dataframe
    (an empty dataframe if no csv files were found).

    NOTE: a combined file left in the folder by an earlier run matches
    "*.csv" and is therefore combined again together with the other files.
    """
    # An existing directory given without a trailing separator gets one added.
    # Anything else is used verbatim, so calls that already end in "/" are unchanged.
    if os.path.isdir(folder_path):
        prefix = os.path.join(folder_path, "")
    else:
        prefix = folder_path

    out_path = prefix + new_csv_name
    out_abs = os.path.abspath(out_path)

    frames = []
    combined_files = []
    for fp in sorted(glob(prefix + "*.csv")):  # sorted -> reproducible row order
        if os.path.isfile(fp):
            frames.append(pd.read_csv(fp, index_col = 0))  # read in ribbit scores
            combined_files.append(fp)
            print(fp + " concatenated")
        else:
            # e.g. a directory whose name ends in ".csv"
            print(fp + " not found")

    # concatenate once instead of growing the dataframe inside the loop
    df = pd.concat(frames) if frames else pd.DataFrame()
    df.to_csv(out_path)  # save dataframe to csv file

    # Delete only after the combined file is safely on disk, and never delete
    # the combined file itself (it is an input when left over from an earlier run).
    if delete_files:
        for fp in combined_files:
            if os.path.abspath(fp) != out_abs:
                os.remove(fp)
                print("file deleted")

    return df


def calc_accuracy(df, cutoffs, score_col = "score", label_col = "Lcapito"):
    """Calculate precision and recall of the ribbit scores at each cutoff.

    A file counts as a predicted presence when its score is strictly greater
    than the cutoff, and as a true presence when `label_col` equals 1.

    Inputs:
    df - dataframe with the ribbit scores and the manually verified labels
    cutoffs - list of cutoff values
    score_col - name of column with RIBBIT score
    label_col - name of column with the labels (1 = species present)

    Outputs:
    accuracy - dataframe indexed by cutoff with integer columns 'precision'
        and 'recall', in percent rounded down. A value is 0 when it is
        undefined (no file scores above the cutoff / no file is labelled 1).
    """
    cutoffs = list(cutoffs)
    is_present = df[label_col] == 1
    n_present = int(is_present.sum())

    precision = []
    recall = []
    for c in cutoffs:
        is_predicted = df[score_col] > c
        n_predicted = int(is_predicted.sum())
        n_true_pos = int((is_predicted & is_present).sum())

        # Integer arithmetic gives the exact rounded-down percentage
        # (float maths turns 29/100*100 into 28.99... which truncates to 28),
        # and the guards avoid dividing by zero.
        precision.append(100 * n_true_pos // n_predicted if n_predicted else 0)
        recall.append(100 * n_true_pos // n_present if n_present else 0)

    # Build the dataframe in one step; filling cells one at a time with
    # accuracy['precision'][c] = ... is chained assignment and is not reliable.
    accuracy = pd.DataFrame({'precision': precision, 'recall': recall},
                            index = cutoffs, dtype = int)

    return accuracy




def get_top_rs(df, n = 5, min_score = 0.0, t_unit = "",
               group_col = 'no_groups', groups = ["0"],
               score_col = "score", time_stamp_col = "time_stamp",
               save_csv = None):
    """Get the audio files with the top ribbit scores.

    input:
    df - data frame with ribbit scores (not modified)
    n - number of files per group (e.g. n = 5 gets top 5 ribbit scores per group)
    min_score - a file is only included if its ribbit score is greater than this
        (e.g. if you want all files above a ribbit score of 50, you could have
        min_score = 50 and n = 999999999999)
    t_unit - unit for how often we want the top scores
        (options: D, W, M, Y - day, week, month, year). "" means no time grouping.
        The time groups are taken from the 'date' column when df has one,
        otherwise from `time_stamp_col`; either is converted with pd.to_datetime.
    group_col - the name of the column with the labels grouping our files
        (e.g. "pond" for sandhills or "site" for ichaway wetlands).
        The default 'no_groups' means the data is not grouped.
    groups - list of the groupings to keep; rows whose `group_col` value is not
        in this list are dropped, so pass it whenever `group_col` is given
        (e.g. for sandhills the pond numbers [398, 399, 400, 401, 402, 403];
        for ichaway would be the wetlands' names)
    score_col - column name where ribbit score is stored
    time_stamp_col - column name where time stamp for ribbit score is stored
    save_csv - None (or False) if we do not want to save our output to a csv.
        Otherwise string of the file path where we want to save the csv file
        (e.g. "./ribbit_scores/top_ribbit_scores_per_year.csv")

    out:
    dataframe with top `n` files with ribbit score over `min_score` for each
    `groups` for every `t_unit`, sorted by group, then 'year' (if df has that
    column), then time group. Scores are rounded to 2 decimal places.
    """
    # only keep files with scores above the minimum allowed ribbit score;
    # .copy() so the helper columns added below go onto our own dataframe
    rs_df = df[df[score_col] > min_score].copy()

    if group_col == 'no_groups':  # if we don't want to group the data - just want overall top scores
        rs_df['no_groups'] = "0"

    # keep data that is in the specified "groups"
    rs_df = rs_df[rs_df[group_col].isin(groups)].copy()

    # group data into time groups (e.g. for every year, month, day etc.)
    if t_unit == "":
        rs_df['date_group'] = "0"  # no groups
    else:
        date_source = 'date' if 'date' in rs_df.columns else time_stamp_col
        # to_datetime is a no-op for datetime columns and parses string columns (e.g. read from csv)
        rs_df['date_group'] = pd.to_datetime(rs_df[date_source]).dt.to_period(t_unit)

    # get top scores by date_group and category group
    top_df = rs_df.sort_values(by = score_col, ascending = False) \
                  .groupby(by = [group_col, 'date_group']).head(n)

    # 'year' is only used as a sort key when the data actually has that column
    sort_keys = [group_col] + (['year'] if 'year' in top_df.columns else []) + ['date_group']
    out_df = top_df.sort_values(by = sort_keys)

    # organize the df
    out_df[score_col] = out_df[score_col].round(2)  # round() returns a new series, so assign it back

    first_cols = [group_col, 'date_group', score_col, time_stamp_col]  # want these columns first
    last_cols = [col for col in out_df.columns if col not in first_cols]  # keep all other columns at end
    out_df = out_df[first_cols + last_cols]

    # if we didn't group the dataframe, drop the extra column
    if 'no_groups' in out_df.columns:
        out_df = out_df.drop(labels = ['no_groups'], axis = 1)

    # if we didn't group the dataframe by time, drop extra column
    if t_unit == "":
        out_df = out_df.drop(labels = ['date_group'], axis = 1)

    # save to csv if desired (None, False and "" all mean "do not save")
    if save_csv:
        out_df.to_csv(save_csv)

    return out_df


