# Import packages and define settings 

In [None]:
# suppress warnings
import warnings

# NOTE: the filter name must be wrapped in plain ASCII quotes; typographic
# ("smart") quotes are not valid string delimiters and raise a SyntaxError.
warnings.simplefilter("ignore")

#import packages
import numpy as np
from glob import glob
import pandas as pd
from matplotlib import pyplot as plt
import subprocess
from datetime import datetime
from plotnine import *
import os

# Show up to 200 characters per cell so long values are not truncated in output
pd.set_option("display.max_colwidth", 200)


#local imports from opensoundscape
from opensoundscape.audio import Audio
from opensoundscape.spectrogram import Spectrogram
from opensoundscape.ribbit import ribbit

# Use a large default figure size and print DataFrame floats with two decimals
plt.rcParams.update({'figure.figsize': [15, 8]})
pd.options.display.precision = 2

# Data cleaning functions

In [None]:
def combine_csvs(folder_path: str, new_csv_name: str = "combined_data.csv", delete_files: bool = False) -> pd.DataFrame:
    """Combine every ribbit score csv file in a folder into a single csv file.

    Useful when the data was broken into several groups so the model could be
    run on those groups simultaneously, and the results now need to be
    recombined into one dataframe.

    Inputs:
        folder_path - folder with all of the ribbit score csv files that
            should be combined, with or without a trailing path separator
            (e.g. "./ribbit_scores_flshe/"). The csv files are expected to
            have the exact same columns; the first column of each file is
            read as the index.
        new_csv_name - file name of the combined csv written into
            `folder_path`.
        delete_files - if True, delete the individual csv files once the
            combined file has been written, keeping only the combined file.

    Results:
        Writes `new_csv_name` into `folder_path` and returns the concatenated
        ribbit score dataframe (an empty dataframe when no csv file is found).
    """
    out_path = os.path.join(folder_path, new_csv_name)
    # Sorted so the row order of the combined file is reproducible.
    candidates = sorted(glob(os.path.join(folder_path, "*.csv")))

    frames = []
    merged_files = []
    for fp in candidates:
        # A combined file left over from an earlier run also matches "*.csv";
        # merging it again would duplicate every row, so skip it.
        if os.path.abspath(fp) == os.path.abspath(out_path):
            continue

        if os.path.isfile(fp):
            frames.append(pd.read_csv(fp, index_col=0))  # read in ribbit scores
            merged_files.append(fp)
            print(fp + " concatenated")
        else:
            print(fp + " not found")

    # Concatenate once instead of growing a dataframe inside the loop;
    # pd.concat rejects an empty list, so fall back to an empty dataframe.
    df = pd.concat(frames) if frames else pd.DataFrame()
    df.to_csv(out_path)  # save dataframe to csv file

    # Delete the originals only after the combined file is safely on disk,
    # so a failed read or write never loses data.
    if delete_files:
        for fp in merged_files:
            os.remove(fp)
            print("file deleted")

    return df


def calc_accuracy(df: pd.DataFrame, cutoffs, score_col: str = "score", label_col: str = "Lcapito") -> pd.DataFrame:
    """Calculate precision and recall of the ribbit scores at each cutoff.

    A row counts as a predicted positive when its score is strictly greater
    than the cutoff, and as a true positive when its label equals 1.

    Inputs:
        df - dataframe with a ribbit score column and a label column
        cutoffs - iterable of cutoff values
        score_col - name of column with RIBBIT score
        label_col - name of column with the labels (1 = positive)

    Outputs:
        accuracy - dataframe indexed by cutoff with integer 'precision' and
            'recall' columns, in percent (truncated towards zero). A value is
            reported as 0 when it is undefined, i.e. when no score exceeds
            the cutoff (precision) or when there are no positive labels
            (recall).
    """
    # Materialize once so a generator is not exhausted before the loop.
    cutoffs = list(cutoffs)

    is_positive = df[label_col] == 1
    n_positive = int(is_positive.sum())

    precision = []
    recall = []
    for c in cutoffs:
        flagged = df[score_col] > c
        n_flagged = int(flagged.sum())
        n_hits = int((flagged & is_positive).sum())

        # Guard the denominators: an overly strict cutoff or an unlabeled
        # dataset would otherwise raise ZeroDivisionError.
        precision.append(n_hits / n_flagged * 100 if n_flagged else 0.0)
        recall.append(n_hits / n_positive * 100 if n_positive else 0.0)

    # Build the frame in one step rather than via chained assignment
    # (frame[col][row] = value), which does not write through under pandas
    # Copy-on-Write.
    accuracy = pd.DataFrame({"precision": precision, "recall": recall}, index=cutoffs)

    return accuracy.astype(int)




def get_top_rs(df, n = 5, min_score = 0.0, group_col = 'no_groups', groups = range(398, 403), t_unit = "Y", score_col = "score", time_stamp_col = "time_stamp", path_prefix_len = 61):
    """Get the audio files with the top ribbit scores per group and time period.

    input:
        df - data frame with ribbit scores. Besides `score_col`, `group_col`
            and `time_stamp_col` it must contain the columns 'Lcapito',
            'date' (datetime dtype, used to build the time periods), 'year'
            and 'file_path'. It is not modified.
        n - number of files per group (e.g. n = 5 gets top 5 ribbit scores
            per group)
        min_score - minimum ribbit score; only files with a score strictly
            above it are included (e.g. if you want all files above a ribbit
            score of 50, min_score = 50 and n = 999999999999)
        group_col - the name of the column with the labels grouping our files
            (e.g. "pond" for sandhills or "Site" for ichaway wetlands).
            If group_col is "" or 'no_groups', just takes overall top scores.
        groups - labels of the different categories we want to split the data
            into; rows whose `group_col` value is not listed are dropped.
            Ignored when no grouping is requested.
        t_unit - unit for how often we want the top scores
            (options: D, W, M, Y - day, week, month, year)
        score_col - name of the column with the ribbit score
        time_stamp_col - name of the time stamp column; only used to place
            that column near the front of the output
        path_prefix_len - number of leading characters removed from every
            'file_path' value to shorten the paths

    out:
        dataframe with the top `n` files with ribbit score over `min_score`
        for each of `groups` for every `t_unit`, ordered by group, year and
        time period, with the key columns placed first.
    """
    # Work on a copy: the helper columns added below must not leak into the
    # caller's frame, and assigning to a filtered slice is unreliable.
    rs_df = df[df[score_col] > min_score].copy()

    # No grouping requested: put every row into one artificial group. The
    # empty string is accepted as well as the default sentinel.
    if group_col in ('no_groups', ''):
        group_col = 'no_groups'
        rs_df[group_col] = "0"
        groups = ["0"]

    rs_df = rs_df[rs_df[group_col].isin(groups)].copy()
    rs_df['date_group'] = rs_df['date'].dt.to_period(t_unit)

    # Rank by score (stable sort keeps ties in input order), keep the best n
    # rows of every (group, period) pair, then order the result for reading.
    out_df = (
        rs_df.sort_values(by=score_col, ascending=False, kind="stable")
        .groupby(by=[group_col, 'date_group'])
        .head(n)
        .sort_values(by=[group_col, 'year', 'date_group'])
    )

    # Shorten the file paths by dropping their common leading part.
    out_df['file_path'] = out_df['file_path'].str.slice(path_prefix_len)

    # Key columns first, everything else after in its existing order.
    leading = ['Lcapito', group_col, 'date_group', score_col, time_stamp_col, 'file_path']
    trailing = [col for col in out_df.columns if col not in leading]
    out_df = out_df[leading + trailing]

    # "Unnamed: 0" only exists when a csv was read without index_col, so its
    # absence must not be an error; 'no_groups' is the artificial group column.
    out_df = out_df.drop(columns=["Unnamed: 0", "no_groups"], errors="ignore")

    return out_df


