In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
import glob
import os
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev

# Install a blanket 'ignore' filter: no category, message, or module
# restriction is given, so every warning raised after this point is hidden.
warnings.simplefilter('ignore')

In [3]:
#### Snake Data
# Root directory holding the per-site snake iButton exports.  The default is
# the original author's local checkout; set THERMA_SIM_SNAKE_DATA_DIR to point
# the notebook at a copy of the data somewhere else.
SNAKE_DATA_ROOT = os.environ.get(
    'THERMA_SIM_SNAKE_DATA_DIR',
    '/home/mremington/Documents/therma_sim/therma_sim/Empirical_Data/Snake_iButton_Data/Snake_iButton_Data/',
)
# Later cells glob with `folder + "*.csv"` and read the site name from the
# second-to-last '/'-separated component, so every folder must end in exactly
# one trailing '/'.
SNAKE_DATA_ROOT = SNAKE_DATA_ROOT.rstrip('/') + '/'

sn_data_folders = [
    SNAKE_DATA_ROOT + site_subdir
    for site_subdir in ('Canada/', 'Texas/Marathon/', 'Nebraska/')
]

def get_study_site(fp):
    """
    Derive the study-site label from a '/'-separated path.

    The label is the second-to-last '/'-delimited component of ``fp``; for a
    folder path that ends in '/', that is the folder's own name.  The
    component 'Marathon' is reported as 'Texas'.

    Parameters:
    - fp: string (path containing at least one '/'; otherwise IndexError)

    Returns:
    - string (study site name)
    """
    site = fp.rsplit('/', 2)[-2]
    return 'Texas' if site == 'Marathon' else str(site)

def get_otm(fp):
    """
    Extract the logger label from a CSV path.

    The label is the text before the first '_' in the file name (the part of
    ``fp`` after the last '/'), with these adjustments:
    - 'AirtempNE22' and 'Air' both become 'Air Temperature'
    - 'OTM' has the second '_'-separated field appended (e.g. 'OTM' + '3')

    Parameters:
    - fp: string (path to a CSV file)

    Returns:
    - string (logger label)
    """
    fields = fp.split('/')[-1].split('_')
    label = fields[0]
    if label in ('AirtempNE22', 'Air'):
        return 'Air Temperature'
    if label == 'OTM':
        return label + fields[1]
    return label

def get_csv_file_name(fp):
    """Return the part of ``fp`` after its last '/' (all of ``fp`` if it has none)."""
    _, _, file_name = fp.rpartition('/')
    return file_name

def clean_snake_name(name):
    """
    Reduce a CSV path to a bare snake name.

    The label produced by ``get_otm`` is cut at its first '(' so that a
    parenthesised suffix is dropped; a label without '(' is returned whole.

    Parameters:
    - name: string (path to a CSV file)

    Returns:
    - string (label with any '(...' suffix removed)
    """
    label = get_otm(name)
    base, _, _ = label.partition('(')
    return base

def study_site_key(site_name):
    """
    Translate a study-site name into its integer dummy code.

    Codes: 'Canada' = 1, 'Nebraska' = 2, 'Texas' = 3.

    Parameters:
    - site_name: string (Study_Site)

    Returns:
    - int dummy code, or np.nan when the name is not one of the three sites
    """
    codes = dict(Canada=1, Nebraska=2, Texas=3)
    try:
        return codes[site_name]
    except KeyError:
        # Unrecognised site: signal "missing" rather than raising.
        return np.nan



In [4]:
# Build one long table of temperature readings: one frame per logger CSV,
# tagged with its study site, snake name, and source file, then concatenated
# once at the end.
col_names = ['Date', 'Unit', 'Temperature']
snake_frames = []
for folder in sn_data_folders:
    # The site depends only on the folder, so compute it once per folder.
    study_site = get_study_site(folder)
    for csv_file in glob.glob(folder + "*.csv"):
        # The first 20 lines of each file are skipped as non-data
        # (presumably the iButton export preamble -- confirm against a raw file).
        temp_df = pd.read_csv(csv_file, names=col_names, skiprows=20, header=None)
        temp_df['Date'] = pd.to_datetime(temp_df['Date'])
        temp_df['year'] = temp_df['Date'].dt.year
        temp_df['month'] = temp_df['Date'].dt.month
        temp_df['day'] = temp_df['Date'].dt.day
        temp_df['hour'] = temp_df['Date'].dt.hour
        temp_df['minute'] = temp_df['Date'].dt.minute
        # Scalars broadcast to every row; no per-row list is needed.
        temp_df['Study_Site'] = study_site
        temp_df['Snake_Name'] = clean_snake_name(csv_file)
        temp_df['file_name'] = get_csv_file_name(csv_file)
        snake_frames.append(temp_df)

# A single concat avoids re-copying all accumulated rows for every file.
# Row labels are left as read (each file restarts at 0), matching what the
# incremental concat produced.  With no CSVs found the result is an empty
# frame, as before, instead of pd.concat raising on an empty list.
# `concatenated_df` is kept as an alias in case other cells reference it.
snake_main = concatenated_df = pd.concat(snake_frames) if snake_frames else pd.DataFrame()

In [5]:
# Add the numeric site code next to each reading, then preview the table.
snake_main['Study_Site_Code'] = snake_main['Study_Site'].map(study_site_key)
snake_main.head()

Unnamed: 0,Date,Unit,Temperature,year,month,day,hour,minute,Study_Site,Snake_Name,file_name,Study_Site_Code
0,2022-06-17 23:00:01,C,21.071,2022,6,17,23,0,Canada,Moose,Moose(EDCRVI006)2022_2023.csv,1
1,2022-06-18 00:10:01,C,21.071,2022,6,18,0,10,Canada,Moose,Moose(EDCRVI006)2022_2023.csv,1
2,2022-06-18 01:20:01,C,20.571,2022,6,18,1,20,Canada,Moose,Moose(EDCRVI006)2022_2023.csv,1
3,2022-06-18 02:30:01,C,20.571,2022,6,18,2,30,Canada,Moose,Moose(EDCRVI006)2022_2023.csv,1
4,2022-06-18 03:40:01,C,20.571,2022,6,18,3,40,Canada,Moose,Moose(EDCRVI006)2022_2023.csv,1


In [14]:
# List each (site, snake) pair once.  The printed row labels come from the
# source rows, so they repeat across files.
site_snake_pairs = snake_main.loc[:, ['Study_Site', 'Snake_Name']]
unique_combinations = site_snake_pairs.drop_duplicates()
print(unique_combinations)

  Study_Site Snake_Name
0     Canada      Moose
0     Canada      Olive
0     Canada     Hector
0     Canada    Chinook
0     Canada     Albert
0     Canada   Manitoba
0     Canada     Hunter
0     Canada    Kokanee
0     Canada     Cooper
0     Canada     Victor
0     Canada     Paloma
0     Canada      Eddie
0     Canada       Chad
0      Texas      Janis
0      Texas    Brisket
0      Texas   Cladwell
0      Texas       Cher
0      Texas      Nando
0      Texas      Dusty
0      Texas    Cecilia
0      Texas     Bonnie
0      Texas     Merlin
0      Texas   Deadwood
0      Texas     Willow
0      Texas    Tornado
0      Texas     Sedona
0      Texas       Mort
0      Texas      Bryan
0      Texas     Caezar
0      Texas      Ringo
0      Texas      Hazel
0   Nebraska      Bjorn
0   Nebraska     Horace
0   Nebraska     Ragnar


In [7]:
# Persist the combined table; the row labels are written as the first,
# unnamed CSV column (to_csv default).
snake_db_path = '/home/mremington/Documents/therma_sim/therma_sim/Empirical_Data/snake_thermal_database.csv'
snake_main.to_csv(snake_db_path)

In [None]:
# Outlier Analysis: heat waves and cold snaps
def get_outlier_analysis(df, column_name, n_std=3):
    """
    Return the rows whose value in ``column_name`` lies more than ``n_std``
    sample standard deviations away from the column mean.

    Parameters:
    - df: pandas DataFrame
    - column_name: string (name of the numeric column to analyze)
    - n_std: number of standard deviations that defines the cut-off
      (default 3, the previously hard-coded value)

    Returns:
    - outliers: pandas DataFrame holding the rows of ``df`` that fall outside
      [mean - n_std * std, mean + n_std * std]; empty when no row qualifies
    """
    values = df[column_name]
    center = values.mean()
    # Series.std() is the sample standard deviation (ddof=1), so a column with
    # fewer than two non-missing values gives NaN and no row is flagged.
    threshold = n_std * values.std()

    # Strict comparisons: a value exactly on a bound is not an outlier, and
    # missing values compare False on both sides so they are never flagged.
    is_outlier = (values > center + threshold) | (values < center - threshold)
    return df[is_outlier]

def plot_outliers(df, column_name):
    """
    Draw ``column_name`` against the 'Date' column and mark its outliers.

    The series is drawn as a blue line and the rows returned by
    ``get_outlier_analysis(df, column_name)`` are overlaid as red points.
    The figure is shown immediately; nothing is returned.

    Parameters:
    - df: pandas DataFrame (must contain a 'Date' column and ``column_name``)
    - column_name: string (name of the column to plot)
    """
    _, ax = plt.subplots(figsize=(12, 6))

    # Full series first, outliers drawn on top of it.
    ax.plot(df['Date'], df[column_name], label='Data', color='blue')
    flagged = get_outlier_analysis(df, column_name)
    ax.scatter(flagged['Date'], flagged[column_name], color='red', label='Outliers')

    ax.set_title(f'Outlier Analysis for {column_name}')
    ax.set_xlabel('Date')
    ax.set_ylabel(column_name)
    ax.legend()
    plt.show()

