In [None]:
#import dependencies
import pandas as pd
import numpy as np

import os
from os import listdir
import zipfile
import glob

import pymongo

In [None]:
# Open a client against the MongoDB server on localhost, port 27017.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Handles to the database and to the collection that a later cell fills
# with the yearly temperature/rainfall averages.
db = client["australia_fire_db"]
temp_rainfall = db["temp_rainfall"]

In [None]:
# Unzip every archive found in the Resources folder, extracting in place.
extension = ".zip"
extracted_dir_name = "."  # relative to Resources, i.e. extract next to the archives

# Remember where we started so we can always come back here.
# The notebook must be started from the project root for this to work.
cwd_dir_name = os.getcwd()
print(f"The current working directory is {cwd_dir_name}.")

os.chdir("Resources")  # move into the folder that holds the zip files
try:
    dir_name = os.getcwd()
    print(f"You are now in the following directory: {dir_name}.")

    for item in os.listdir(dir_name):  # loop through the items in the directory
        if item.endswith(extension):  # only handle ".zip" archives
            try:
                file_name = os.path.abspath(item)  # full path of the archive
                unzipped_directory = extracted_dir_name  # extraction target
                # The context manager closes the archive even if extraction fails.
                with zipfile.ZipFile(file_name) as zip_ref:
                    zip_ref.extractall(unzipped_directory)
                print(f"Successfully unzipped {item} into the following folder:{dir_name}.")
            except Exception as error:
                # Best effort: report the failing archive and keep going with the rest.
                print(f"Error trying to unzip {item}.")
                print(error)
finally:
    # Always return to the starting directory, even when something above
    # raised, so later cells can rely on paths relative to the project root.
    os.chdir(cwd_dir_name)

# Show the directory we actually ended up in.
print(os.getcwd())

In [None]:
# Read every max-temperature CSV and stack them into a single DataFrame.
temp_path = r'Resources/australia_temp_data'
# glob returns matches in arbitrary order; sort so the concatenated row
# order is the same on every run and every machine.
all_temp_files = sorted(glob.glob(temp_path + "/*.csv"))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not all_temp_files:
    raise FileNotFoundError(
        f"No CSV files found in {temp_path!r}. "
        "Run the unzip cell from the project root first."
    )

# One DataFrame per file; the first row of each file is the header.
temp_li = [
    pd.read_csv(filename, index_col=None, header=0)
    for filename in all_temp_files
]

# Stack the per-file frames and renumber the rows 0..n-1.
temp_df = pd.concat(temp_li, axis=0, ignore_index=True)
temp_df

In [None]:
# Read every rainfall CSV and stack them into a single DataFrame.
rainfall_path = r'Resources/australia_rainfall_data'
# glob returns matches in arbitrary order; sort so the concatenated row
# order is the same on every run and every machine.
all_rainfall_files = sorted(glob.glob(rainfall_path + "/*.csv"))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not all_rainfall_files:
    raise FileNotFoundError(
        f"No CSV files found in {rainfall_path!r}. "
        "Run the unzip cell from the project root first."
    )

# One DataFrame per file; the first row of each file is the header.
rainfall_li = [
    pd.read_csv(filename, index_col=None, header=0)
    for filename in all_rainfall_files
]

# Stack the per-file frames and renumber the rows 0..n-1.
rainfall_df = pd.concat(rainfall_li, axis=0, ignore_index=True)
rainfall_df

In [None]:
# Both raw frames are trimmed to the same three columns: the station
# identifier, the year, and the annual value.
wanted_columns = ["Station Number", "Year", "Annual"]

# Keep necessary columns from temp_df
max_temp_df = temp_df[wanted_columns]

# Keep necessary columns from rainfall_df
new_rainfall_df = rainfall_df[wanted_columns]

In [None]:
# Both frames carry a column called "Annual"; give each a distinct name so
# the two measurements stay distinguishable once they are combined.
annual_temp_df = max_temp_df.rename({"Annual": "Annual Max Temp"}, axis="columns")
annual_rainfall_df = new_rainfall_df.rename({"Annual": "Annual Rainfall"}, axis="columns")

In [None]:
# Keep only the rows that actually have an annual max temperature.
has_annual_temp = annual_temp_df['Annual Max Temp'].notna()
new_annual_temp_df = annual_temp_df[has_annual_temp]

In [None]:
# Keep only the rows that actually have an annual rainfall value.
has_annual_rainfall = annual_rainfall_df['Annual Rainfall'].notna()
new_annual_rainfall_df = annual_rainfall_df[has_annual_rainfall]

In [None]:
# Filter for data from year 1956-2019 (both ends inclusive).
# Most capitals have data from 1956-2019.
# 2020 had an annual value of NaN since it isn't technically a complete year;
# the explicit upper bound keeps the window at 1956-2019 even if a later
# year ever arrives with a non-null annual value, so the "1956 to 2019"
# figures reported further down stay accurate.
first_year, last_year = 1956, 2019

filtered_temp_df = new_annual_temp_df.loc[
    new_annual_temp_df['Year'].between(first_year, last_year)
]
filtered_rainfall_df = new_annual_rainfall_df.loc[
    new_annual_rainfall_df['Year'].between(first_year, last_year)
]
filtered_rainfall_df

In [None]:
# Means over every remaining row; these are the baselines that the
# per-year averages are compared against in a later cell.
temp_values = filtered_temp_df["Annual Max Temp"]
overall_mean_temp = temp_values.mean()
print(f'Overall Mean Temp from 1956 to 2019: {overall_mean_temp}')

rainfall_values = filtered_rainfall_df["Annual Rainfall"]
overall_mean_rainfall = rainfall_values.mean()
print(f'Overall Mean Rainfall from 1956 to 2019: {overall_mean_rainfall}')

In [None]:
# Group each filtered frame by year...
temps_by_year = filtered_temp_df.groupby(by="Year")
rainfall_by_year = filtered_rainfall_df.groupby(by="Year")

# ...then average the annual values within each year.
# Each result is a Series indexed by Year.
mean_temps = temps_by_year["Annual Max Temp"].mean()
mean_rainfall = rainfall_by_year["Annual Rainfall"].mean()

In [None]:
# Combine the two per-year Series into one frame, one column per measurement.
average_columns = {
    "Avg Annual Temp": mean_temps,
    "Avg Annual Rainfall": mean_rainfall,
}
yearly_averages_df = pd.DataFrame(average_columns)
yearly_averages_df

In [None]:
# Move the year out of the index into an ordinary column so it is included
# in each record when the frame is converted to dictionaries.
yearly_avg_df = yearly_averages_df.reset_index(drop=False)
yearly_avg_df

In [None]:
# How far each year's average sits above (+) or below (-) the overall mean.
temp_offsets = yearly_avg_df['Avg Annual Temp'].sub(overall_mean_temp)
rainfall_offsets = yearly_avg_df['Avg Annual Rainfall'].sub(overall_mean_rainfall)

# NOTE(review): 'temp_differnce' is misspelled, but it becomes a field name in
# the records written to MongoDB, so it is left unchanged here — renaming it
# needs to be coordinated with whatever reads that collection.
yearly_avg_df['temp_differnce'] = temp_offsets
yearly_avg_df['rainfall_difference'] = rainfall_offsets
yearly_avg_df

In [None]:
# Convert the frame into a list of dictionaries, one per row, keyed by
# column name (despite its name, yearly_avg_dict is a list).
yearly_avg_dict = yearly_avg_df.to_dict(orient='records')
yearly_avg_dict

In [None]:
# Insert the yearly averages into the MongoDB collection temp_rainfall,
# but only when the collection is still empty, so re-running the notebook
# does not create duplicate documents.
#
# Collection.count() and Collection.insert() are deprecated and were removed
# in PyMongo 4; count_documents() and insert_many() are their replacements.

if temp_rainfall.count_documents({}) == 0:
    if yearly_avg_dict:
        temp_rainfall.insert_many(yearly_avg_dict)
    else:
        # insert_many() rejects an empty list, so report it instead of raising.
        print("No records to insert")

else:
    print("Data already exists")