In this notebook we merge and clean the Florida Climate Center's data to create climatic features for our machine learning model.

# 1. Loading data

In [1]:
#importing useful libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#detecting the files in our directory
path = os.getcwd()
files = os.listdir(path)
files

['Tarpon Springs.csv',
 'Miyaka River SP.csv',
 'Venice.csv',
 'Bradenton.csv',
 'Tampa AP.csv',
 'Mimai Beach.csv',
 'Key West.csv',
 'Everglades.csv',
 'Panama City.csv',
 'Pensacola (2011-2017).csv',
 'Titusville.csv',
 'Melbourne Airport.csv',
 'Jacksonville Beach.csv',
 'TemperatureofWeek',
 'Fort Myers.csv',
 'Weeki Wachee.csv',
 '2. Wrangling Florida Weather Data.ipynb',
 'Fort Lauderdale Beach.csv',
 'Temperature&Precipitation.csv',
 'Niceville.csv',
 'Punta Gorda.csv',
 'Fort Pierce.csv',
 'Perry.csv',
 'Apalachicola.csv',
 'Daytona.csv',
 '.ipynb_checkpoints',
 'Naples.csv',
 'West Palm Beach Int AP.csv',
 'Saint Augustine.csv',
 'Stuart.csv',
 'Fort Lauderdale.csv',
 'Crestview.csv',
 'Vero Beach AP.csv']

In [3]:
#We will use the data from following stations
Stations =[ 'Niceville','Panama City', 'Apalachicola', 'Perry', 'Weeki Wachee', 'Tarpon Springs', 'Venice',
            'Punta Gorda', 'Fort Myers','Naples', 'Key West', 'Mimai Beach', 'Fort Lauderdale',  
           'West Palm Beach Int AP', 'Stuart', 'Fort Pierce', 'Vero Beach AP', 'Titusville', 'Daytona',
            'Saint Augustine', 'Jacksonville Beach']

#Here are corresponding csv files
csvfiles = [x + '.csv' for x in Stations]

#The corresponding counties are listed below. Some stations correspond to more than one county
#So we write corresponding counties in a list
Corresponding = [['Escambia', 'Santa Rosa', 'Okaloosa', 'Walton'], ['Bay', 'Gulf'], ['Franklin', 'Wakulla'],  
['Taylor', 'Dixie', 'Levy'], ['Citrus', 'Hernando','Pasco'], ['Pinellas', 'Hillsborough', 'Manatee'], ['Sarasota'], 
['Charlotte'], ['Lee'], ['Collier'], ['Monroe'],['Dade'], ['Broward'], ['Palm Beach'], ['Martin'], ['St Lucie'], 
['Indian River'], ['Brevard'],['Volusia'], ['Flagler', 'St Johns'], ['Duval', 'Nassau']]

In [4]:
#Column names for our dataframe
col_names = ['ID', 'Year', 'Month', 'Day', 'Precipitation', 'MaxTemp', 'MinTemp', 'MeanTemp']

In [5]:
#Now we load our data
Temps = pd.DataFrame() #create an empty dataframe
for index in range(len(csvfiles)): #for each csv file
    for j in range(len(Corresponding[index])): #for each of the corresponding counties
        data = pd.read_csv(csvfiles[index],names = col_names, skiprows = 2, encoding='latin1')
        #load data from the csv file
        data['County'] = Corresponding[index][j] #create a new column with the corresponding county
        Temps = Temps.append(data)#add data to dataframe

In [6]:
#Summarise column dtypes and non-null counts of the freshly loaded frame
Temps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222863 entries, 0 to 6586
Data columns (total 9 columns):
ID               222826 non-null object
Year             222800 non-null object
Month            222796 non-null object
Day              222792 non-null float64
Precipitation    222792 non-null float64
MaxTemp          222792 non-null float64
MinTemp          222792 non-null float64
MeanTemp         222792 non-null object
County           222863 non-null object
dtypes: float64(4), object(5)
memory usage: 17.0+ MB


In [7]:
Temps.drop(['ID', 'MinTemp', 'MeanTemp'],axis = 1,inplace = True) #ID, MinTemp, MaxTemp columns won't be useful, so drop

In [8]:
#Preview the first rows after removing the unused columns
Temps.head()

Unnamed: 0,Year,Month,Day,Precipitation,MaxTemp,County
0,2000,1,1.0,0.0,69.0,Escambia
1,2000,1,2.0,0.0,69.0,Escambia
2,2000,1,3.0,0.0,72.0,Escambia
3,2000,1,4.0,0.05,72.0,Escambia
4,2000,1,5.0,0.0,72.0,Escambia


In [9]:
Temps.dropna(axis=0,inplace =True) #dropping rows with null values

In [10]:
#Confirm that no null values remain after dropping the incomplete rows
Temps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222792 entries, 0 to 6584
Data columns (total 6 columns):
Year             222792 non-null object
Month            222792 non-null object
Day              222792 non-null float64
Precipitation    222792 non-null float64
MaxTemp          222792 non-null float64
County           222792 non-null object
dtypes: float64(3), object(3)
memory usage: 11.9+ MB


# 2. Handling missing values and adding features

In [11]:
#Turning year,month,date into integers
Temps['Year'] = Temps['Year'].astype('int')
Temps['Month'] = Temps['Month'].astype('int')
Temps['Day'] = Temps['Day'].astype('int')

In [12]:
#Check that Year, Month and Day are now integer columns
Temps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222792 entries, 0 to 6584
Data columns (total 6 columns):
Year             222792 non-null int64
Month            222792 non-null int64
Day              222792 non-null int64
Precipitation    222792 non-null float64
MaxTemp          222792 non-null float64
County           222792 non-null object
dtypes: float64(2), int64(3), object(1)
memory usage: 11.9+ MB


In [13]:
#In the head of the dataframe above we see a -99.9 value
#Since this is Florida data, weather should barely go below 0
#However, we see 11055 such instances
len(Temps[Temps['Precipitation'] < 0])

11055

In [14]:
#Let us turn these negative numbers into null values
Temps['MaxTemp'] = Temps['MaxTemp'].apply(lambda x: np.nan if x < 0 else x)

In [15]:
#Do the same for precipitation
Temps['Precipitation'] = Temps['Precipitation'].apply(lambda x: np.nan if x < 0 else x)

In [16]:
#Now fill those entries with the column mean
for column in ['MaxTemp','Precipitation']:
    Temps[column] = Temps[column].fillna(Temps[column].mean())

In [17]:
#Defining the maximum temprature for last 1 thru 7 days by shifting columns wisely
Temps['MaxTemp1d'] = Temps['MaxTemp'].shift(1)

Temps['MaxTemp2d'] = Temps['MaxTemp'].shift(2)

Temps['MaxTemp3d'] = Temps['MaxTemp'].shift(3)

Temps['MaxTemp4d'] = Temps['MaxTemp'].shift(4)

Temps['MaxTemp5d'] = Temps['MaxTemp'].shift(5)

Temps['MaxTemp6d'] = Temps['MaxTemp'].shift(6)

Temps['MaxTemp7d'] = Temps['MaxTemp'].shift(7)

In [18]:
#Defining the precipitation for last 1 thru 7 days by shifting columns wisely
Temps['Precipitation1d'] = Temps['Precipitation'].shift(1)

Temps['Precipitation2d'] = Temps['Precipitation'].shift(2)

Temps['Precipitation3d'] = Temps['Precipitation'].shift(3)

Temps['Precipitation4d'] = Temps['Precipitation'].shift(4)

Temps['Precipitation5d'] = Temps['Precipitation'].shift(5)

Temps['Precipitation6d'] = Temps['Precipitation'].shift(6)

Temps['Precipitation7d'] = Temps['Precipitation'].shift(7)

In [19]:
#Preview the first ten rows to inspect the new lag columns
Temps.head(10)

Unnamed: 0,Year,Month,Day,Precipitation,MaxTemp,County,MaxTemp1d,MaxTemp2d,MaxTemp3d,MaxTemp4d,MaxTemp5d,MaxTemp6d,MaxTemp7d,Precipitation1d,Precipitation2d,Precipitation3d,Precipitation4d,Precipitation5d,Precipitation6d,Precipitation7d
0,2000,1,1,0.0,69.0,Escambia,,,,,,,,,,,,,,
1,2000,1,2,0.0,69.0,Escambia,69.0,,,,,,,0.0,,,,,,
2,2000,1,3,0.0,72.0,Escambia,69.0,69.0,,,,,,0.0,0.0,,,,,
3,2000,1,4,0.05,72.0,Escambia,72.0,69.0,69.0,,,,,0.0,0.0,0.0,,,,
4,2000,1,5,0.0,72.0,Escambia,72.0,72.0,69.0,69.0,,,,0.05,0.0,0.0,0.0,,,
5,2000,1,6,0.0,56.0,Escambia,72.0,72.0,72.0,69.0,69.0,,,0.0,0.05,0.0,0.0,0.0,,
6,2000,1,7,0.0,64.0,Escambia,56.0,72.0,72.0,72.0,69.0,69.0,,0.0,0.0,0.05,0.0,0.0,0.0,
7,2000,1,8,0.0,64.0,Escambia,64.0,56.0,72.0,72.0,72.0,69.0,69.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0
8,2000,1,9,0.0,64.0,Escambia,64.0,64.0,56.0,72.0,72.0,72.0,69.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0
9,2000,1,10,1.05,73.0,Escambia,64.0,64.0,64.0,56.0,72.0,72.0,72.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0


In [20]:
#Just checking a random slice to see if everything looks right
Temps[(Temps['Year'] == 2002) & (Temps['County'] == 'Dixie') & (Temps['Month'] == 2)]

Unnamed: 0,Year,Month,Day,Precipitation,MaxTemp,County,MaxTemp1d,MaxTemp2d,MaxTemp3d,MaxTemp4d,MaxTemp5d,MaxTemp6d,MaxTemp7d,Precipitation1d,Precipitation2d,Precipitation3d,Precipitation4d,Precipitation5d,Precipitation6d,Precipitation7d
763,2002,2,1,0.0,79.0,Dixie,81.0,80.0,79.0,79.0,66.0,65.0,74.0,0.0,0.0,0.0,0.5,0.0,0.04,0.54
764,2002,2,2,0.0,79.0,Dixie,79.0,81.0,80.0,79.0,79.0,66.0,65.0,0.0,0.0,0.0,0.0,0.5,0.0,0.04
765,2002,2,3,0.0,68.0,Dixie,79.0,79.0,81.0,80.0,79.0,79.0,66.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
766,2002,2,4,0.0,71.0,Dixie,68.0,79.0,79.0,81.0,80.0,79.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
767,2002,2,5,0.0,61.0,Dixie,71.0,68.0,79.0,79.0,81.0,80.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
768,2002,2,6,0.0,68.0,Dixie,61.0,71.0,68.0,79.0,79.0,81.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
769,2002,2,7,0.43,71.0,Dixie,68.0,61.0,71.0,68.0,79.0,79.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
770,2002,2,8,0.0,60.0,Dixie,71.0,68.0,61.0,71.0,68.0,79.0,79.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0
771,2002,2,9,0.0,72.0,Dixie,60.0,71.0,68.0,61.0,71.0,68.0,79.0,0.0,0.43,0.0,0.0,0.0,0.0,0.0
772,2002,2,10,0.0,74.0,Dixie,72.0,60.0,71.0,68.0,61.0,71.0,68.0,0.0,0.0,0.43,0.0,0.0,0.0,0.0


In [21]:
#Creating the temprature and precipitation dataset as a csv file
Temps.to_csv('Temperature&Precipitation.csv', encoding='utf-8', index=False)