In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time

# Package to calculate distance between two coordinates
import geopy.distance

# Library for datetime variables
import datetime

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read the location data (name, abbreviation, latitude, longitude) of every weather
# station used in the analysis; the first column of the csv file becomes the index
stations_analyzed = pd.read_csv(
    'stations_analyzed.csv',
    index_col=0,
    header=0,
)
stations_analyzed

Unnamed: 0,stations,abbreviations,latitude,longitude
12,san_diego_lindbergh_field,KSAN,32.73361,-117.18306
14,san_diego_montgomery_field,KMYF,32.81444,-117.13639
13,san_diego_brown_field,KSDM,32.57528,-116.99306
11,ramona,KRNM,33.0375,-116.91583
0,carlsbad,KCRQ,33.1268,-117.27583


In [17]:
# Helper: decide whether a raw temperature cell holds a usable number
def _is_non_numeric(value):
    """Return True if `value` is NaN or cannot be converted to a float (e.g. 'M')."""
    try:
        return bool(np.isnan(float(value)))
    except (TypeError, ValueError):
        return True


# Helper: decide whether a raw precipitation cell holds one of the non-numeric markers
def _is_precipitation_flag(value):
    """Return True if `value` is the string 'T' or 'M' (surrounding blanks ignored)."""
    return isinstance(value, str) and value.strip() in ('T', 'M')


# Helper: replace missing entries by the entries found at the given row offsets
def _fill_from_offsets(values, is_missing, offsets):
    """Fill the missing entries of the list `values` in place and return it.

    For every position i the offsets are tried in the given order: while
    values[i] is still missing and i+offset is a valid position, values[i] is
    overwritten by values[i+offset]. Rows are processed from first to last, so
    a replacement made for an earlier row is visible to later rows.
    An entry stays missing if none of the offsets yields a usable value.
    """
    n_values = len(values)
    for i in range(n_values):
        for offset in offsets:
            if not is_missing(values[i]):
                break
            j = i + offset
            # Strict upper bound: j == n_values would be one past the last row
            if 0 <= j < n_values:
                values[i] = values[j]
    return values


# Define a function which first deals with the missing values in the weather data and then engineers new features
def weather_data_engineer(station, high_temp_threshold=10):
    """Clean the weather data of one station and write the engineered features to csv.

    Reads `station + ".csv"`, fills the missing observations, derives new
    features and writes the result to `station + "_engineered.csv"`. The
    value counts of the three cleaned columns are printed along the way.

    Parameters
    ----------
    station : str
        Name of the weather station; used as the stem of the input and output files.
        The input file must contain the columns 'date', 'observed_low',
        'observed_high' and 'observed_precipitation'.
    high_temp_threshold : int or float, default 10
        Minimum day-over-day rise of the highest temperature that raises
        'high_temp_alert' (the data is presumably in degrees F -- confirm).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised by `pd.to_numeric` if a value is still non-numeric after the
        filling step (no neighbouring row offered a replacement).
    """
    df = pd.read_csv(station + ".csv", header=0, parse_dates=['date'])

    # Temperature features in the weather data:
    # 'observed_low': observed lowest temperature during the day,
    # 'observed_high': observed highest temperature during the day.
    # They contain the string 'M' whenever a value is missing.
    # A non-numeric value is replaced by the value one year later, one year earlier,
    # two years later or two years earlier, whichever is usable first.
    # The work is done on a plain list and assigned back as a whole column, which
    # avoids chained assignment (df[col][i] = ...) and its silent-no-op pitfalls.
    for feature in ('observed_low', 'observed_high'):
        filled = _fill_from_offsets(df[feature].tolist(), _is_non_numeric,
                                    (365, -365, 730, -730))
        df[feature] = pd.Series(filled, index=df.index, dtype=object)

    # 'observed_precipitation' has 'T' or 'M' as non-numeric values and some of the
    # numbers are stored as strings. 'T' or 'M' is replaced by the value one day later,
    # one day earlier, one week later, one week earlier, one year later or one year
    # earlier, whichever is usable first.
    # NOTE(review): 'T' presumably denotes trace precipitation rather than a missing
    # reading -- confirm that replacing it by a neighbouring value is intended.
    filled = _fill_from_offsets(df['observed_precipitation'].tolist(),
                                _is_precipitation_flag,
                                (1, -1, 7, -7, 365, -365))
    df['observed_precipitation'] = pd.Series(filled, index=df.index, dtype=object)

    # Convert the cleaned columns to numeric dtypes and show their value counts
    for column in ('observed_low', 'observed_high', 'observed_precipitation'):
        df[column] = pd.to_numeric(df[column])
        print(df[column].value_counts())

    # Engineer new features related to weather
    # 'high_low_diff': The difference between the highest and lowest temperatures observed during the day
    df['high_low_diff'] = df['observed_high'] - df['observed_low']
    # 'high_temp_diff': The difference with the highest temperature on the previous day
    df['high_temp_diff'] = df['observed_high'].diff()

    # 'high_temp_alert' = 1 if the highest temperature rose by at least
    # high_temp_threshold compared to the previous day, otherwise 0.
    # The first row has no previous day (its diff is NaN), so it is always 0.
    df['high_temp_alert'] = (df['high_temp_diff'] >= high_temp_threshold).astype(int)

    # 'rain_alert' = 1 if no precipitation is observed on the previous day and there is
    # positive precipitation on the current day, otherwise 0 (always 0 for the first row).
    precipitation = df['observed_precipitation']
    df['rain_alert'] = ((precipitation > 0) & (precipitation.shift(1) == 0)).astype(int)

    df_engineered = df[['date', 'high_low_diff', 'high_temp_diff', 'high_temp_alert',
                        'rain_alert', 'observed_precipitation', 'observed_low', 'observed_high']]
    df_engineered.to_csv(station + '_engineered.csv', index=False)

In [18]:
# Run the data cleaning and feature engineering for each of the 5 analyzed stations.
# Every new csv file holds the features: 'date', 'high_low_diff', 'high_temp_diff',
# 'high_temp_alert', 'rain_alert', 'observed_precipitation', 'observed_low', 'observed_high'

for station in stations_analyzed['stations'].tolist():
    weather_data_engineer(station)

59    65
61    62
56    61
62    58
63    56
54    56
57    50
64    49
60    48
65    48
53    47
58    42
55    40
68    38
66    38
70    36
50    34
69    32
67    30
51    29
52    27
49    23
71    22
48    18
47    18
73    15
45    12
72     9
44     7
41     6
74     6
46     5
77     2
43     2
75     1
76     1
42     1
78     1
Name: observed_low, dtype: int64
67     68
69     67
70     65
71     65
75     65
73     57
74     54
76     51
78     51
72     50
66     49
64     47
68     43
77     41
79     41
65     40
80     31
62     29
63     25
82     22
61     18
81     17
85     15
60     14
83     11
84      8
86      8
88      7
58      6
87      5
89      4
92      4
59      4
90      3
91      3
57      2
98      1
94      1
96      1
97      1
101     1
Name: observed_high, dtype: int64
0.00    902
0.03     67
0.01     27
0.02     13
0.07      8
0.04      7
0.16      6
0.05      5
0.15      4
0.19      3
0.08      3
0.18      3
0.06      3
0.11      3
0.27      2
0