In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import NearMiss

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
#Let's load our datasets
weather = pd.read_csv('../CapstoneTwo/weather_clean.csv')
train = pd.read_csv('../CapstoneTwo/train_clean.csv')
test = pd.read_csv('../CapstoneTwo/test_clean.csv')

In [4]:
#Let's create functions to handle datetime
def datetime(df):
    df['Date'] = pd.to_datetime(df['Date'])

#And extract the year, month, week, and day
def ymwd(df):
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week
    df['Day'] = df['Date'].dt.day_of_year

In [5]:
#A function to shift weather features
def lag_weather(df):

    df['Tmax_1w'] = df['Tmax'].shift(7)
    df['Tmax_2w'] = df['Tmax'].shift(14)
    df['Tmax_3w'] = df['Tmax'].shift(21)

    df['Tmin_1w'] = df['Tmin'].shift(7)
    df['Tmin_2w'] = df['Tmin'].shift(14)
    df['Tmin_3w'] = df['Tmin'].shift(21)

    df['Tavg_1w'] = df['Tavg'].shift(7)
    df['Tavg_2w'] = df['Tavg'].shift(14)
    df['Tavg_3w'] = df['Tavg'].shift(21)

    df['Depart_1w'] = df['Depart'].shift(7)
    df['Depart_2w'] = df['Depart'].shift(14)
    df['Depart_3w'] = df['Depart'].shift(21)

    df['DewPoint_1w'] = df['DewPoint'].shift(7)
    df['DewPoint_2w'] = df['DewPoint'].shift(14)
    df['DewPoint_3w'] = df['DewPoint'].shift(21)

    df['WetBulb_1w'] = df['WetBulb'].shift(7)
    df['WetBulb_2w'] = df['WetBulb'].shift(14)
    df['WetBulb_3w'] = df['WetBulb'].shift(21)

    df['Heat_1w'] = df['Heat'].shift(7)
    df['Heat_2w'] = df['Heat'].shift(14)
    df['Heat_3w'] = df['Heat'].shift(21)

    df['Cool_1w'] = df['Cool'].shift(7)
    df['Cool_2w'] = df['Cool'].shift(14)
    df['Cool_3w'] = df['Cool'].shift(21)

    df['PrecipTotal_1w'] = df['PrecipTotal'].shift(7)
    df['PrecipTotal_2w'] = df['PrecipTotal'].shift(14)
    df['PrecipTotal_3w'] = df['PrecipTotal'].shift(21)

    df['StnPressure_1w'] = df['StnPressure'].shift(7)
    df['StnPressure_2w'] = df['StnPressure'].shift(14)
    df['StnPressure_3w'] = df['StnPressure'].shift(21)

    df['SeaLevel_1w'] = df['SeaLevel'].shift(7)
    df['SeaLevel_2w'] = df['SeaLevel'].shift(14)
    df['SeaLevel_3w'] = df['SeaLevel'].shift(21)

    df['ResultSpeed_1w'] = df['ResultSpeed'].shift(7)
    df['ResultSpeed_2w'] = df['ResultSpeed'].shift(14)
    df['ResultSpeed_3w'] = df['ResultSpeed'].shift(21)

    df['ResultDir_1w'] = df['ResultDir'].shift(7)
    df['ResultDir_2w'] = df['ResultDir'].shift(14)
    df['ResultDir_3w'] = df['ResultDir'].shift(21)

    df['AvgSpeed_1w'] = df['AvgSpeed'].shift(7)
    df['AvgSpeed_2w'] = df['AvgSpeed'].shift(14)
    df['AvgSpeed_3w'] = df['AvgSpeed'].shift(21)

    return df


In [6]:
#Let's create a function to get dummy variables for species
def spec_dummies(df):
    df = pd.concat([df, pd.get_dummies(df['Species'], drop_first=True)], axis=1)
# We can drop the species column now that we have our dummies.
    df.drop('Species', axis=1, inplace=True)
    return df

In [7]:
#Let's create a function to cast features to float
def float(df):
    for col in df.columns:
        try:
            df[col] = df[col].astype(float)
        except:
            print(col, 'Cannot be transformed into a float')
            pass
    return df

In [8]:
#We'll deal with date features in all of the datasets now
datetime(weather)
datetime(train)
datetime(test)

ymwd(weather)
ymwd(train)
ymwd(test)

#### Let's work on feature engineering for our weather dataset.

In [9]:
#Let's lag features in weather using our predefined function
weather = lag_weather(weather)

In [10]:
#We need to handle the missing values created from shifting features
#Note: I need a better way to fill the NaN values
weather.interpolate(method ='bfill', limit_direction ='backward', inplace=True)

#### Let's create dummy variables for species in our train and test set.

In [11]:
train = spec_dummies(train)
test = spec_dummies(test)

In [12]:
#Combine datasets
train_final = pd.merge(train, weather,on=['Date', 'Year', 'Month', 'Week', 'Day'],how='left')
test_final = pd.merge(test, weather,on=['Date', 'Year', 'Month', 'Week', 'Day'],how='left')

In [13]:
#Let's save our combined datasets
train_final.to_csv('../CapstoneTwo/train_final.csv',index=False)
test_final.to_csv('../CapstoneTwo/test_final.csv',index=False)


In [19]:
train_final.head()

Unnamed: 0,Date,Latitude,Longitude,WnvPresent,Year,Month,Week,Day,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Tmax_1w,Tmax_2w,Tmax_3w,Tmin_1w,Tmin_2w,Tmin_3w,Tavg_1w,Tavg_2w,Tavg_3w,Depart_1w,Depart_2w,Depart_3w,DewPoint_1w,DewPoint_2w,DewPoint_3w,WetBulb_1w,WetBulb_2w,WetBulb_3w,Heat_1w,Heat_2w,Heat_3w,Cool_1w,Cool_2w,Cool_3w,PrecipTotal_1w,PrecipTotal_2w,PrecipTotal_3w,StnPressure_1w,StnPressure_2w,StnPressure_3w,SeaLevel_1w,SeaLevel_2w,SeaLevel_3w,ResultSpeed_1w,ResultSpeed_2w,ResultSpeed_3w,ResultDir_1w,ResultDir_2w,ResultDir_3w,AvgSpeed_1w,AvgSpeed_2w,AvgSpeed_3w
0,2007-05-29,41.95469,-87.800991,0,2007,5,22,149,0,1,0,0,0,0,88.0,62.5,75.25,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,87.0,81.0,81.0,57.5,56.5,57.0,72.25,68.75,69.0,10.0,11.0,12.0,45.0,56.0,57.5,58.0,61.0,62.5,0.0,0.0,0.0,7.5,4.0,4.0,0.0,0.49,0.005,29.375,29.16,29.325,30.055,29.835,30.025,10.05,8.1,2.6,17.0,26.0,9.5,10.4,11.55,5.6
1,2007-05-29,41.95469,-87.800991,0,2007,5,22,149,0,0,1,0,0,0,88.0,62.5,75.25,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,87.0,81.0,81.0,57.5,56.5,57.0,72.25,68.75,69.0,10.0,11.0,12.0,45.0,56.0,57.5,58.0,61.0,62.5,0.0,0.0,0.0,7.5,4.0,4.0,0.0,0.49,0.005,29.375,29.16,29.325,30.055,29.835,30.025,10.05,8.1,2.6,17.0,26.0,9.5,10.4,11.55,5.6
2,2007-05-29,41.994991,-87.769279,0,2007,5,22,149,0,0,1,0,0,0,88.0,62.5,75.25,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,87.0,81.0,81.0,57.5,56.5,57.0,72.25,68.75,69.0,10.0,11.0,12.0,45.0,56.0,57.5,58.0,61.0,62.5,0.0,0.0,0.0,7.5,4.0,4.0,0.0,0.49,0.005,29.375,29.16,29.325,30.055,29.835,30.025,10.05,8.1,2.6,17.0,26.0,9.5,10.4,11.55,5.6
3,2007-05-29,41.974089,-87.824812,0,2007,5,22,149,0,1,0,0,0,0,88.0,62.5,75.25,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,87.0,81.0,81.0,57.5,56.5,57.0,72.25,68.75,69.0,10.0,11.0,12.0,45.0,56.0,57.5,58.0,61.0,62.5,0.0,0.0,0.0,7.5,4.0,4.0,0.0,0.49,0.005,29.375,29.16,29.325,30.055,29.835,30.025,10.05,8.1,2.6,17.0,26.0,9.5,10.4,11.55,5.6
4,2007-05-29,41.974089,-87.824812,0,2007,5,22,149,0,0,1,0,0,0,88.0,62.5,75.25,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,87.0,81.0,81.0,57.5,56.5,57.0,72.25,68.75,69.0,10.0,11.0,12.0,45.0,56.0,57.5,58.0,61.0,62.5,0.0,0.0,0.0,7.5,4.0,4.0,0.0,0.49,0.005,29.375,29.16,29.325,30.055,29.835,30.025,10.05,8.1,2.6,17.0,26.0,9.5,10.4,11.55,5.6


In [20]:
test_final.head()

Unnamed: 0,Date,Latitude,Longitude,Year,Month,Week,Day,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Tmax_1w,Tmax_2w,Tmax_3w,Tmin_1w,Tmin_2w,Tmin_3w,Tavg_1w,Tavg_2w,Tavg_3w,Depart_1w,Depart_2w,Depart_3w,DewPoint_1w,DewPoint_2w,DewPoint_3w,WetBulb_1w,WetBulb_2w,WetBulb_3w,Heat_1w,Heat_2w,Heat_3w,Cool_1w,Cool_2w,Cool_3w,PrecipTotal_1w,PrecipTotal_2w,PrecipTotal_3w,StnPressure_1w,StnPressure_2w,StnPressure_3w,SeaLevel_1w,SeaLevel_2w,SeaLevel_3w,ResultSpeed_1w,ResultSpeed_2w,ResultSpeed_3w,ResultDir_1w,ResultDir_2w,ResultDir_3w,AvgSpeed_1w,AvgSpeed_2w,AvgSpeed_3w
0,2008-06-11,41.95469,-87.800991,2008,6,24,163,0,1,0,0,0,0,0,86.0,63.5,74.75,7.0,55.5,64.0,0.0,10.0,416.0,1926.0,0.0,29.31,29.98,9.15,18.0,10.2,78.0,61.5,65.5,56.5,44.0,43.0,67.25,52.75,54.25,2.0,-10.0,-7.0,60.0,33.5,36.0,62.5,43.5,46.0,0.0,12.0,10.5,2.5,0.0,0.0,0.015,0.0,0.0,28.925,29.63,29.025,29.61,30.335,29.7,3.8,8.0,7.0,4.5,6.5,32.5,4.75,8.6,9.0
1,2008-06-11,41.95469,-87.800991,2008,6,24,163,0,0,1,0,0,0,0,86.0,63.5,74.75,7.0,55.5,64.0,0.0,10.0,416.0,1926.0,0.0,29.31,29.98,9.15,18.0,10.2,78.0,61.5,65.5,56.5,44.0,43.0,67.25,52.75,54.25,2.0,-10.0,-7.0,60.0,33.5,36.0,62.5,43.5,46.0,0.0,12.0,10.5,2.5,0.0,0.0,0.015,0.0,0.0,28.925,29.63,29.025,29.61,30.335,29.7,3.8,8.0,7.0,4.5,6.5,32.5,4.75,8.6,9.0
2,2008-06-11,41.95469,-87.800991,2008,6,24,163,1,0,0,0,0,0,0,86.0,63.5,74.75,7.0,55.5,64.0,0.0,10.0,416.0,1926.0,0.0,29.31,29.98,9.15,18.0,10.2,78.0,61.5,65.5,56.5,44.0,43.0,67.25,52.75,54.25,2.0,-10.0,-7.0,60.0,33.5,36.0,62.5,43.5,46.0,0.0,12.0,10.5,2.5,0.0,0.0,0.015,0.0,0.0,28.925,29.63,29.025,29.61,30.335,29.7,3.8,8.0,7.0,4.5,6.5,32.5,4.75,8.6,9.0
3,2008-06-11,41.95469,-87.800991,2008,6,24,163,0,0,0,1,0,0,0,86.0,63.5,74.75,7.0,55.5,64.0,0.0,10.0,416.0,1926.0,0.0,29.31,29.98,9.15,18.0,10.2,78.0,61.5,65.5,56.5,44.0,43.0,67.25,52.75,54.25,2.0,-10.0,-7.0,60.0,33.5,36.0,62.5,43.5,46.0,0.0,12.0,10.5,2.5,0.0,0.0,0.015,0.0,0.0,28.925,29.63,29.025,29.61,30.335,29.7,3.8,8.0,7.0,4.5,6.5,32.5,4.75,8.6,9.0
4,2008-06-11,41.95469,-87.800991,2008,6,24,163,0,0,0,0,0,1,0,86.0,63.5,74.75,7.0,55.5,64.0,0.0,10.0,416.0,1926.0,0.0,29.31,29.98,9.15,18.0,10.2,78.0,61.5,65.5,56.5,44.0,43.0,67.25,52.75,54.25,2.0,-10.0,-7.0,60.0,33.5,36.0,62.5,43.5,46.0,0.0,12.0,10.5,2.5,0.0,0.0,0.015,0.0,0.0,28.925,29.63,29.025,29.61,30.335,29.7,3.8,8.0,7.0,4.5,6.5,32.5,4.75,8.6,9.0


#### Time to get our X and y

In [14]:
#Remove our target and use only numeric data
X = train_final.drop(columns='WnvPresent')._get_numeric_data()
#Isolate our target
y = train_final['WnvPresent']

In [15]:
#Call train_test_split with an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [16]:
X_train.shape

(8404, 70)

In [17]:
#Scale our data (We chose to use MinMaxScaler for a light touch, and because the data isn't normally distributed.)
scaler = MinMaxScaler()

#Call scaler (do not fit on X_test)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
#We'll address our imbalanced data with NearMiss to undersample
nm = NearMiss(version=3)
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)


In [24]:
#We can see the data is now balanced
X_train_nm.shape

(624, 70)

This is my first pass at this section. I had a few questions.

1. After shifting, what is the best way to fill all of those NaNs? Currently, it's backfilled but I know this isn't adequate.

2. Should I drop date since it is not numeric?

3. I'm a little confused on getting the X and y and if I should use train_test_split. Because we are given the test set, should I just set X_train = my training data with the target feature dropped, y_train as the training data target feature, X_test as the test data, which leaves me confused as to what y_test might be since there is no "WnvPresent" in the test dataset?

4. I'm torn between MinMaxScaler and Standard Scaler. I went with MinMaxScaler because our data does not seem normally distributed.

5. Curious why we should undersample rather than oversample. The data is balanced, but seemingly leaves very few records in the training set. I chose NearMiss as it seemed pretty thorough from my research.

6. Finally, I assume my order is correct - split the data, scale it (only transforming on X_test,) and finally undersampling.