In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as matplot
import numpy as np

%matplotlib inline

## Import South East Brazil Weather Data

In [2]:
# Initially load all the data into the data frame
# Initially we will create a model so that we are able to predict the current temperature. 
# For that reason I am taking the temperature variable in a separate series which can act as a label later.
df = pd.read_csv('sudeste.csv')
temp = df.pop("temp")

In [3]:
df.head()

Unnamed: 0,wsid,wsnm,elvt,lat,lon,inme,city,prov,mdct,date,...,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust
0,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 00:00:00,2007-11-06,...,29.7,16.8,25.5,10.8,35.0,58.0,32.0,3.2,101.0,6.5
1,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 01:00:00,2007-11-06,...,29.9,13.6,29.0,12.2,39.0,39.0,35.0,3.6,94.0,6.4
2,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 02:00:00,2007-11-06,...,29.0,14.0,27.4,13.6,44.0,44.0,39.0,2.5,93.0,6.9
3,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 03:00:00,2007-11-06,...,27.4,16.9,25.8,14.1,58.0,58.0,44.0,1.7,96.0,5.8
4,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 04:00:00,2007-11-06,...,26.3,17.0,25.3,16.4,57.0,58.0,56.0,3.1,110.0,7.5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9779168 entries, 0 to 9779167
Data columns (total 30 columns):
wsid    int64
wsnm    object
elvt    float64
lat     float64
lon     float64
inme    object
city    object
prov    object
mdct    object
date    object
yr      int64
mo      int64
da      int64
hr      int64
prcp    float64
stp     float64
smax    float64
smin    float64
gbrd    float64
dewp    float64
tmax    float64
dmax    float64
tmin    float64
dmin    float64
hmdy    float64
hmax    float64
hmin    float64
wdsp    float64
wdct    float64
gust    float64
dtypes: float64(19), int64(5), object(6)
memory usage: 2.2+ GB


In [5]:
# To find the statistical information of the data
df.describe()

Unnamed: 0,wsid,elvt,lat,lon,yr,mo,da,hr,prcp,stp,...,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust
count,9779168.0,9779168.0,9779168.0,9779168.0,9779168.0,9779168.0,9779168.0,9779168.0,1407984.0,9779168.0,...,9779142.0,9778858.0,9779134.0,9778361.0,9779168.0,9779156.0,9779124.0,8853607.0,9779168.0,9462694.0
mean,359.2531,594.0923,-20.23082,-44.6459,2011.164,6.52196,15.75465,11.5,0.9366544,880.4292,...,21.10503,15.24025,19.86418,14.22089,67.26667,69.96991,64.41965,1.998156,138.5991,4.494015
std,39.0163,398.0379,3.172643,4.882117,3.207774,3.425538,8.802154,6.92219,2.923291,248.265,...,7.545549,5.866811,7.134849,5.777089,26.54213,26.43371,26.5655,1.618531,105.2018,2.98179
min,178.0,0.0,-24.96282,-56.67732,2000.0,1.0,1.0,0.0,0.0,0.0,...,-3.2,-10.0,-8.5,-10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,328.0,283.0,-22.3958,-47.4341,2009.0,4.0,8.0,5.0,0.0,911.3,...,18.2,12.7,17.2,11.6,53.0,58.0,49.0,0.8,56.0,2.3
50%,358.0,573.0,-20.75041,-44.45378,2011.0,7.0,16.0,12.0,0.0,944.2,...,21.9,16.6,20.8,15.6,74.0,78.0,70.0,1.7,114.0,4.2
75%,394.0,875.0,-18.91707,-42.43575,2014.0,9.0,23.0,18.0,0.6,973.1,...,25.8,19.4,24.2,18.4,89.0,91.0,86.0,2.9,216.0,6.3
max,423.0,1758.0,0.0,0.0,2016.0,12.0,31.0,23.0,100.0,1050.0,...,45.0,44.8,45.0,44.9,100.0,100.0,100.0,19.8,360.0,50.0


In [6]:
df.shape

(9779168, 30)

In [7]:
# In the data it has been mentioned that all the weather stations started operating from the year 2000 so we can drop all the 
# rows which has the data from previous years and will not be relevant to current weather data.
df[df.yr < 2000]
df.shape

(9779168, 30)

In [8]:
# Here we can see that all the data in the column year is more than 2000 so no rows are dropped based on that condition
# But we will verify it again by looking at the minimum value of the year column
print (df['yr'].min())
print (df['yr'].max())

2000
2016


Therefore we have verified that the minimum value of the data in the column yr is 2000 and that is when all the weather stations started operating. Now we have eliminated one condition of cleaning the data i.e. all the weather stations are running from the 
year 2000 till 2016.

In [9]:
df.dropna(axis=0, how='all')
df.shape

(9779168, 30)

Here we are checking if there is any row where all the values are NA so that we can drop it. Becasue that particular row is of no use and can be easily dropped. From the results we find that there is no row where all the rows are NA values.

In [10]:
df.dropna(axis=1, how='all')
df.shape

(9779168, 30)

Here we are checking if there is any column where all the values are NA so that we can drop it. From the results it is clear that there is no column with all NA values.

In [11]:
# Check for any duplicate data
df.duplicated().value_counts()

False    9779168
dtype: int64

All the rows are unique and there are no duplicate rows which could be dropped.

In [12]:
df.count()

wsid    9779168
wsnm    9779168
elvt    9779168
lat     9779168
lon     9779168
inme    9779168
city    9779168
prov    9779168
mdct    9779168
date    9779168
yr      9779168
mo      9779168
da      9779168
hr      9779168
prcp    1407984
stp     9779168
smax    9779168
smin    9779168
gbrd    5670348
dewp    9778693
tmax    9779142
dmax    9778858
tmin    9779134
dmin    9778361
hmdy    9779168
hmax    9779156
hmin    9779124
wdsp    8853607
wdct    9779168
gust    9462694
dtype: int64

With df.count() we are able to see how many entries are present in each column. By which we can get to know how many missin values are present in each column. From the above table it is clearly evident that most of the columns dont have any missing values. But the column Precipitation has very few values in there as compared to the rest of the columns. The column of solar radiation also has some data missing and some other few columns with some data missing.
Now we will have to decide which column data needs to be imputed with some other data or what needs to be done about it.

In [13]:
# Impute missing values with median values for the columns with some missing data
missing_columns = ["gbrd", "dewp", "tmax", "dmax", "tmin", "dmin", "hmax", "hmin", "wdsp", "gust"]

for missing in missing_columns:
    df[missing].fillna(df[missing].median(), inplace=True)

In [14]:
df.count()

wsid    9779168
wsnm    9779168
elvt    9779168
lat     9779168
lon     9779168
inme    9779168
city    9779168
prov    9779168
mdct    9779168
date    9779168
yr      9779168
mo      9779168
da      9779168
hr      9779168
prcp    1407984
stp     9779168
smax    9779168
smin    9779168
gbrd    9779168
dewp    9779168
tmax    9779168
dmax    9779168
tmin    9779168
dmin    9779168
hmdy    9779168
hmax    9779168
hmin    9779168
wdsp    9779168
wdct    9779168
gust    9779168
dtype: int64

Now the only column which is not required here is the precipitation column which can be dropped. We will test our model and the results and based on that we can decide whether to include the parameter or not.

In [15]:
df.drop(['prcp'], axis=1, inplace=True)
df.count()

wsid    9779168
wsnm    9779168
elvt    9779168
lat     9779168
lon     9779168
inme    9779168
city    9779168
prov    9779168
mdct    9779168
date    9779168
yr      9779168
mo      9779168
da      9779168
hr      9779168
stp     9779168
smax    9779168
smin    9779168
gbrd    9779168
dewp    9779168
tmax    9779168
dmax    9779168
tmin    9779168
dmin    9779168
hmdy    9779168
hmax    9779168
hmin    9779168
wdsp    9779168
wdct    9779168
gust    9779168
dtype: int64

Now the data is uniform and all the columns are filled with exact number of rows. We have dropped the column of Precipitation but we can add it later if required and the results are not in our favour. Now, lets focus on the main task of predicting the Temperature (current temperature) using the above data.

### Split the data into train and test data

In [41]:
# Import the libraries to create the training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, temp, test_size=0.20, random_state=42)

## Train on Numeric Features

In [42]:
# Taking all the numeric features in a list
#numeric_data = ['elvt', 'lat', 'lon', 'stp', 'smax', 'smin', 'gbrd', 'dewp', 'tmax', 'dmax', 'tmin', 'dmin', 'hmdy', 'hmax',
#               'hmin', 'wdsp', 'wdct', 'gust']
    
numeric_data = ['stp', 'smax', 'smin', 'gbrd', 'dewp', 'hmdy',
                'wdsp', 'wdct', 'gust']
X_numeric = X_train[numeric_data]
X_numeric.head()

Unnamed: 0,stp,smax,smin,gbrd,dewp,hmdy,wdsp,wdct,gust
8240730,970.5,970.7,970.3,8.923,17.2,97.0,0.7,341.0,1.7
1903388,913.7,913.7,913.2,232.0,14.5,78.0,3.4,124.0,8.5
6378219,823.1,823.4,822.9,366.207,9.7,82.0,7.2,318.0,14.5
7239226,942.5,943.4,942.4,863.427,17.7,62.0,1.3,101.0,5.3
5311471,881.7,881.7,881.3,863.427,6.7,40.0,1.7,114.0,1.8


In [43]:
X_numeric.count()

stp     7823334
smax    7823334
smin    7823334
gbrd    7823334
dewp    7823334
hmdy    7823334
wdsp    7823334
wdct    7823334
gust    7823334
dtype: int64

Since there are no missing values here we don't need to fill any columns with any median values.

In [48]:
X_numeric = X_numeric.fillna(X_numeric.mean())

In [49]:
X_numeric = X_numeric.reset_index()

## Random Forest Baseline Model

In [52]:
# Create the baseline model using Random Forest
model_1 = RandomForestRegressor(oob_score=True, n_jobs = -1, random_state=42)

## Out of Bag Score (Built in Cross Validation)

In [53]:
# Fit the Random Forest Model and evaluate the OOB score
model_1.fit(X_numeric, y_train)

# Calculate the OOB score
print ("The OOB score is: " + str(model_1.oob_score_))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').