In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as matplot
import numpy as np

%matplotlib inline

## Import South East Brazil Weather Data

In [8]:
# Initially load all the data into the data frame
# Initially we will create a model so that we are able to predict the current temperature. 
# For that reason I am taking the temperature variable in a separate series which can act as a label later.
df = pd.read_csv('sudeste.csv')

In [9]:
df_RJ = df[df['prov']=="RJ"]

In [10]:
temp_RJ = df_RJ.pop('temp')

In [11]:
df_RJ.head()

Unnamed: 0,wsid,wsnm,elvt,lat,lon,inme,city,prov,mdct,date,...,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust
0,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 00:00:00,2007-11-06,...,29.7,16.8,25.5,10.8,35.0,58.0,32.0,3.2,101.0,6.5
1,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 01:00:00,2007-11-06,...,29.9,13.6,29.0,12.2,39.0,39.0,35.0,3.6,94.0,6.4
2,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 02:00:00,2007-11-06,...,29.0,14.0,27.4,13.6,44.0,44.0,39.0,2.5,93.0,6.9
3,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 03:00:00,2007-11-06,...,27.4,16.9,25.8,14.1,58.0,58.0,44.0,1.7,96.0,5.8
4,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 04:00:00,2007-11-06,...,26.3,17.0,25.3,16.4,57.0,58.0,56.0,3.1,110.0,7.5


In [12]:
df_RJ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617624 entries, 0 to 7032727
Data columns (total 30 columns):
wsid    1617624 non-null int64
wsnm    1617624 non-null object
elvt    1617624 non-null float64
lat     1617624 non-null float64
lon     1617624 non-null float64
inme    1617624 non-null object
city    1617624 non-null object
prov    1617624 non-null object
mdct    1617624 non-null object
date    1617624 non-null object
yr      1617624 non-null int64
mo      1617624 non-null int64
da      1617624 non-null int64
hr      1617624 non-null int64
prcp    284504 non-null float64
stp     1617624 non-null float64
smax    1617624 non-null float64
smin    1617624 non-null float64
gbrd    936616 non-null float64
dewp    1617603 non-null float64
tmax    1617623 non-null float64
dmax    1617603 non-null float64
tmin    1617622 non-null float64
dmin    1617584 non-null float64
hmdy    1617624 non-null float64
hmax    1617624 non-null float64
hmin    1617596 non-null float64
wdsp    148967

In [13]:
df_RJ.drop(['prcp'], axis=1, inplace=True)
df_RJ.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617624 entries, 0 to 7032727
Data columns (total 29 columns):
wsid    1617624 non-null int64
wsnm    1617624 non-null object
elvt    1617624 non-null float64
lat     1617624 non-null float64
lon     1617624 non-null float64
inme    1617624 non-null object
city    1617624 non-null object
prov    1617624 non-null object
mdct    1617624 non-null object
date    1617624 non-null object
yr      1617624 non-null int64
mo      1617624 non-null int64
da      1617624 non-null int64
hr      1617624 non-null int64
stp     1617624 non-null float64
smax    1617624 non-null float64
smin    1617624 non-null float64
gbrd    936616 non-null float64
dewp    1617603 non-null float64
tmax    1617623 non-null float64
dmax    1617603 non-null float64
tmin    1617622 non-null float64
dmin    1617584 non-null float64
hmdy    1617624 non-null float64
hmax    1617624 non-null float64
hmin    1617596 non-null float64
wdsp    1489673 non-null float64
wdct    16176

In [14]:
df_RJ.duplicated().value_counts()

False    1617624
dtype: int64

In [15]:
# Remove the unnecessary columns like related to date, time, weather station number, etc.
df_RJ.drop(['wsid', 'wsnm', 'inme', 'city', 'prov', 'mdct', 'date', 'yr', 'mo', 'da', 'hr'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
df_RJ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617624 entries, 0 to 7032727
Data columns (total 18 columns):
elvt    1617624 non-null float64
lat     1617624 non-null float64
lon     1617624 non-null float64
stp     1617624 non-null float64
smax    1617624 non-null float64
smin    1617624 non-null float64
gbrd    936616 non-null float64
dewp    1617603 non-null float64
tmax    1617623 non-null float64
dmax    1617603 non-null float64
tmin    1617622 non-null float64
dmin    1617584 non-null float64
hmdy    1617624 non-null float64
hmax    1617624 non-null float64
hmin    1617596 non-null float64
wdsp    1489673 non-null float64
wdct    1617624 non-null float64
gust    1570748 non-null float64
dtypes: float64(18)
memory usage: 234.5 MB


In [17]:
for missing in df_RJ:
    df_RJ[missing].fillna(df_RJ[missing].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [18]:
df_RJ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617624 entries, 0 to 7032727
Data columns (total 18 columns):
elvt    1617624 non-null float64
lat     1617624 non-null float64
lon     1617624 non-null float64
stp     1617624 non-null float64
smax    1617624 non-null float64
smin    1617624 non-null float64
gbrd    1617624 non-null float64
dewp    1617624 non-null float64
tmax    1617624 non-null float64
dmax    1617624 non-null float64
tmin    1617624 non-null float64
dmin    1617624 non-null float64
hmdy    1617624 non-null float64
hmax    1617624 non-null float64
hmin    1617624 non-null float64
wdsp    1617624 non-null float64
wdct    1617624 non-null float64
gust    1617624 non-null float64
dtypes: float64(18)
memory usage: 234.5 MB


Therefore we have verified that the minimum value of the data in the column yr is 2000 and that is when all the weather stations started operating. Now we have eliminated one condition of cleaning the data i.e. all the weather stations are running from the 
year 2000 till 2016.

Here we are checking if there is any row where all the values are NA so that we can drop it. Becasue that particular row is of no use and can be easily dropped. From the results we find that there is no row where all the rows are NA values.

Here we are checking if there is any column where all the values are NA so that we can drop it. From the results it is clear that there is no column with all NA values.

All the rows are unique and there are no duplicate rows which could be dropped.

With df.count() we are able to see how many entries are present in each column. By which we can get to know how many missin values are present in each column. From the above table it is clearly evident that most of the columns dont have any missing values. But the column Precipitation has very few values in there as compared to the rest of the columns. The column of solar radiation also has some data missing and some other few columns with some data missing.
Now we will have to decide which column data needs to be imputed with some other data or what needs to be done about it.

Now the only column which is not required here is the precipitation column which can be dropped. We will test our model and the results and based on that we can decide whether to include the parameter or not.

Now the data is uniform and all the columns are filled with exact number of rows. We have dropped the column of Precipitation but we can add it later if required and the results are not in our favour. Now, lets focus on the main task of predicting the Temperature (current temperature) using the above data.

### Split the data into train and test data

In [19]:
# Import the libraries to create the training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_RJ, temp_RJ, test_size=0.20, random_state=42)

## Train on Numeric Features

Since there are no missing values here we don't need to fill any columns with any median values.

In [20]:
X_train = X_train.reset_index()

## Random Forest Baseline Model

In [21]:
# Create the baseline model using Random Forest
model_1 = RandomForestRegressor(oob_score=True, random_state=42)

## Out of Bag Score (Built in Cross Validation)

In [22]:
# Fit and Evaluate OOB
model_1 = model_1.fit(X_train, y_train)

# Calculate OOB score
print("The OOB Score is :" + str(model_1.oob_score_))

The OOB Score is :0.919660515384


  warn("Some inputs do not have OOB scores. "


## Cross Validation Score

In [25]:
rf_result = cross_val_score(model_1, X_train, y_train, scoring='neg_mean_absolute_error')

rf_result.mean()

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


-0.14342797670753607