# Pre-Processing and Training Data Development

#### -  Create dummy or indicator features for categorical variables

#### - Standardize the magnitude of numeric features using a scaler

#### - Split your data into testing and training datasets

In [77]:
# Import relevant libraries and packages.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

import warnings # For handling error messages.

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

In [78]:
#Loading and previewing the dataframe
file = 'final_df.csv'
final_df = pd.read_csv(file, index_col=[0,1])
final_df.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,int_perc,electricity_consumed,population,GDP,continent,country_name
Country,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AFG,1990,0.0,1.02114,12412.3,33.5173,Asia,Afghanistan
AFG,1991,0.0,1.04477,13299.0,30.4497,Asia,Afghanistan
AFG,1992,0.0,0.76805,14485.5,30.0136,Asia,Afghanistan
AFG,1993,0.0,0.72961,15816.6,24.2715,Asia,Afghanistan
AFG,1994,0.0,0.75017,17075.7,18.6874,Asia,Afghanistan
AFG,1995,0.0,0.68544,18110.7,24.393,Asia,Afghanistan
AFG,1996,0.0,0.62057,18853.4,25.2627,Asia,Afghanistan
AFG,1997,0.0,0.56407,19357.1,23.9564,Asia,Afghanistan
AFG,1998,0.0,0.5278,19737.8,22.8357,Asia,Afghanistan
AFG,1999,0.0,0.52315,20170.8,21.744,Asia,Afghanistan


## Creating dummy variables for categorical feature, 'continent'

In [5]:
#Manipulating the dataframe to make it easier to work with, is this necessary?
df = final_df.reset_index()
df.set_index('Country', inplace = True)
df

Unnamed: 0_level_0,Date,int_perc,electricity_consumed,population,GDP,continent,country_name
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AFG,1990,0.00,1.02114,12412.3,33.5173,Asia,Afghanistan
AFG,1991,0.00,1.04477,13299.0,30.4497,Asia,Afghanistan
AFG,1992,0.00,0.76805,14485.5,30.0136,Asia,Afghanistan
AFG,1993,0.00,0.72961,15816.6,24.2715,Asia,Afghanistan
AFG,1994,0.00,0.75017,17075.7,18.6874,Asia,Afghanistan
...,...,...,...,...,...,...,...
ZWE,2010,11.50,7.47868,12697.7,26.1448,Africa,Zimbabwe
ZWE,2011,15.70,7.91654,12894.3,29.8557,Africa,Zimbabwe
ZWE,2012,17.09,7.71048,13115.2,34.8313,Africa,Zimbabwe
ZWE,2013,18.50,8.07970,13350.4,35.5243,Africa,Zimbabwe


In [76]:
df.loc['USA']

KeyError: 'USA'

In [7]:
#One-hot encoding to convert 'continents' catagorical data to numerical
df = pd.get_dummies(df, columns = ['continent'])

In [61]:
df = df.reset_index().set_index('Date')
df

Unnamed: 0_level_0,Country,int_perc,electricity_consumed,population,GDP,country_name,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1990,AFG,0.00,1.02114,12412.3,33.5173,Afghanistan,0,1,0,0,0,0
1991,AFG,0.00,1.04477,13299.0,30.4497,Afghanistan,0,1,0,0,0,0
1992,AFG,0.00,0.76805,14485.5,30.0136,Afghanistan,0,1,0,0,0,0
1993,AFG,0.00,0.72961,15816.6,24.2715,Afghanistan,0,1,0,0,0,0
1994,AFG,0.00,0.75017,17075.7,18.6874,Afghanistan,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2010,ZWE,11.50,7.47868,12697.7,26.1448,Zimbabwe,1,0,0,0,0,0
2011,ZWE,15.70,7.91654,12894.3,29.8557,Zimbabwe,1,0,0,0,0,0
2012,ZWE,17.09,7.71048,13115.2,34.8313,Zimbabwe,1,0,0,0,0,0
2013,ZWE,18.50,8.07970,13350.4,35.5243,Zimbabwe,1,0,0,0,0,0


In [87]:
final_df.tail(223)

Unnamed: 0_level_0,Unnamed: 1_level_0,int_perc,electricity_consumed,population,GDP,continent,country_name
Country,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
USA,1990,0.784729,2837.116986,249516.8,9798.6,North America,United States of America
USA,1991,1.163194,2886.097489,252831.2,9787.97,North America,United States of America
USA,1992,1.724203,2897.247672,256391.2,10132.74,North America,United States of America
USA,1993,2.271673,3000.745154,259809.5,10411.67,North America,United States of America
USA,1994,4.862781,3080.938525,263017.8,10831.18,North America,United States of America
USA,1995,9.237088,3164.020666,266179.2,11121.9,North America,United States of America
USA,1996,16.419353,3253.831146,269310.5,11541.47,North America,United States of America
USA,1997,21.616401,3301.923343,272559.2,12054.74,North America,United States of America
USA,1998,30.093197,3425.180366,275762.2,12594.97,North America,United States of America
USA,1999,35.848724,3483.813775,278941.5,13193.66,North America,United States of America


In [88]:
final_df.loc['USA']

Unnamed: 0_level_0,int_perc,electricity_consumed,population,GDP,continent,country_name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990,0.784729,2837.116986,249516.8,9798.6,North America,United States of America
1991,1.163194,2886.097489,252831.2,9787.97,North America,United States of America
1992,1.724203,2897.247672,256391.2,10132.74,North America,United States of America
1993,2.271673,3000.745154,259809.5,10411.67,North America,United States of America
1994,4.862781,3080.938525,263017.8,10831.18,North America,United States of America
1995,9.237088,3164.020666,266179.2,11121.9,North America,United States of America
1996,16.419353,3253.831146,269310.5,11541.47,North America,United States of America
1997,21.616401,3301.923343,272559.2,12054.74,North America,United States of America
1998,30.093197,3425.180366,275762.2,12594.97,North America,United States of America
1999,35.848724,3483.813775,278941.5,13193.66,North America,United States of America


## Splitting the Data in Train/Test Datasets

In [9]:
#Checking dtypes and location for slicing
df.dtypes

Date                         int64
int_perc                   float64
electricity_consumed       float64
population                 float64
GDP                        float64
country_name                object
continent_Africa             uint8
continent_Asia               uint8
continent_Europe             uint8
continent_North America      uint8
continent_Oceania            uint8
continent_South America      uint8
dtype: object

In [49]:
#Seperating features
X = df.iloc[:, [0,1,3,4,6,7,8,9,10,11]].set_index(['Country', 'Date'])

#Selecting for label (target feature), 'electricity_consumed'
y = df.iloc[:, [2]]


KeyError: "None of ['Country'] are in the columns"

In [11]:
X.head()

Unnamed: 0_level_0,Date,int_perc,population,GDP,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AFG,1990,0.0,12412.3,33.5173,0,1,0,0,0,0
AFG,1991,0.0,13299.0,30.4497,0,1,0,0,0,0
AFG,1992,0.0,14485.5,30.0136,0,1,0,0,0,0
AFG,1993,0.0,15816.6,24.2715,0,1,0,0,0,0
AFG,1994,0.0,17075.7,18.6874,0,1,0,0,0,0


In [12]:
y.head()

Unnamed: 0_level_0,electricity_consumed
Country,Unnamed: 1_level_1
AFG,1.02114
AFG,1.04477
AFG,0.76805
AFG,0.72961
AFG,0.75017


In [13]:
#Creating the train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)

In [14]:
#Checking the shape of the train/test data
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(3186, 10) (3186, 1)
(1063, 10) (1063, 1)


## Scaling the X_train data

In [15]:
#Creating the scaler object
scaler = StandardScaler().fit(X_train)

In [16]:
#Checking the mean of the scaler data
scaler.mean_

array([2.00214030e+03, 1.56741247e+01, 3.59428859e+04, 4.25742848e+02,
       3.01004394e-01, 2.13119900e-01, 2.24105461e-01, 1.34965474e-01,
       5.17890772e-02, 7.50156937e-02])

In [17]:
#Checking the variation of the scaler data
scaler.scale_

array([7.17055878e+00, 2.38698691e+01, 1.31572705e+05, 1.40700415e+03,
       4.58694614e-01, 4.09511670e-01, 4.16991851e-01, 3.41686691e-01,
       2.21600922e-01, 2.63416665e-01])

In [18]:
#Transforming the dataset
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 0.81718857,  3.00570879, -0.14811002, ..., -0.39499775,
        -0.23370425, -0.2847796 ],
       [ 0.25935199, -0.57386105,  8.33006448, ..., -0.39499775,
        -0.23370425, -0.2847796 ],
       [ 0.81718857,  0.42756311, -0.0631367 , ..., -0.39499775,
        -0.23370425,  3.51148742],
       ...,
       [-1.27469861, -0.65664896, -0.24027724, ..., -0.39499775,
        -0.23370425, -0.2847796 ],
       [-0.85632118, -0.60238807,  0.02040875, ..., -0.39499775,
        -0.23370425, -0.2847796 ],
       [ 1.37502515,  2.8653645 ,  0.10761156, ..., -0.39499775,
        -0.23370425, -0.2847796 ]])

In [19]:
#Checking the mean of the scaled data
print(X_train_scaled.mean(axis = 0))

[ 1.00961298e-14 -2.78775399e-17  2.23020319e-17 -1.89567271e-17
  2.89926414e-17  4.46040638e-17 -7.80571116e-18  6.57909941e-17
 -8.92081275e-18 -8.14024164e-17]


In [20]:
#Checking that the scaled data has a standard deviation of 1
print(X_train_scaled.std(axis = 0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## Scaling the X_test data

In [21]:
#Creating the scaler object
scaler = StandardScaler().fit(X_test)

#Checking the mean of the scaler data
print(scaler.mean_)

[2.00224835e+03 1.56774781e+01 3.50429739e+04 3.98114601e+02
 3.19849483e-01 2.00376294e-01 2.29539040e-01 1.36406397e-01
 5.64440263e-02 5.73847601e-02]


In [22]:
#Checking the variation of the scaler data
print(scaler.scale_)



[7.12107591e+00 2.38112301e+01 1.40697533e+05 1.38409671e+03
 4.66418043e-01 4.00281944e-01 4.20536407e-01 3.43219597e-01
 2.30777161e-01 2.32576330e-01]


In [23]:
#Transforming the dataset
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled)

[[-0.73701696 -0.65569403 -0.13956871 ... -0.39743184 -0.24458238
  -0.24673517]
 [-0.73701696 -0.65737172 -0.2044196  ... -0.39743184 -0.24458238
  -0.24673517]
 [ 0.24598057  1.603551   -0.17495455 ... -0.39743184 -0.24458238
  -0.24673517]
 ...
 [-1.15830161 -0.6584069  -0.15502741 ... -0.39743184 -0.24458238
  -0.24673517]
 [ 0.10555235  1.68922486  0.33067372 ... -0.39743184 -0.24458238
  -0.24673517]
 [-0.45616052 -0.6500275  -0.03141188 ... -0.39743184 -0.24458238
  -0.24673517]]


In [24]:
#Checking the mean of the scaled data
print(X_test_scaled.mean(axis = 0))

#Checking that the scaled data has a standard deviation of 1
print(X_test_scaled.std(axis = 0))

[-1.03573468e-14 -4.34480506e-17 -1.33686310e-17 -3.34215774e-18
 -4.51191295e-17 -3.67637352e-17  0.00000000e+00 -3.84348140e-17
  3.34215774e-18  6.85142337e-17]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## Scaling the y_train data

In [25]:
#Creating the scaler object
scaler = StandardScaler().fit(y_train)

#Checking the mean of the scaler data
print(scaler.mean_)

[84.0089052]


In [26]:
#Checking the variation of the scaler data
print(scaler.scale_)

[351.34182252]


In [27]:
#Transforming the dataset
y_train_scaled = scaler.transform(y_train)
print(y_train_scaled)

[[ 0.0826892 ]
 [ 1.10144614]
 [-0.00129579]
 ...
 [-0.23852906]
 [ 0.07767454]
 [ 1.13615843]]


In [28]:
#Checking the mean of the scaled data
print(y_train_scaled.mean(axis = 0))

#Checking that the scaled data has a standard deviation of 1
print(y_train_scaled.std(axis = 0))

[6.69060956e-18]
[1.]


## Scaling the y_test data

In [29]:
#Creating the scaler object
scaler = StandardScaler().fit(y_test)

#Checking the mean of the scaler data
print(scaler.mean_)

[81.46693232]


In [30]:
#Checking the variation of the scaler data
print(scaler.scale_)

[350.03756339]


In [31]:
#Transforming the dataset
y_test_scaled = scaler.transform(y_test)
print(y_test_scaled)

[[-0.10431718]
 [-0.23193863]
 [ 0.00503954]
 ...
 [-0.22636408]
 [ 1.32150979]
 [-0.17990907]]


In [32]:
#Checking the mean of the scaled data
print(y_test_scaled.mean(axis = 0))

#Checking that the scaled data has a standard deviation of 1
print(y_test_scaled.std(axis = 0))

[3.34215774e-17]
[1.]


## Questions:

- Do I need to simplify the index in the beginning of the notebook like it did, or can this be done maintaining a multi-index format?

- Should I be approaching the train/test split differently considering that this is a time series problem? 

- What is the best way to save the X_train, X_test, y_train, y_test datasets to open in the next notebook for the project? As seperate csv files? 

In [47]:
tscv = TimeSeriesSplit(n_splits=5)

In [48]:
tscv

TimeSeriesSplit(max_train_size=None, n_splits=5)