# Chapter 1: Google Colab Setup

## Import libraries

In [1]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Report the interpreter and library versions this notebook was run with.
print(
    *(
        f'{label} {version}'
        for label, version in (
            ('Python version: ', sys.version),
            ('NumPy version: ', np.__version__),
            ('Pandas version: ', pd.__version__),
            ('TensorFlow version: ', tf.__version__),
        )
    ),
    sep='\n',
)

Python version:  3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
NumPy version:  1.19.0
Pandas version:  1.0.5
TensorFlow version:  2.2.0


## Manipulating Data with Pandas

Workflow

1. Create a 4-column DataFrame with 10 rows, the first column being a date field and the rest numbers.

2. Fill the first column with the first day of each month for 3 years (for example: 1/1/2018, 2/1/2018).

3. Fill the next 2 columns with random numbers.

4. Fill the 4th column with the difference of the first 2 data columns (for example: Col3 - Col2).

5. Break the DataFrame into 3 different DataFrames based on the dates (for example: 2018, 2019, 2020).

In [18]:
# Rather than starting from an empty DataFrame and growing it row by row,
# prepare each column first and build the DataFrame once from the finished
# columns.

# First column: the first day of every month from 2018 through 2020
df_date = pd.date_range(
    start='1/1/2018',
    end='12/31/2020',
    freq='MS',  # "MS" = month start
)

# Number of dates; the random data columns are sized to match
length = len(df_date)

df_date

DatetimeIndex(['2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01',
               '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01',
               '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01',
               '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01',
               '2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01',
               '2020-05-01', '2020-06-01', '2020-07-01', '2020-08-01',
               '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01'],
              dtype='datetime64[ns]', freq='MS')

In [19]:
# Second & third column: random integers in [0, 20), one value per date.
# A seeded generator is used so that a fresh kernel (Restart & Run All)
# produces the same values every time.
SEED = 42
rng = np.random.default_rng(SEED)
col_a = rng.integers(20, size=length)
col_b = rng.integers(20, size=length)

In [20]:
# Map each column name to its prepared values
my_dict = {
    'Date': df_date,
    'A': col_a,
    'B': col_b,
}

In [21]:
# Build the DataFrame in one step from the column dictionary
df = pd.DataFrame(data=my_dict)

In [22]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Date,A,B
0,2018-01-01,4,4
1,2018-02-01,13,12
2,2018-03-01,0,1
3,2018-04-01,10,3
4,2018-05-01,8,7


In [23]:
# Fourth column: difference of the two data columns (B minus A)
df['C'] = df['B'] - df['A']

In [24]:
# Preview the first five rows, now including column C
df.head(n=5)

Unnamed: 0,Date,A,B,C
0,2018-01-01,4,4,0
1,2018-02-01,13,12,-1
2,2018-03-01,0,1,1
3,2018-04-01,10,3,-7
4,2018-05-01,8,7,-1


In [30]:
# Here is one brute-force way, but I don't like it:
# filter the frame once per hard-coded year.
df_01, df_02, df_03 = (
    df[df['Date'].dt.year == year]
    for year in (2018, 2019, 2020)
)

In [35]:
# Adapted from Stack Overflow:
# https://stackoverflow.com/questions/51072938/how-to-split-dataframe-based-on-years-in-python
# Still not fully satisfied with this approach.
def split_years(dt):
    """Split a DataFrame into one DataFrame per year of its 'Date' column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with a datetime-like 'Date' column. It is NOT modified.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct year, in order of first appearance. Each
        frame holds the original columns plus a 'year' column and keeps
        the original row index.
    """
    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'year' column as a side effect of splitting.
    with_year = dt.assign(year=dt['Date'].dt.year)
    years = with_year['year']
    return [with_year[years == y] for y in years.unique()]

In [37]:
# List of per-year DataFrames, one entry per distinct year in df
DF = split_years(dt=df)

In [52]:
# Unpack the first three per-year pieces into their own DataFrames
df1, df2, df3 = (pd.DataFrame(DF[position]) for position in range(3))

In [53]:
# Display the DataFrame built from the first piece of the split (DF[0])
df1

Unnamed: 0,Date,A,B,C,year
0,2018-01-01,4,4,0,2018
1,2018-02-01,13,12,-1,2018
2,2018-03-01,0,1,1,2018
3,2018-04-01,10,3,-7,2018
4,2018-05-01,8,7,-1,2018
5,2018-06-01,2,9,7,2018
6,2018-07-01,16,16,0,2018
7,2018-08-01,17,15,-2,2018
8,2018-09-01,13,15,2,2018
9,2018-10-01,3,7,4,2018


In [54]:
# Display the DataFrame built from the second piece of the split (DF[1])
df2

Unnamed: 0,Date,A,B,C,year
12,2019-01-01,9,10,1,2019
13,2019-02-01,12,16,4,2019
14,2019-03-01,0,4,4,2019
15,2019-04-01,7,3,-4,2019
16,2019-05-01,3,16,13,2019
17,2019-06-01,9,1,-8,2019
18,2019-07-01,4,15,11,2019
19,2019-08-01,19,4,-15,2019
20,2019-09-01,2,16,14,2019
21,2019-10-01,7,3,-4,2019


In [58]:
# Display the DataFrame built from the third piece of the split (DF[2])
df3

Unnamed: 0,Date,A,B,C,year
24,2020-01-01,3,3,0,2020
25,2020-02-01,17,15,-2,2020
26,2020-03-01,10,3,-7,2020
27,2020-04-01,7,18,11,2020
28,2020-05-01,7,13,6,2020
29,2020-06-01,5,16,11,2020
30,2020-07-01,16,6,-10,2020
31,2020-08-01,3,2,-1,2020
32,2020-09-01,7,1,-6,2020
33,2020-10-01,12,10,-2,2020
