In [1]:
# these three libraries are the usual imports for data analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Use Columns for Indexing

In [9]:
# Load the GOOGL sheet of the workbook. read_excel would otherwise pick the
# first sheet, so the sheet is named explicitly. The 'Date' column becomes the
# index, and parse_dates=True asks pandas to parse that index as dates.
df_GOOG = pd.read_excel(
    'data/market_data.xls',
    sheet_name='GOOGL',
    index_col='Date',
    parse_dates=True,
)
df_GOOG.head()  # first rows only - a quick check that the right sheet was loaded

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,52.082081,48.028027,50.050049,50.220219,44659000,50.220219
2004-08-20,54.594593,50.300301,50.555557,54.209209,22834300,54.209209
2004-08-23,56.796795,54.579578,55.430431,54.754753,18256100,54.754753
2004-08-24,55.855854,51.836838,55.675674,52.487488,15247300,52.487488
2004-08-25,54.054054,51.991993,52.532532,53.053055,9188600,53.053055


In [22]:
# Because the index holds dates, .loc accepts a partial date string and
# returns every row that falls inside that period.
# Only the last expression of a cell is rendered, so the first two lookups
# below are evaluated but not displayed.
df_GOOG.loc['2010']      # every row dated in 2010
df_GOOG.loc['2010-05']   # every row dated in May 2010
df_GOOG.loc['2010 May']  # May 2010 again, spelled in a different date format
# df_GOOG.loc['2010-05-26']  # a full date string targets one specific day (26 May 2010)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-05-03,266.726715,262.802795,263.513519,265.565552,3711800,265.565552
2010-05-04,263.633636,252.357361,263.523529,253.438446,12140400,253.438446
2010-05-05,258.118103,250.485489,250.740738,255.135132,9155200,255.135132
2010-05-06,259.019012,230.230225,254.629623,249.584579,9990100,249.584579
2010-05-07,252.912918,240.905899,250.235229,246.816818,10167800,246.816818
2010-05-10,261.671661,256.556549,257.242249,261.08609,8247700,261.08609
2010-05-11,260.200195,254.364365,258.093079,254.779785,6638500,254.779785
2010-05-12,256.276276,251.251251,256.276276,252.947952,7695800,252.947952
2010-05-13,261.261261,255.440445,258.508514,255.695694,6644900,255.695694
2010-05-14,255.750748,248.373367,255.140137,254.019012,8223700,254.019012


In [25]:
# .loc selects rows by index label (here, the dates).
# .iloc selects rows by integer position, counting from zero, whatever the
# labels happen to be - the positional order still exists underneath.
df_GOOG.iloc[:4]  # positions 0, 1, 2 and 3, i.e. the first four rows

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,52.082081,48.028027,50.050049,50.220219,44659000,50.220219
2004-08-20,54.594593,50.300301,50.555557,54.209209,22834300,54.209209
2004-08-23,56.796795,54.579578,55.430431,54.754753,18256100,54.754753
2004-08-24,55.855854,51.836838,55.675674,52.487488,15247300,52.487488


### Import several separate sheets, then combine them

In [28]:
# Read the IBM and MSFT sheets with a single read_excel call so the workbook
# is opened once instead of once per sheet. Passing a list to sheet_name makes
# read_excel return a dict of DataFrames keyed by sheet name; the index_col and
# parse_dates options apply to every sheet in the list.
extra_sheets = pd.read_excel(
    'data/market_data.xls',
    sheet_name=['IBM', 'MSFT'],
    index_col='Date',
    parse_dates=True,
)
df_IBM = extra_sheets['IBM']
df_MSFT = extra_sheets['MSFT']

In [32]:
# Gather the 'Volume' column of each of the three DataFrames into one new DataFrame.
# The IBM series supplies the index (the dates of the IBM sheet); the MSFT and
# GOOGL series are then aligned to those dates, and any date a series does not
# have shows up as NaN in its column.
df_ALL = df_IBM['Volume'].to_frame(name='IBM').assign(
    MSFT=df_MSFT['Volume'],
    GOOGL=df_GOOG['Volume'],
)
df_ALL

Unnamed: 0_level_0,IBM,MSFT,GOOGL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-03,10347700,53228400,
2000-01-04,8227800,54119000,
2000-01-05,12733200,64059600,
2000-01-06,7971900,54976600,
2000-01-07,11856700,62013600,
...,...,...,...
2019-12-19,3866500,24958900,1446100.0
2019-12-20,7111800,53477500,2504500.0
2019-12-23,2803200,17718200,996800.0
2019-12-24,1202100,8989200,673400.0


## Techniques for cleaning data

In [57]:
# Strategy 1: fill the gaps with a placeholder.
# fillna returns a NEW DataFrame and leaves df_ALL untouched, so the result
# must be assigned to a name if we want to keep it.
# CAUTION - every such copy costs memory; avoid keeping many large DataFrames.
# Also note: a column that receives the string '-' can no longer stay a float
# column - pandas stores it as 'object'.
df_dash = df_ALL.fillna('-')

# Strategy 2: fill the gaps with an average.
# Selecting with a list ([['Volume']]) keeps a one-column DataFrame, so .mean()
# returns a Series with a single entry labelled 'Volume' rather than a bare number.
mean_goog = df_GOOG[['Volume']].mean()

# Fill only the GOOGL column with GOOGL's mean. A dict passed to fillna maps
# column name -> fill value; a bare scalar here would also write Google's
# average into any gap in the IBM or MSFT columns.
# This is better than the placeholder - the column still holds numbers.
df_ALL.fillna({'GOOGL': mean_goog['Volume']})
# other strategies:
# - replace with zero
# - replace with a representative value (e.g. commonest or similar)
# - leave as NaN - pandas statistics skip NaN by default, so they still work

Unnamed: 0_level_0,IBM,MSFT,GOOGL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-03,10347700,53228400,7.126476e+06
2000-01-04,8227800,54119000,7.126476e+06
2000-01-05,12733200,64059600,7.126476e+06
2000-01-06,7971900,54976600,7.126476e+06
2000-01-07,11856700,62013600,7.126476e+06
...,...,...,...
2019-12-19,3866500,24958900,1.446100e+06
2019-12-20,7111800,53477500,2.504500e+06
2019-12-23,2803200,17718200,9.968000e+05
2019-12-24,1202100,8989200,6.734000e+05
