# Data ingestion and inspection

In [20]:
# Indexes and columns
# Load the AAPL price table and confirm what kind of object pandas returns.

import pandas as pd

# The path is relative to the notebook's working directory.
AAPL = pd.read_csv('./datasets/AAPL.csv')

# Last expression in the cell, so its value is echoed as the cell output.
type(AAPL)

pandas.core.frame.DataFrame

In [21]:
# (number of rows, number of columns) of the DataFrame.
AAPL.shape

(378, 6)

In [22]:
# Column labels of the DataFrame.
AAPL.columns

Index(['date', 'close', 'volume', 'open', 'high', 'low'], dtype='object')

In [23]:
# The column labels are themselves held in a pandas Index object.
type(AAPL.columns)

pandas.core.indexes.base.Index

In [24]:
# Row labels. No index column was specified when loading, so pandas
# generated a default integer range.
AAPL.index

RangeIndex(start=0, stop=378, step=1)

In [25]:
# Class of the row-label object.
type(AAPL.index)

pandas.core.indexes.range.RangeIndex

In [26]:
# Slicing
# Positional selection: first five rows, every column.
AAPL.iloc[:5,:]

Unnamed: 0,date,close,volume,open,high,low
0,11:41,174.27,15925321.0,172.93,175.08,172.3501
1,2019/02/04,171.25,31384720.0,167.41,171.655,167.28
2,2019/02/01,166.52,32644590.0,166.96,168.98,165.93
3,2019/01/31,166.44,40613260.0,166.11,169.0,164.56
4,2019/01/30,165.25,60800480.0,163.25,166.15,160.23


In [27]:
# Slicing
# Positional selection with a negative start: last five rows, every column.
AAPL.iloc[-5:,:]

Unnamed: 0,date,close,volume,open,high,low
373,2017/08/10,155.32,39636190.0,159.9,160.0,154.63
374,2017/08/09,161.06,26060430.0,159.26,161.27,159.11
375,2017/08/08,160.08,36127490.0,158.6,161.83,158.27
376,2017/08/07,158.81,21827400.0,157.06,158.92,156.6701
377,2017/08/04,156.39,20514810.0,156.07,157.4,155.69


In [28]:
# head()
# First five rows; the same rows as the positional slice AAPL.iloc[:5,:].
AAPL.head(5)

Unnamed: 0,date,close,volume,open,high,low
0,11:41,174.27,15925321.0,172.93,175.08,172.3501
1,2019/02/04,171.25,31384720.0,167.41,171.655,167.28
2,2019/02/01,166.52,32644590.0,166.96,168.98,165.93
3,2019/01/31,166.44,40613260.0,166.11,169.0,164.56
4,2019/01/30,165.25,60800480.0,163.25,166.15,160.23


In [29]:
# head(n) returns the first n rows -- here two.
AAPL.head(2)

Unnamed: 0,date,close,volume,open,high,low
0,11:41,174.27,15925321.0,172.93,175.08,172.3501
1,2019/02/04,171.25,31384720.0,167.41,171.655,167.28


In [30]:
# tail()
# Called without an argument, tail() returns the last five rows.
AAPL.tail()

Unnamed: 0,date,close,volume,open,high,low
373,2017/08/10,155.32,39636190.0,159.9,160.0,154.63
374,2017/08/09,161.06,26060430.0,159.26,161.27,159.11
375,2017/08/08,160.08,36127490.0,158.6,161.83,158.27
376,2017/08/07,158.81,21827400.0,157.06,158.92,156.6701
377,2017/08/04,156.39,20514810.0,156.07,157.4,155.69


In [31]:
# tail(n) returns the last n rows -- here three.
AAPL.tail(3)

Unnamed: 0,date,close,volume,open,high,low
375,2017/08/08,160.08,36127490.0,158.6,161.83,158.27
376,2017/08/07,158.81,21827400.0,157.06,158.92,156.6701
377,2017/08/04,156.39,20514810.0,156.07,157.4,155.69


In [32]:
# info()
# Prints the index range, per-column non-null counts and dtypes, and memory usage.
AAPL.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 6 columns):
date      378 non-null object
close     378 non-null float64
volume    378 non-null object
open      378 non-null float64
high      378 non-null float64
low       378 non-null float64
dtypes: float64(4), object(2)
memory usage: 17.8+ KB


In [33]:
# Broadcasting
import numpy as np

# Assign one scalar to a strided positional slice: every third row
# (0, 3, 6, ...) of the last column. The single NaN is broadcast to all
# selected cells.
# NOTE: this mutates AAPL in place, so tables displayed by earlier cells
# no longer reflect the current contents of the frame.
AAPL.iloc[::3,-1] = np.nan

In [34]:
# Six rows are enough to show two of the overwritten cells (rows 0 and 3).
AAPL.head(6)

Unnamed: 0,date,close,volume,open,high,low
0,11:41,174.27,15925321.0,172.93,175.08,
1,2019/02/04,171.25,31384720.0,167.41,171.655,167.28
2,2019/02/01,166.52,32644590.0,166.96,168.98,165.93
3,2019/01/31,166.44,40613260.0,166.11,169.0,
4,2019/01/30,165.25,60800480.0,163.25,166.15,160.23
5,2019/01/29,154.68,39914850.0,156.25,158.13,154.11


In [35]:
# Re-run the summary to see the non-null count of the modified column drop.
AAPL.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 6 columns):
date      378 non-null object
close     378 non-null float64
volume    378 non-null object
open      378 non-null float64
high      378 non-null float64
low       252 non-null float64
dtypes: float64(4), object(2)
memory usage: 17.8+ KB


In [36]:
# Series

# Selecting a single column by label yields a one-dimensional Series.
low = AAPL['low']

type(low)

pandas.core.series.Series

In [37]:
# head() works on a Series too and defaults to the first five entries.
low.head()

0       NaN
1    167.28
2    165.93
3       NaN
4    160.23
Name: low, dtype: float64

In [39]:
# .values exposes the data held by the Series as a NumPy array.
lows = low.values
type(lows)

numpy.ndarray

### Inspecting your data
You can use the DataFrame methods .head() and .tail() to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as pd and loaded population data from 1960 to 2014 as a DataFrame df. This dataset was obtained from the [World Bank](https://databank.worldbank.org/data/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL.IN.ZS#).

Your job is to use df.head() and df.tail() to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the 'Year' and 'Total Population' columns.

* First: 1980, 26183676.0; Last: 2000, 35.
* First: 1960, 92495902.0; Last: 2014, 15245855.0.
* First: 40.472, 2001; Last: 44.5, 1880.
* First: CSS, 104170.0; Last: USA, 95.203.

In [42]:
import pandas as pd

df = pd.read_csv('./datasets/world_dev_ind.csv')

# Disabled: uncomment for a per-column dtype / non-null summary.
#df.info()
# print() is used so that both one-row frames appear in the cell's output;
# only a cell's final bare expression is echoed automatically.
print(df.head(1))
print(df.tail(1))

  CountryName CountryCode  Year  Total Population  \
0  Arab World         ARB  1960        92495902.0   

   Urban population (% of total)  
0                      31.285384  
      CountryName CountryCode  Year  Total Population  \
13373    Zimbabwe         ZWE  2014        15245855.0   

       Urban population (% of total)  
13373                         32.501  


### DataFrame data types
Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and NaN ('Not-a-Number') values, which often indicate missing data. In this exercise, we have imported pandas as pd and read in the world population data, which contains some NaN values; NaN is often used as a placeholder for missing or otherwise invalid data entries. Your job is to use df.info() to find the total count of non-null entries and, from it, infer the total count of 'null' entries, which likely indicate missing data. Select the best description of this data set from the following:

* The data is all of type float64 and none of it is missing.
* The data is of mixed type, and 9914 of it is missing.
* The data is of mixed type, and 3460 float64s are missing.
* The data is all of type float64, and 3460 float64s are missing.

In [47]:
# Rebinds df to a different CSV (world_dev_ind2.csv) than the one loaded
# for the previous exercise.
df = pd.read_csv('./datasets/world_dev_ind2.csv')
# Compare each column's non-null count with the total number of entries
# to see how many values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13374 entries, 0 to 13373
Data columns (total 5 columns):
CountryName                      13374 non-null object
CountryCode                      13374 non-null object
Year                             13374 non-null int64
Total Population                 9914 non-null float64
Urban population (% of total)    13374 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 522.5+ KB


### NumPy and pandas working together
Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute .values to represent a DataFrame df as a NumPy array. You can also pass pandas data structures to NumPy functions. In this exercise, we have imported pandas as pd and loaded world population data every 10 years since 1960 into the DataFrame df. This dataset was derived from the one used in the previous exercise.

Your job is to extract the values and store them in an array using the attribute .values. You'll then pass those values to the NumPy function np.log10() to compute the base 10 logarithm of the population values. Finally, you will pass the entire pandas DataFrame into the same np.log10() function and compare the results.

* Import numpy using the standard alias np.
* Assign the numerical values in the DataFrame df to an array np_vals using the attribute values.
* Pass np_vals into the NumPy function log10() and store the results in np_vals_log10.
* Pass the entire df DataFrame into the NumPy function log10() and store the results in df_log10.
* Inspect the output of the print() code to see the type() of the variables that you created.

In [52]:
import pandas as pd

df = pd.read_csv('./datasets/world_population.csv')

# Import numpy
import numpy as np

# Create array of DataFrame values: np_vals
np_vals = df.values

# Create new array of base 10 logarithm values: np_vals_log10
np_vals_log10 = np.log10(np_vals)

# Create new DataFrame of base 10 logarithm values by passing df to np.log10(): df_log10
df_log10 = np.log10(df)

# Print original and new data containers.
# An explicit loop is used because printing is a side effect: no throwaway
# list of None is built, and each object is referenced directly instead of
# being looked up from its name with eval().
for label, container in [
    ('np_vals', np_vals),
    ('np_vals_log10', np_vals_log10),
    ('df', df),
    ('df_log10', df_log10),
]:
    print(label, 'has type', type(container))

# The bare string below is the cell's final expression, so the notebook
# echoes it as the cell output.
'''As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.'''

np_vals has type <class 'numpy.ndarray'>
np_vals_log10 has type <class 'numpy.ndarray'>
df has type <class 'pandas.core.frame.DataFrame'>
df_log10 has type <class 'pandas.core.frame.DataFrame'>


"As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful."