In [1]:
import datetime

import numpy as np
import pandas as pd
import pandas_datareader as pdr  # IF NECESSARY, from terminal: pip install pandas_datareader
from numpy.random import default_rng

pd.set_option("display.max_rows", 10)  # display option for pandas
# more here: https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html

## A quick hit of Numpy

In [2]:
# create a random vector 
# every run of this --> diff #s
# see 3.2.2.2 in the textbook for why, 
# and how to prevent

rg = default_rng()
myray = rg.standard_normal(5)
print("myray:", myray) 

myray: [ 1.63135973 -1.24288834  0.15216388 -1.4352827   0.20090536]


In [3]:
# q1 - indexing, pick the odd elements

myray[::2]


array([1.63135973, 0.15216388, 0.20090536])

Booleans arrays: Asking a logic test on an array, returns the array, where each element has been tested against that logic and converted to the boolean answer.

In [5]:
# q2 - (a) boolean array + (b) "masking"

# (a)
# myray[[1,3,4]] # magic nums suck. Don't wirk when the random array is build again
myray > 0 # this tests every element in the array against logic

# (b)

mask = myray > 0 
print(mask) # a mask is a boolean array of trues and falses
print(myray[mask]) # if the mask 

#one shot version:
myray[myray > 0]


[ True False  True False  True]
[1.63135973 0.15216388 0.20090536]


array([1.63135973, 0.15216388, 0.20090536])

What you just learned about masking and filtering can be done with dataframes!

## The main event - Pandas

Vocab
- series
- index 
- dataframe
- columns and names
- rows and index 
- multiindex 

In [21]:
start = datetime.datetime(2000, 1, 1) # you can specify start and end dates this way
end = datetime.datetime(2021, 1, 27)
macro_df = pdr.data.DataReader(['GDP','CPIAUCSL','UNRATE'], 'fred', start, end)


In [19]:
# preview the dataframe

macro_df # default prints top and bottom 5
macro_df.head(20) # prints top 20
macro_df.tail(10) # bottom 10

macro_df[30:40] # slice a list


macro_df.sample(20) # with big data, nice. Pulls random 20
macro_df.sample(frac = .1)

# if installed variable inspecter, right click , oper var inspector, pick the var, look

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-01,,252.182,3.7
2019-07-01,21717.171,255.802,3.7
2018-11-01,,252.594,3.8
2019-09-01,,256.43,3.5
2018-08-01,,251.663,3.8


In [22]:
# shape

macro_df.shape

(253, 3)

In [23]:
# variable types

macro_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 253 entries, 2000-01-01 to 2021-01-01
Freq: MS
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   GDP       85 non-null     float64
 1   CPIAUCSL  253 non-null    float64
 2   UNRATE    253 non-null    float64
dtypes: float64(3)
memory usage: 7.9 KB


In [None]:
# look at top X rows - head



In [None]:
# look at bottom X rows - tail

In [25]:
# grab one variable

macro_df['GDP'] # name of var
macro_df['GDP'] + 1 # change the vars

DATE
2000-01-01    10003.179
2000-02-01          NaN
2000-03-01          NaN
2000-04-01    10248.720
2000-05-01          NaN
                ...    
2020-09-01          NaN
2020-10-01    22069.767
2020-11-01          NaN
2020-12-01          NaN
2021-01-01    22657.793
Freq: MS, Name: GDP, Length: 253, dtype: float64

In [26]:
# grab two (or more) variables
macro_df[['GDP','UNRATE']] # give it a list

Unnamed: 0_level_0,GDP,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,10002.179,4.0
2000-02-01,,4.1
2000-03-01,,4.0
2000-04-01,10247.720,3.8
2000-05-01,,4.0
...,...,...
2020-09-01,,7.8
2020-10-01,22068.767,6.9
2020-11-01,,6.7
2020-12-01,,6.7


In [27]:
# see column names

macro_df.columns

Index(['GDP', 'CPIAUCSL', 'UNRATE'], dtype='object')

In [28]:
# change column names
# couple ways
new_names = ['GDP', 'CPI', 'UNRATE']
macro_df.columns = new_names
macro_df

Unnamed: 0_level_0,GDP,CPI,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,10002.179,169.300,4.0
2000-02-01,,170.000,4.1
2000-03-01,,171.000,4.0
2000-04-01,10247.720,170.900,3.8
2000-05-01,,171.200,4.0
...,...,...,...
2020-09-01,,259.951,7.8
2020-10-01,22068.767,260.249,6.9
2020-11-01,,260.895,6.7
2020-12-01,,262.005,6.7


Series = Single Column
index = stuff to the left in bold, row names, can be multiple var
dataframe = 
columns and

In [29]:
# see index
macro_df.index

DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
               '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',
               '2000-09-01', '2000-10-01',
               ...
               '2020-04-01', '2020-05-01', '2020-06-01', '2020-07-01',
               '2020-08-01', '2020-09-01', '2020-10-01', '2020-11-01',
               '2020-12-01', '2021-01-01'],
              dtype='datetime64[ns]', name='DATE', length=253, freq='MS')

In [35]:
# reset_index() and set_index()

macro_df.reset_index().set_index('DATE')



Unnamed: 0_level_0,GDP,CPI,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,10002.179,169.300,4.0
2000-02-01,,170.000,4.1
2000-03-01,,171.000,4.0
2000-04-01,10247.720,170.900,3.8
2000-05-01,,171.200,4.0
...,...,...,...
2020-09-01,,259.951,7.8
2020-10-01,22068.767,260.249,6.9
2020-11-01,,260.895,6.7
2020-12-01,,262.005,6.7


In [36]:
macro_df.reset_index()
macro_df

Unnamed: 0_level_0,GDP,CPI,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,10002.179,169.300,4.0
2000-02-01,,170.000,4.1
2000-03-01,,171.000,4.0
2000-04-01,10247.720,170.900,3.8
2000-05-01,,171.200,4.0
...,...,...,...
2020-09-01,,259.951,7.8
2020-10-01,22068.767,260.249,6.9
2020-11-01,,260.895,6.7
2020-12-01,,262.005,6.7


In [None]:
# grab some rows (by position)
macro_df[20:30]

In [37]:
# grab some rows (by value)

mask = macro_df['UNRATE'] > 6
macro_df[mask]

Unnamed: 0_level_0,GDP,CPI,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-05-01,,182.900,6.1
2003-06-01,,183.100,6.3
2003-07-01,11566.669,183.700,6.2
2003-08-01,,184.500,6.1
2003-09-01,,185.100,6.1
...,...,...,...
2020-09-01,,259.951,7.8
2020-10-01,22068.767,260.249,6.9
2020-11-01,,260.895,6.7
2020-12-01,,262.005,6.7


In [41]:
# create a variable

macro_df['HIGH'] = mask # df['NAME'] = stuff

df = df.assign...

#2.
macro_df.assign(high2 = macro_df['UNRATE'] > 6) # doesn't show, wasnt made permanant


macro_df

SyntaxError: invalid syntax (1223474396.py, line 5)

## EDA

Stop here. Back to the lecture. 

### Part 1

Q0: Do each of the [EDA golden rules for initial data exploration](https://ledatascifi.github.io/ledatascifi-2025/content/03/02e_eda_golden.html) and write down your observations.
- **Important: What is the "key" or "unit level" that observations in this database describes?** 
    - This is defined in 3.2.5 and discussed in [3.2.2.2 via example](https://ledatascifi.github.io/ledatascifi-2025/content/03/02b_pandasVocab.html#the-shape-of-data)
    - The "key" levels in databases we will look at are often increments of time in the data, the type of entity describe (e.g. firm, person, state, country, industry), and combinations of entity and time. 
    - E.g. "firm" level, "firm-year" level

Q1: What is the second series above?

Q2: What is the frequency of the series?

Q3: What is the average ANNUAL GDP, based on the data?

## Part 2

Q4: Download the annual *real* gdp from 1960 to 2018 from FRED and compute the average annual percent change

Q5: Compute the average gdp percent change within *each decade*


## Part 3

First, I'll load January data on unemployment, the Case-Shiller housing index, and median household income in three states (CA/MI/PA). 

In [None]:
# LOAD DATA AND CONVERT TO ANNUAL

start = 1990 # pandas datareader can infer these are years
end = 2018
macro_data = pdr.data.DataReader(['CAUR','MIUR','PAUR', # unemployment 
                                  'LXXRSA','DEXRSA','WDXRSA', # case shiller index in LA, Detroit, DC (no PA  available!)
                                  'MEHOINUSCAA672N','MEHOINUSMIA672N','MEHOINUSPAA672N'], #  
                                 'fred', start, end)
macro_data = macro_data.resample('Y').first() # get's the first observation for each variable in a given year

# CLEAN UP THE FORMATING SOMEWHAT

macro_data.index = macro_data.index.year
print("\n\n DATA BEFORE FORMATTING: \n\n")
print(macro_data[:20]) # see how the data looks now? ugly variable names, but its an annual dataset at least
macro_data.columns=pd.MultiIndex.from_tuples([
    ('Unemployment','CA'),('Unemployment','MI'),('Unemployment','PA'),
    ('HouseIdx','CA'),('HouseIdx','MI'),('HouseIdx','PA'),
    ('MedIncome','CA'),('MedIncome','MI'),('MedIncome','PA')
    ])
print("\n\n DATA AFTER FORMATTING: \n\n")
print(macro_data[:20]) # this is a dataset that is "wide", and now 
                       # the column variable names have 2 levels - var name, 
                       # and unit/state that variable applies to


Q6: for each decade and state, report the average annual CHANGE (level, not percent) in unemployment

Q7: for each decade and state, report the average annual PERCENT CHANGE in house prices and household income