# Setup

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics  #to include metrics for evaluation 
from sklearn.model_selection import train_test_split
%matplotlib inline

# Fed funds rate data: add a column denoting rate changed or not from previous month

In [15]:
ffr_df = pd.read_csv('fedfundsrates/fed_funds_target_rates_Dec2018_Sep1982.csv')

In [16]:
# Check if it renders correctly
ffr_df.head()

Unnamed: 0,DATE,DFEDTAR_20081215
0,9/27/82,10.25
1,9/28/82,10.25
2,9/29/82,10.25
3,9/30/82,10.25
4,10/1/82,10.0


In [17]:
# Info about the data
ffr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9587 entries, 0 to 9586
Data columns (total 2 columns):
DATE                9587 non-null object
DFEDTAR_20081215    9587 non-null float64
dtypes: float64(1), object(1)
memory usage: 149.9+ KB


In [19]:
# Add a new column that denotes how much the rate changed from the previous month
ffr_df = pd.read_csv('fedfundsrates/fed_funds_target_rates_Dec2018_Sep1982.csv')
df = pd.DataFrame(ffr_df)
df['Diff'] = df['DFEDTAR_20081215'].diff(1)
print(df) 


          DATE  DFEDTAR_20081215    Diff
0      9/27/82           10.2500     NaN
1      9/28/82           10.2500  0.0000
2      9/29/82           10.2500  0.0000
3      9/30/82           10.2500  0.0000
4      10/1/82           10.0000 -0.2500
5      10/2/82           10.0000  0.0000
6      10/3/82           10.0000  0.0000
7      10/4/82           10.0000  0.0000
8      10/5/82           10.0000  0.0000
9      10/6/82           10.0000  0.0000
10     10/7/82            9.5000 -0.5000
11     10/8/82            9.5000  0.0000
12     10/9/82            9.5000  0.0000
13    10/10/82            9.5000  0.0000
14    10/11/82            9.5000  0.0000
15    10/12/82            9.5000  0.0000
16    10/13/82            9.5000  0.0000
17    10/14/82            9.5000  0.0000
18    10/15/82            9.5000  0.0000
19    10/16/82            9.5000  0.0000
20    10/17/82            9.5000  0.0000
21    10/18/82            9.5000  0.0000
22    10/19/82            9.5000  0.0000
23    10/20/82  

In [20]:
# Add a new column to denote if a rate
# changed based on the previous month
# 1 is changed; 0 is did not change
df['Changed'] = np.where(df['Diff']!=0.0000, 1, 0)
print(df) 

          DATE  DFEDTAR_20081215    Diff  Changed
0      9/27/82           10.2500     NaN        1
1      9/28/82           10.2500  0.0000        0
2      9/29/82           10.2500  0.0000        0
3      9/30/82           10.2500  0.0000        0
4      10/1/82           10.0000 -0.2500        1
5      10/2/82           10.0000  0.0000        0
6      10/3/82           10.0000  0.0000        0
7      10/4/82           10.0000  0.0000        0
8      10/5/82           10.0000  0.0000        0
9      10/6/82           10.0000  0.0000        0
10     10/7/82            9.5000 -0.5000        1
11     10/8/82            9.5000  0.0000        0
12     10/9/82            9.5000  0.0000        0
13    10/10/82            9.5000  0.0000        0
14    10/11/82            9.5000  0.0000        0
15    10/12/82            9.5000  0.0000        0
16    10/13/82            9.5000  0.0000        0
17    10/14/82            9.5000  0.0000        0
18    10/15/82            9.5000  0.0000        0


# Text data: Create a dataframe that includes the cleaned text doc content and adds a column for the date of the document taken from the file name

In [21]:
# Get a list of filenames from the local cleaned statements directory using glob library 
import glob
filenames = glob.glob('statements/statements.clean/*.txt')

# Initialize an empty list
# Then loop through the file list, and create a dictionary with the file dates and content
records = []
for filename in filenames:   
    data = open(filename).read()
    date = filename.split('/')[-1].split('.')[-2]
    record = {'filedate':date,'document':data}
    records.append(record)

In [22]:
docdf = pd.DataFrame(records)
docdf.head()

Unnamed: 0,document,filedate
0,fomc decid today rais target fedfundsr bas poi...,20051213
1,fomc meet today decid low target fedfundsr bas...,20010515
2,fomc decid today rais target fedfundsr bas poi...,20041214
3,fomc decid today leav target fedfundsr unchang...,19991005
4,fomc meet today decid maintain ex stant monet ...,20001219


In [25]:
# Modify filedate string to a timestamp
docdf.filedate = pd.to_datetime(docdf.filedate, format='%Y%m%d')
docdf.head()

Unnamed: 0,document,filedate
0,fomc decid today rais target fedfundsr bas poi...,2005-12-13
1,fomc meet today decid low target fedfundsr bas...,2001-05-15
2,fomc decid today rais target fedfundsr bas poi...,2004-12-14
3,fomc decid today leav target fedfundsr unchang...,1999-10-05
4,fomc meet today decid maintain ex stant monet ...,2000-12-19
