In [None]:
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data Cleaning

### We need to decide what kind of time interval we want to use for our predictions. Annual predictions? Monthly? Weekly?

### Who knows, maybe SARIMA models work well on quarterly/annual time scales, while an LSTM does the best job predicting 1 week out

### Comparing both long-term and short-term predictions would be interesting

### Open question: should we train and evaluate with a rolling window?

In [None]:
# Read the raw price history for each commodity.
# header=2 tells pandas to take the column names from the third line of each file.
csv_paths = ['data/corn.csv', 'data/crude.csv', 'data/wheat.csv']
corn_raw, oil_raw, wheat_raw = [pd.read_csv(path, header=2) for path in csv_paths]

In [None]:
# Helper that turns one raw price file into a tidy (Date, <label>) frame
def process_data(df, label, start_date='1980-01-01', end_date='2021-11-01'):
    """Clean one raw commodity frame and label its price column.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame containing at least the columns 'Date', 'Ticker' and 'Close'.
        It is not modified; all work happens on a copy.
    label : str
        New name for the 'Close' column in the returned frame.
    start_date, end_date : str or datetime-like, optional
        Inclusive bounds of the date range to keep.

    Returns
    -------
    pd.DataFrame
        Rows with start_date <= Date <= end_date, missing values forward-filled,
        'Ticker' dropped and 'Close' renamed to `label`.

    Raises
    ------
    KeyError
        If 'Date' or 'Ticker' is missing from `df`.
    """
    # Copy first so the caller's raw frame is untouched and the cell can be re-run safely
    df = df.copy()

    # Convert the Date column from a string to datetime format
    df['Date'] = pd.to_datetime(df['Date'])

    # Keep only the rows inside the inclusive [start_date, end_date] window
    in_range = (df['Date'] >= pd.to_datetime(start_date)) & (df['Date'] <= pd.to_datetime(end_date))
    df = df.loc[in_range]

    # Carry old prices forward so we never fill a gap with future information.
    # NOTE(review): this assumes the rows are in chronological order - confirm for the CSVs.
    df = df.ffill()

    # Drop the 'Ticker' column and rename 'Close' with the passed label
    df = df.drop(columns=['Ticker'])
    df = df.rename(columns={'Close': label})

    return df

In [None]:
# Process each raw series into a (Date, <label>) frame.
# The crude oil file was loaded as `oil_raw`, so that is the name used here.
CORN = process_data(corn_raw, 'CORN')
OIL = process_data(oil_raw, 'OIL')
WHEAT = process_data(wheat_raw, 'WHEAT')

# Merge all of the data into one large dataframe.
# The outer join keeps every date that appears in any series; a series with no
# row for a given date gets NaN there.
dfs = [CORN, OIL, WHEAT]
data = reduce(lambda left, right: pd.merge(left, right, on=['Date'], how='outer'), dfs)

# Sort the data chronologically (oldest first) and renumber the rows
data = data.sort_values(by='Date', ascending=True).reset_index(drop=True)

data

In [None]:
def slice_datasets(data, start_date, end_date):
    """Cut one date window out of a dated array and return inputs and targets.

    Parameters
    ----------
    data : np.ndarray
        2-D array whose column 0 holds dates and whose remaining columns are
        numeric. Rows are expected to be in ascending date order.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the window, parsed with ``pd.to_datetime``. They do
        not have to appear in `data`; the window covers every row whose date
        falls between them.

    Returns
    -------
    X_data : np.ndarray (float32)
        All non-date columns for the rows in the window.
    y_data : np.ndarray (float32)
        Column 1 of `data` for the same rows.

    Raises
    ------
    ValueError
        If no row falls inside the requested window.
    """
    # Parse the date column once so it can be compared against the bounds
    dates = pd.to_datetime(data[:, 0])
    in_window = np.asarray(
        (dates >= pd.to_datetime(start_date)) & (dates <= pd.to_datetime(end_date))
    )

    # Positions of the first and last rows inside the window
    rows = np.flatnonzero(in_window)
    if rows.size == 0:
        raise ValueError(
            f"No rows found between {start_date!r} and {end_date!r}"
        )
    start_index, end_index = rows[0], rows[-1]

    window = data[start_index:end_index + 1]

    # Input data (X): every column except the dates, which cannot be cast to float
    X_data = window[:, 1:].astype('float32')

    # Output data (y): the first value column
    y_data = window[:, 1].astype('float32')

    return X_data, y_data


# Convert the data from a pandas dataframe to a numpy array
np_data = data.to_numpy()

# Split data into train, dev, and test sets using the slicing helper defined above.
# NOTE(review): process_data drops rows before 1980-01-01, so the 01/01/1970
# start date predates the available data - confirm the intended training start.
# NOTE(review): if a scaled copy of the data is intended here, build it from
# np_data before this step.
X_train, y_train = slice_datasets(np_data, start_date='01/01/1970', end_date='12/31/2008')
X_dev, y_dev = slice_datasets(np_data, start_date='01/01/2009', end_date='12/31/2014')
X_test, y_test = slice_datasets(np_data, start_date='01/01/2015', end_date='11/01/2021')

# Verify the shapes of the input and output data
for split in (X_train, y_train, X_dev, y_dev, X_test, y_test):
    print(split.shape)