In [11]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import seaborn as sns
import sklearn
from sklearn.ensemble import (
BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeRegressor

In [13]:
def create_lagged_series(symbol, start_date, end_date, lags=3):
    
    ts = web.DataReader(symbol, "yahoo", start_date, end_date)
    
    # Create the new lagged DataFrame
    tslag = pd.DataFrame(index=ts.index)
    tslag["Today"] = ts["Adj Close"]
    tslag["Volume"] = ts["Volume"]
    
    # Create the shifted lag series of
    # prior trading period close values
    for i in range(0,lags):
        tslag["Lag%s" % str(i+1)] = ts["Adj Close"].shift(i+1)
        
    # Create the returns DataFrame
    tsret = pd.DataFrame(index=tslag.index)
    tsret["Volume"] = tslag["Volume"]
    tsret["Today"] = tslag["Today"].pct_change()*100.0
    
    # Create the lagged percentage returns columns
    for i in range(0,lags):
        tsret["Lag%s" % str(i+1)] = tslag["Lag%s" % str(i+1)].pct_change()*100.0
    tsret = tsret[tsret.index >= start_date]
    return tsret

In [22]:
if __name__ == "__main__":
    # Set the random seed, number of estimators
    # and the "step factor" used to plot the graph of MSE
    # for each method
    random_state = 42
    n_jobs = 1 # Parallelisation factor for bagging, random forests
    n_estimators = 1000
    step_factor = 10
    axis_step = int(n_estimators/step_factor)
    # Download ten years worth of Amazon
    # adjusted closing prices
    start = datetime.datetime(2006, 1, 1)
    end = datetime.datetime(2015, 12, 31)
    amzn = create_lagged_series("AMZN", start, end, lags=3)
    amzn.dropna(inplace=True)
    
    # Use the first three daily lags of AMZN closing prices
    # and scale the data to lie within -1 and +1 for comparison
    X = amzn[["Lag1", "Lag2", "Lag3"]]
    y = amzn["Today"]
    X = scale(X)
    y = scale(y)
    
    # Use the training-testing split with 70% of data in the
    # training data with the remaining 30% of data in the testing
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, random_state=random_state
#         )
    
#     # Pre-create the arrays which will contain the MSE for
#     # each particular ensemble method
#     estimators = np.zeros(axis_step)
#     bagging_mse = np.zeros(axis_step)
#     rf_mse = np.zeros(axis_step)
#     boosting_mse = np.zeros(axis_step)
    # Estimate the Bagging MSE over the full number
    # of estimators, across a step size ("step_factor")
#     for i in range(0, axis_step):
#         print("Bagging Estimator: %d of %d..." % (
#             step_factor*(i+1), n_estimators)
#             )
        
#         bagging = BaggingRegressor(DecisionTreeRegressor(), n_estimators=step_factor*(i+1), n_jobs=n_jobs,
#                                random_state=random_state)
    
#         bagging.fit(X_train, y_train)
        
#         mse = mean_squared_error(y_test, bagging.predict(X_test))
#         estimators[i] = step_factor*(i+1)
#         bagging_mse[i] = mse
    
    # Estimate the Random Forest MSE over the full number
    # of estimators, across a step size ("step_factor")
    
#     for i in range(0, axis_step):
#         print("Random Forest Estimator: %d of %d..." % (step_factor*(i+1), n_estimators))
        
#         rf = RandomForestRegressor(
#             n_estimators=step_factor*(i+1),
#             n_jobs=n_jobs,
#             random_state=random_state
#             )
#         rf.fit(X_train, y_train)
#         mse = mean_squared_error(y_test, rf.predict(X_test))
#         estimators[i] = step_factor*(i+1)
#         rf_mse[i] = mse
        
        # Estimate the AdaBoost MSE over the full number
        # of estimators, across a step size ("step_factor")
    
#     for i in range(0, axis_step):
#         print("Boosting Estimator: %d of %d..." % (
#         step_factor*(i+1), n_estimators)
#         )
            
#         boosting = AdaBoostRegressor(
#             DecisionTreeRegressor(),
#             n_estimators=step_factor*(i+1),
#             random_state=random_state,
#             learning_rate=0.01
#             )
        
#         boosting.fit(X_train, y_train)
#         mse = mean_squared_error(y_test, boosting.predict(X_test))
#         estimators[i] = step_factor*(i+1)
#         boosting_mse[i] = mse
    
    
    # Plot the chart of MSE versus number of estimators
#     plt.figure(figsize=(8, 8))
#     plt.title(’Bagging, Random Forest and Boosting comparison’)
#     plt.plot(estimators, bagging_mse, ’b-’, color="black", label=’Bagging’)
#     plt.plot(estimators, rf_mse, ’b-’, color="blue", label=’Random Forest’)
#     plt.plot(estimators, boosting_mse, ’b-’, color="red", label=’AdaBoost’)
#     plt.legend(loc=’upper right’)
#     plt.xlabel(’Estimators’)
#     plt.ylabel(’Mean Squared Error’)
#     plt.show()

TypeError: iteration over a 0-d array

In [None]:
# read in our data
df_features=pd.read_csv('prices.csv')

df_features['date'] = pd.to_datetime(df_features['date'])
df_features.head()

In [2]:
import matplotlib.pyplot as plt

a = df_features.loc[df_features.loc[:,'Ticker'] == 'AAL', ['date', 'close']]
a.plot(x = 'date', y = 'close')
plt.show()

<Figure size 640x480 with 1 Axes>

In [3]:
from sklearn.ensemble import RandomForestRegressor

X_train = df_features.loc[(df_features['date'] < '2016-01-01'),:]
y_train = X_train['close']
X_train.drop(['close', 'date'], axis = 1, inplace = True)

X_test = df_features.loc[(df_features['date'] > '2015-12-31'), :]
y_test = X_test['close']
X_test.drop(['close', 'date'], axis = 1, inplace = True)

#X_train.head()
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Ticker,open,low,high,volume
0,AAL,4.84,4.66,4.94,9837300
1,AAP,40.700001,40.360001,41.040001,1701700
2,AAPL,30.49,30.34,30.642857,123432400
3,ABC,26.290001,26.139999,26.690001,2455900
4,ABT,26.000339,25.870792,26.177866,10829000


In [4]:
X_train_one = X_train.loc[X_train['Ticker'] == 'AAPL', :]
y_train_one = y_train[X_train_one.index]

X_test_one = X_test.loc[X_test['Ticker'] == 'AAPL', :]
y_test_one = y_test[X_test_one.index]

X_train_one.drop(['Ticker'], axis = 1, inplace = True)
X_test_one.drop(['Ticker'], axis = 1, inplace = True)

mod = RandomForestRegressor()
mod.fit(X_train_one, y_train_one)
#mod.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [11]:
from sklearn.metrics import mean_squared_error

y_pred = mod.predict(X_test_one)
y_test_one_df = pd.DataFrame(y_test_one)
print(mean_squared_error(y_test_one, y_pred))

0.6366600208803723


In [9]:
print(y_pred[0])
print(y_test_one.iloc[0])

102.94699940000001
105.349998
