In [None]:
import pandas, json, numpy, requests, os, datetime, statistics, math, pytz, tweepy, sqlite3, time, re, random, matplotlib.pyplot as plt, sklearn, statsmodels.api as sm, seaborn
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

This script analyzes the CSV files created by the Price_Predict script (the "Prediction Summary" files).

In [None]:
def prediction_analysis(symbol, name):
    prediction_summary = pandas.read_csv(os.getcwd() + '\\Predictions\\'  + symbol + ' - ' + name + ' - Prediction Summary.csv', index_col= 'Predicted_Date', parse_dates= True).sort_index(ascending=True)

    ### Select the newest model's best parameters (the last row after sorting by Predicted_Date):
    prediction_summary_selected = prediction_summary.iloc[[len(prediction_summary)-1]]

    ### Extracting and assigning the best trees:
    Random_Forest_Shuffled_Best_Trees = prediction_summary_selected.Random_Forest_Shuffled_Best_Trees[0]

    Random_Forest_Ordered_Best_Trees = prediction_summary_selected.Random_Forest_Ordered_Best_Trees[0]

    Random_Forest_Ordered_No_Bootstrap_Best_Trees = prediction_summary_selected.Random_Forest_Ordered_No_Bootstrap_Best_Trees[0]

    AdaBoost_Shuffled_Best_Trees = prediction_summary_selected.AdaBoost_Shuffled_Best_Trees[0]

    AdaBoost_Ordered_Best_Trees = prediction_summary_selected.AdaBoost_Ordered_Best_Trees[0]

    Gradient_Boosting_Shuffled_Best_Trees = prediction_summary_selected.Gradient_Boosting_Shuffled_Best_Trees[0]

    Gradient_Boosting_Ordered_Best_Trees = prediction_summary_selected.Gradient_Boosting_Ordered_Best_Trees[0]

    ### Modified version from the Price_Predict script:
    final_df = pandas.read_csv(os.getcwd() + '\\Final DataFrame\\'  + symbol + ' - ' + name + ' - Final DataFrame.csv', index_col= 'Date', parse_dates= True).sort_index(ascending=True)
    final_df = final_df.drop(['Unnamed: 0', 'Symbol', 'Name'], axis= 1)

    print(symbol, name)

    df_counter = 0

    for u, v in final_df.iterrows():

        if v.name != final_df.index[-1]:
            final_df.loc[u, 'Label'] = final_df.iloc[df_counter+1]['Close']

        if v.Trend == 'High Uptrend':
            final_df.loc[u, 'Trend'] = 3

        elif v.Trend == 'Uptrend':
            final_df.loc[u, 'Trend'] = 2

        elif v.Trend == 'Slight Uptrend':
            final_df.loc[u, 'Trend'] = 1

        elif v.Trend == 'No Trend':
            final_df.loc[u, 'Trend'] = 0

        elif v.Trend == 'Slight Downtrend':
            final_df.loc[u, 'Trend'] = -1

        elif v.Trend == 'Downtrend':
            final_df.loc[u, 'Trend'] = -2

        elif v.Trend == 'High Downtrend':
            final_df.loc[u, 'Trend'] = -3

        df_counter += 1

    final_df = final_df.drop_duplicates()

    ##### Because the last day on the df will not be labeled, it will need to be removed.
    final_df_last_row = final_df.iloc[[-1]]
    final_df_last_row = final_df_last_row.drop('Label', axis= 1)

    final_df = final_df.drop(final_df.index[-1], axis= 0)

    ### Splitting the label from the variables:
    ##### The first 5 rows do not have the 5, 7, or 10 day slopes and the modeling should not be affected since there are many
    ##### data points available for most stocks
    y = final_df.iloc[5:, 19]
    X = final_df.iloc[5:, :19]

    ## The index is re-sorted newest-first so that, when splitting with shuffle = False, the train data is forced to include
    ## the newest dates, which have more data on the current market (coronavirus).
    ## This should have no effect when shuffle = True.
    y = y.sort_index(ascending=False)
    X = X.sort_index(ascending=False)

    ### Splitting the data: test_size = default = 0.25
    shuffled_X_train, shuffled_X_test, shuffled_y_train, shuffled_y_test = train_test_split(X, y)

    ordered_X_train, ordered_X_test, ordered_y_train, ordered_y_test = train_test_split(X, y, shuffle= False)

    ordered_X_train = ordered_X_train.sort_index(ascending=True)
    ordered_X_test = ordered_X_test.sort_index(ascending=True)
    ordered_y_train = ordered_y_train.sort_index(ascending=True)
    ordered_y_test = ordered_y_test.sort_index(ascending=True)

    ########## Changes from Price_Predict start here:------------------------------------------------------------------------------
    ### All modeling is set to one parameter and the .predict method is run on all X observations for visual comparison
    ### as well as statistical comparison between the different models


    # The last unlabeled row is added with the X:

    X_plus_last = pandas.concat([X, final_df_last_row]).sort_index(ascending=True)

    ### Random Forest:
    #### According to a Harvard lecture, random forest for regression should have max_features = 'sqrt' else the default ('auto')
    #### will do bagging instead of random forest

    random_forest = RandomForestRegressor(n_estimators= Random_Forest_Shuffled_Best_Trees, max_features= 'sqrt')
    random_forest.fit(shuffled_X_train, shuffled_y_train)

    best_random_forest_accuracy_score_shuffled = random_forest.score(shuffled_X_test, shuffled_y_test)

    best_random_forest_predict_shuffled = random_forest.predict(X_plus_last)

    features_df = pandas.DataFrame({'RF_Shuffled_Open': random_forest.feature_importances_[0], 
                                    'RF_Shuffled_High': random_forest.feature_importances_[1], 
                                    'RF_Shuffled_Low': random_forest.feature_importances_[2], 
                                    'RF_Shuffled_Close': random_forest.feature_importances_[3], 
                                    'RF_Shuffled_Volume': random_forest.feature_importances_[4], 
                                    'RF_Shuffled_SMA_30': random_forest.feature_importances_[5], 
                                    'RF_Shuffled_SMA_60': random_forest.feature_importances_[6], 
                                    'RF_Shuffled_Patterns_Score': random_forest.feature_importances_[7], 
                                    'RF_Shuffled_Base_Score_5': random_forest.feature_importances_[8], 
                                    'RF_Shuffled_Base_Score_7': random_forest.feature_importances_[9], 
                                    'RF_Shuffled_Base_Score_10': random_forest.feature_importances_[10], 
                                    'RF_Shuffled_Trend': random_forest.feature_importances_[11], 
                                    'RF_Shuffled_Average_Slope_5_7_10': random_forest.feature_importances_[12], 
                                    'RF_Shuffled_Predicted_Value_For_End_Date': random_forest.feature_importances_[13], 
                                    'RF_Shuffled_Earnings_Score': random_forest.feature_importances_[14], 
                                    'RF_Shuffled_Number_Of_Times_Articles': random_forest.feature_importances_[15], 
                                    'RF_Shuffled_NYTimes_Score': random_forest.feature_importances_[16], 
                                    'RF_Shuffled_Number_Of_Google_Articles': random_forest.feature_importances_[17], 
                                    'RF_Shuffled_Google_Score': random_forest.feature_importances_[18]}, index=[symbol])

    features_df.index.names = ['Symbol']

    random_forest = RandomForestRegressor(n_estimators= Random_Forest_Ordered_Best_Trees, max_features= 'sqrt')
    random_forest.fit(ordered_X_train, ordered_y_train)

    best_random_forest_accuracy_score_ordered = random_forest.score(shuffled_X_test, shuffled_y_test)

    best_random_forest_predict_ordered = random_forest.predict(X_plus_last)

    temp_df = pandas.DataFrame({'RF_Ordered_Open': random_forest.feature_importances_[0], 
                                'RF_Ordered_High': random_forest.feature_importances_[1], 
                                'RF_Ordered_Low': random_forest.feature_importances_[2], 
                                'RF_Ordered_Close': random_forest.feature_importances_[3], 
                                'RF_Ordered_Volume': random_forest.feature_importances_[4], 
                                'RF_Ordered_SMA_30': random_forest.feature_importances_[5], 
                                'RF_Ordered_SMA_60': random_forest.feature_importances_[6], 
                                'RF_Ordered_Patterns_Score': random_forest.feature_importances_[7], 
                                'RF_Ordered_Base_Score_5': random_forest.feature_importances_[8], 
                                'RF_Ordered_Base_Score_7': random_forest.feature_importances_[9], 
                                'RF_Ordered_Base_Score_10': random_forest.feature_importances_[10], 
                                'RF_Ordered_Trend': random_forest.feature_importances_[11], 
                                'RF_Ordered_Average_Slope_5_7_10': random_forest.feature_importances_[12], 
                                'RF_Ordered_Predicted_Value_For_End_Date': random_forest.feature_importances_[13], 
                                'RF_Ordered_Earnings_Score': random_forest.feature_importances_[14], 
                                'RF_Ordered_Number_Of_Times_Articles': random_forest.feature_importances_[15], 
                                'RF_Ordered_NYTimes_Score': random_forest.feature_importances_[16], 
                                'RF_Ordered_Number_Of_Google_Articles': random_forest.feature_importances_[17], 
                                'RF_Ordered_Google_Score': random_forest.feature_importances_[18]}, index=[symbol])

    temp_df.index.names = ['Symbol']

    features_df = features_df.merge(temp_df, how='outer', on='Symbol')

    random_forest = RandomForestRegressor(n_estimators= Random_Forest_Ordered_No_Bootstrap_Best_Trees, max_features= 'sqrt', bootstrap= False)
    random_forest.fit(ordered_X_train, ordered_y_train)

    best_random_forest_accuracy_score_ordered_no_boot = random_forest.score(shuffled_X_test, shuffled_y_test)

    best_random_forest_predict_ordered_no_boot = random_forest.predict(X_plus_last)

    temp_df = pandas.DataFrame({'RF_Ordered_Open_No_Boot': random_forest.feature_importances_[0], 
                                'RF_Ordered_High_No_Boot': random_forest.feature_importances_[1], 
                                'RF_Ordered_Low_No_Boot': random_forest.feature_importances_[2], 
                                'RF_Ordered_Close_No_Boot': random_forest.feature_importances_[3], 
                                'RF_Ordered_Volume_No_Boot': random_forest.feature_importances_[4], 
                                'RF_Ordered_SMA_30_No_Boot': random_forest.feature_importances_[5], 
                                'RF_Ordered_SMA_60_No_Boot': random_forest.feature_importances_[6], 
                                'RF_Ordered_Patterns_Score_No_Boot': random_forest.feature_importances_[7], 
                                'RF_Ordered_Base_Score_5_No_Boot': random_forest.feature_importances_[8], 
                                'RF_Ordered_Base_Score_7_No_Boot': random_forest.feature_importances_[9], 
                                'RF_Ordered_Base_Score_10_No_Boot': random_forest.feature_importances_[10], 
                                'RF_Ordered_Trend_No_Boot': random_forest.feature_importances_[11], 
                                'RF_Ordered_Average_Slope_5_7_10_No_Boot': random_forest.feature_importances_[12], 
                                'RF_Ordered_Predicted_Value_For_End_Date_No_Boot': random_forest.feature_importances_[13], 
                                'RF_Ordered_Earnings_Score_No_Boot': random_forest.feature_importances_[14], 
                                'RF_Ordered_Number_Of_Times_Articles_No_Boot': random_forest.feature_importances_[15], 
                                'RF_Ordered_NYTimes_Score_No_Boot': random_forest.feature_importances_[16], 
                                'RF_Ordered_Number_Of_Google_Articles_No_Boot': random_forest.feature_importances_[17], 
                                'RF_Ordered_Google_Score_No_Boot': random_forest.feature_importances_[18]}, index=[symbol])

    temp_df.index.names = ['Symbol']

    features_df = features_df.merge(temp_df, how='outer', on='Symbol')


    ### Testing AdaBoost:
    ada = AdaBoostRegressor(n_estimators= AdaBoost_Shuffled_Best_Trees)
    ada.fit(shuffled_X_train, shuffled_y_train)

    best_ada_forest_accuracy_score_shuffled = ada.score(shuffled_X_test, shuffled_y_test)

    best_ada_predict_shuffled = ada.predict(X_plus_last)

    temp_df = pandas.DataFrame({'Ada_Shuffled_Open': ada.feature_importances_[0], 
                                'Ada_Shuffled_High': ada.feature_importances_[1], 
                                'Ada_Shuffled_Low': ada.feature_importances_[2], 
                                'Ada_Shuffled_Close': ada.feature_importances_[3], 
                                'Ada_Shuffled_Volume': ada.feature_importances_[4], 
                                'Ada_Shuffled_SMA_30': ada.feature_importances_[5], 
                                'Ada_Shuffled_SMA_60': ada.feature_importances_[6], 
                                'Ada_Shuffled_Patterns_Score': ada.feature_importances_[7], 
                                'Ada_Shuffled_Base_Score_5': ada.feature_importances_[8], 
                                'Ada_Shuffled_Base_Score_7': ada.feature_importances_[9], 
                                'Ada_Shuffled_Base_Score_10': ada.feature_importances_[10], 
                                'Ada_Shuffled_Trend': ada.feature_importances_[11], 
                                'Ada_Shuffled_Average_Slope_5_7_10': ada.feature_importances_[12], 
                                'Ada_Shuffled_Predicted_Value_For_End_Date': ada.feature_importances_[13], 
                                'Ada_Shuffled_Earnings_Score': ada.feature_importances_[14], 
                                'Ada_Shuffled_Number_Of_Times_Articles': ada.feature_importances_[15], 
                                'Ada_Shuffled_NYTimes_Score': ada.feature_importances_[16], 
                                'Ada_Shuffled_Number_Of_Google_Articles': ada.feature_importances_[17], 
                                'Ada_Shuffled_Google_Score': ada.feature_importances_[18]}, index=[symbol])

    temp_df.index.names = ['Symbol']

    features_df = features_df.merge(temp_df, how='outer', on='Symbol')


    ada = AdaBoostRegressor(n_estimators= AdaBoost_Ordered_Best_Trees)
    ada.fit(ordered_X_train, ordered_y_train)

    best_ada_accuracy_score_ordered = ada.score(shuffled_X_test, shuffled_y_test)

    best_ada_predict_ordered = ada.predict(X_plus_last)

    temp_df = pandas.DataFrame({'Ada_Ordered_Open': ada.feature_importances_[0], 
                                'Ada_Ordered_High': ada.feature_importances_[1], 
                                'Ada_Ordered_Low': ada.feature_importances_[2], 
                                'Ada_Ordered_Close': ada.feature_importances_[3], 
                                'Ada_Ordered_Volume': ada.feature_importances_[4], 
                                'Ada_Ordered_SMA_30': ada.feature_importances_[5], 
                                'Ada_Ordered_SMA_60': ada.feature_importances_[6], 
                                'Ada_Ordered_Patterns_Score': ada.feature_importances_[7], 
                                'Ada_Ordered_Base_Score_5': ada.feature_importances_[8], 
                                'Ada_Ordered_Base_Score_7': ada.feature_importances_[9], 
                                'Ada_Ordered_Base_Score_10': ada.feature_importances_[10], 
                                'Ada_Ordered_Trend': ada.feature_importances_[11], 
                                'Ada_Ordered_Average_Slope_5_7_10': ada.feature_importances_[12], 
                                'Ada_Ordered_Predicted_Value_For_End_Date': ada.feature_importances_[13], 
                                'Ada_Ordered_Earnings_Score': ada.feature_importances_[14], 
                                'Ada_Ordered_Number_Of_Times_Articles': ada.feature_importances_[15], 
                                'Ada_Ordered_NYTimes_Score': ada.feature_importances_[16], 
                                'Ada_Ordered_Number_Of_Google_Articles': ada.feature_importances_[17], 
                                'Ada_Ordered_Google_Score': ada.feature_importances_[18]}, index=[symbol])

    temp_df.index.names = ['Symbol']

    features_df = features_df.merge(temp_df, how='outer', on='Symbol')




    ### Testing Gradient Boosting: GradientBoostingRegressor
    gradient_boosting = GradientBoostingRegressor(n_estimators= Gradient_Boosting_Shuffled_Best_Trees)
    gradient_boosting.fit(shuffled_X_train, shuffled_y_train)

    best_gradient_boosting_accuracy_score_shuffled = gradient_boosting.score(shuffled_X_test, shuffled_y_test)

    best_gradient_boosting_predict_shuffled = gradient_boosting.predict(X_plus_last)

    temp_df = pandas.DataFrame({'Gradient_Boosting_Shuffled_Open': gradient_boosting.feature_importances_[0], 
                                'Gradient_Boosting_Shuffled_High': gradient_boosting.feature_importances_[1], 
                                'Gradient_Boosting_Shuffled_Low': gradient_boosting.feature_importances_[2], 
                                'Gradient_Boosting_Shuffled_Close': gradient_boosting.feature_importances_[3], 
                                'Gradient_Boosting_Shuffled_Volume': gradient_boosting.feature_importances_[4], 
                                'Gradient_Boosting_Shuffled_SMA_30': gradient_boosting.feature_importances_[5], 
                                'Gradient_Boosting_Shuffled_SMA_60': gradient_boosting.feature_importances_[6], 
                                'Gradient_Boosting_Shuffled_Patterns_Score': gradient_boosting.feature_importances_[7], 
                                'Gradient_Boosting_Shuffled_Base_Score_5': gradient_boosting.feature_importances_[8], 
                                'Gradient_Boosting_Shuffled_Base_Score_7': gradient_boosting.feature_importances_[9], 
                                'Gradient_Boosting_Shuffled_Base_Score_10': gradient_boosting.feature_importances_[10], 
                                'Gradient_Boosting_Shuffled_Trend': gradient_boosting.feature_importances_[11], 
                                'Gradient_Boosting_Shuffled_Average_Slope_5_7_10': gradient_boosting.feature_importances_[12], 
                                'Gradient_Boosting_Shuffled_Predicted_Value_For_End_Date': gradient_boosting.feature_importances_[13], 
                                'Gradient_Boosting_Shuffled_Earnings_Score': gradient_boosting.feature_importances_[14], 
                                'Gradient_Boosting_Shuffled_Number_Of_Times_Articles': gradient_boosting.feature_importances_[15], 
                                'Gradient_Boosting_Shuffled_NYTimes_Score': gradient_boosting.feature_importances_[16], 
                                'Gradient_Boosting_Shuffled_Number_Of_Google_Articles': gradient_boosting.feature_importances_[17], 
                                'Gradient_Boosting_Shuffled_Google_Score': gradient_boosting.feature_importances_[18]}, index=[symbol])

    temp_df.index.names = ['Symbol']

    features_df = features_df.merge(temp_df, how='outer', on='Symbol')


    gradient_boosting = GradientBoostingRegressor(n_estimators= Gradient_Boosting_Ordered_Best_Trees)
    gradient_boosting.fit(ordered_X_train, ordered_y_train)

    best_gradient_boosting_accuracy_score_ordered = gradient_boosting.score(shuffled_X_test, shuffled_y_test)

    best_gradient_boosting_predict_ordered = gradient_boosting.predict(X_plus_last)

    temp_df = pandas.DataFrame({'Gradient_Boosting_Ordered_Open': gradient_boosting.feature_importances_[0], 
                                'Gradient_Boosting_Ordered_High': gradient_boosting.feature_importances_[1], 
                                'Gradient_Boosting_Ordered_Low': gradient_boosting.feature_importances_[2], 
                                'Gradient_Boosting_Ordered_Close': gradient_boosting.feature_importances_[3], 
                                'Gradient_Boosting_Ordered_Volume': gradient_boosting.feature_importances_[4], 
                                'Gradient_Boosting_Ordered_SMA_30': gradient_boosting.feature_importances_[5], 
                                'Gradient_Boosting_Ordered_SMA_60': gradient_boosting.feature_importances_[6], 
                                'Gradient_Boosting_Ordered_Patterns_Score': gradient_boosting.feature_importances_[7], 
                                'Gradient_Boosting_Ordered_Base_Score_5': gradient_boosting.feature_importances_[8], 
                                'Gradient_Boosting_Ordered_Base_Score_7': gradient_boosting.feature_importances_[9], 
                                'Gradient_Boosting_Ordered_Base_Score_10': gradient_boosting.feature_importances_[10], 
                                'Gradient_Boosting_Ordered_Trend': gradient_boosting.feature_importances_[11], 
                                'Gradient_Boosting_Ordered_Average_Slope_5_7_10': gradient_boosting.feature_importances_[12], 
                                'Gradient_Boosting_Ordered_Predicted_Value_For_End_Date': gradient_boosting.feature_importances_[13], 
                                'Gradient_Boosting_Ordered_Earnings_Score': gradient_boosting.feature_importances_[14], 
                                'Gradient_Boosting_Ordered_Number_Of_Times_Articles': gradient_boosting.feature_importances_[15], 
                                'Gradient_Boosting_Ordered_NYTimes_Score': gradient_boosting.feature_importances_[16], 
                                'Gradient_Boosting_Ordered_Number_Of_Google_Articles': gradient_boosting.feature_importances_[17], 
                                'Gradient_Boosting_Ordered_Google_Score': gradient_boosting.feature_importances_[18]}, index=[symbol])

    temp_df.index.names = ['Symbol']

    features_df = features_df.merge(temp_df, how='outer', on='Symbol')

    ### Adding the labels (y/the close of the next day) back onto X, as well as the predictions:

    predictions_df = X_plus_last.merge(y, how= 'left', left_on= 'Date', right_on='Date')

    predictions_df['Best_Random_Forest_Predict_Shuffled'] = list(best_random_forest_predict_shuffled)
    predictions_df['Best_Random_Forest_Predict_Ordered'] = list(best_random_forest_predict_ordered)
    predictions_df['Best_Random_Forest_Predict_Ordered_No_Boot'] = list(best_random_forest_predict_ordered_no_boot)

    predictions_df['Best_Ada_Predict_Shuffled'] = list(best_ada_predict_shuffled)
    predictions_df['Best_Ada_Predict_Ordered'] = list(best_ada_predict_ordered)

    predictions_df['Best_Gradient_Boosting_Predict_Shuffled'] = list(best_gradient_boosting_predict_shuffled)
    predictions_df['Best_Gradient_Boosting_Predict_Ordered'] = list(best_gradient_boosting_predict_ordered)

    for q, p in predictions_df.iterrows():
        ### Averaging the predictions:
        average_prediction = numpy.average([p['Best_Random_Forest_Predict_Shuffled'], p['Best_Random_Forest_Predict_Ordered'], p['Best_Random_Forest_Predict_Ordered_No_Boot'], p['Best_Ada_Predict_Shuffled'], p['Best_Ada_Predict_Ordered'], p['Best_Gradient_Boosting_Predict_Shuffled'], p['Best_Gradient_Boosting_Predict_Ordered']])
        predictions_df.loc[q, 'Average_Prediction'] = average_prediction


        ### Calculating the difference between the Label - Close:
        Difference_Label_Close = p.Label - p.Close
        predictions_df.loc[q, 'Difference_Label_Close'] = Difference_Label_Close


        ### Calculating the difference between the Average_Prediction - Close:
        Difference_Prediction_Close = average_prediction - p.Close
        predictions_df.loc[q, 'Difference_Average_Close'] = Difference_Prediction_Close


        ### Calculating the difference between the Label - Average_Prediction:
        Difference_Label_Prediction = p.Label - average_prediction
        predictions_df.loc[q, 'Difference_Label_Average'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'Average_Price_Direction'] = Price_Direction
        
        ### Calculating the difference between the Best_Random_Forest_Predict_Shuffled - Close:
        Difference_Prediction_Close = p['Best_Random_Forest_Predict_Shuffled'] - p.Close
        predictions_df.loc[q, 'Difference_RF_Shuffled_Close'] = Difference_Prediction_Close

        ### Calculating the difference between the Label - Best_Random_Forest_Predict_Shuffled:
        Difference_Label_Prediction = p.Label - p['Best_Random_Forest_Predict_Shuffled']
        predictions_df.loc[q, 'Difference_Label_RF_Shuffled'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'RF_Shuffled_Price_Direction'] = Price_Direction
        
        ### Calculating the difference between the Best_Random_Forest_Predict_Ordered - Close:
        Difference_Prediction_Close = p['Best_Random_Forest_Predict_Ordered'] - p.Close
        predictions_df.loc[q, 'Difference_RF_Ordered_Close'] = Difference_Prediction_Close

        ### Calculating the difference between the Label - Best_Random_Forest_Predict_Ordered:
        Difference_Label_Prediction = p.Label - p['Best_Random_Forest_Predict_Ordered']
        predictions_df.loc[q, 'Difference_Label_RF_Ordered'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'RF_Ordered_Price_Direction'] = Price_Direction
        
        ### Calculating the difference between the Best_Random_Forest_Predict_Ordered_No_Boot - Close:
        Difference_Prediction_Close = p['Best_Random_Forest_Predict_Ordered_No_Boot'] - p.Close
        predictions_df.loc[q, 'Difference_RF_Ordered_No_Boot_Close'] = Difference_Prediction_Close

        ### Calculating the difference between the Label - Best_Random_Forest_Predict_Ordered_No_Boot:
        Difference_Label_Prediction = p.Label - p['Best_Random_Forest_Predict_Ordered_No_Boot']
        predictions_df.loc[q, 'Difference_Label_RF_Ordered_No_Boot'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'RF_Ordered_No_Boot_Price_Direction'] = Price_Direction
        
        ### Calculating the difference between the Best_Ada_Predict_Shuffled - Close:
        Difference_Prediction_Close = p['Best_Ada_Predict_Shuffled'] - p.Close
        predictions_df.loc[q, 'Difference_Ada_Shuffled_Close'] = Difference_Prediction_Close

        ### Calculating the difference between the Label - Best_Ada_Predict_Shuffled:
        Difference_Label_Prediction = p.Label - p['Best_Ada_Predict_Shuffled']
        predictions_df.loc[q, 'Difference_Label_Ada_Shuffled'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'Ada_Shuffled_Price_Direction'] = Price_Direction

        ### Calculating the difference between the Best_Ada_Predict_Ordered - Close:
        Difference_Prediction_Close = p['Best_Ada_Predict_Ordered'] - p.Close
        predictions_df.loc[q, 'Difference_Ada_Ordered_Close'] = Difference_Prediction_Close
        
        ### Calculating the difference between the Label - Best_Ada_Predict_Ordered:
        Difference_Label_Prediction = p.Label - p['Best_Ada_Predict_Ordered']
        predictions_df.loc[q, 'Difference_Label_Ada_Ordered'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'Ada_Ordered_Price_Direction'] = Price_Direction

        ### Calculating the difference between the Best_Gradient_Boosting_Predict_Shuffled - Close:
        Difference_Prediction_Close = p['Best_Gradient_Boosting_Predict_Shuffled'] - p.Close
        predictions_df.loc[q, 'Difference_Gradient_Boosting_Shuffled_Close'] = Difference_Prediction_Close

        ### Calculating the difference between the Label - Best_Gradient_Boosting_Predict_Shuffled:
        Difference_Label_Prediction = p.Label - p['Best_Gradient_Boosting_Predict_Shuffled']
        predictions_df.loc[q, 'Difference_Label_Gradient_Boosting_Shuffled'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'Gradient_Boosting_Shuffled_Price_Direction'] = Price_Direction

        ### Calculating the difference between the Best_Gradient_Boosting_Predict_Ordered - Close:
        Difference_Prediction_Close = p['Best_Gradient_Boosting_Predict_Ordered'] - p.Close
        predictions_df.loc[q, 'Difference_Gradient_Boosting_Ordered_Close'] = Difference_Prediction_Close
        
        ### Calculating the difference between the Label - Best_Gradient_Boosting_Predict_Ordered:
        Difference_Label_Prediction = p.Label - p['Best_Gradient_Boosting_Predict_Ordered']
        predictions_df.loc[q, 'Difference_Label_Gradient_Boosting_Ordered'] = Difference_Label_Prediction


        ### If the Difference_Label_Close is negative/positive & Difference_Prediction_Close is the same = 1:
        if Difference_Label_Close > 0 and Difference_Prediction_Close > 0: 
            Price_Direction = 1

        elif Difference_Label_Close < 0 and Difference_Prediction_Close < 0:
            Price_Direction = 1

        else:
            Price_Direction = 0

        predictions_df.loc[q, 'Gradient_Boosting_Ordered_Price_Direction'] = Price_Direction


    stats_df = pandas.DataFrame({'Symbol': symbol, 'Name': name, 
                                 'Correct_Price_Movement_Average': numpy.nanmean(list(predictions_df['Average_Price_Direction'].values)),
                                 'Correct_Price_Movement_RF_Shuffled': numpy.nanmean(list(predictions_df['RF_Shuffled_Price_Direction'].values)),
                                 'Correct_Price_Movement_RF_Ordered': numpy.nanmean(list(predictions_df['RF_Ordered_Price_Direction'].values)),
                                 'Correct_Price_Movement_RF_Ordered_No_Bootstrap': numpy.nanmean(list(predictions_df['RF_Ordered_No_Boot_Price_Direction'].values)),
                                 'Correct_Price_Movement_Ada_Shuffled': numpy.nanmean(list(predictions_df['Ada_Shuffled_Price_Direction'].values)),
                                 'Correct_Price_Movement_Ada_Ordered': numpy.nanmean(list(predictions_df['Ada_Ordered_Price_Direction'].values)),
                                 'Correct_Price_Movement_Gradient_Boosting_Shuffled': numpy.nanmean(list(predictions_df['Gradient_Boosting_Shuffled_Price_Direction'].values)),
                                 'Correct_Price_Movement_Gradient_Boosting_Ordered': numpy.nanmean(list(predictions_df['Gradient_Boosting_Ordered_Price_Direction'].values)),
                                 'Average_Difference_Average_Close': numpy.nanmean(list(predictions_df['Difference_Average_Close'].values)),
                                 'Average_Difference_RF_Shuffled_Close': numpy.nanmean(list(predictions_df['Difference_RF_Shuffled_Close'].values)),
                                 'Average_Difference_RF_Ordered_Close': numpy.nanmean(list(predictions_df['Difference_RF_Ordered_Close'].values)),
                                 'Average_Difference_RF_Ordered_No_Bootstrap_Close': numpy.nanmean(list(predictions_df['Difference_RF_Ordered_No_Boot_Close'].values)),
                                 'Average_Difference_Ada_Shuffled_Close': numpy.nanmean(list(predictions_df['Difference_Ada_Shuffled_Close'].values)),
                                 'Average_Difference_Ada_Ordered_Close': numpy.nanmean(list(predictions_df['Difference_Ada_Ordered_Close'].values)),
                                 'Average_Difference_Gradient_Boosting_Shuffled_Close': numpy.nanmean(list(predictions_df['Difference_Gradient_Boosting_Shuffled_Close'].values)),
                                 'Average_Difference_Gradient_Boosting_Ordered_Close': numpy.nanmean(list(predictions_df['Difference_Gradient_Boosting_Ordered_Close'].values)),
                                 'Average_Difference_Label_Average': numpy.nanmean(list(predictions_df['Difference_Label_Average'].values)),
                                 'Average_Difference_Label_RF_Shuffled': numpy.nanmean(list(predictions_df['Difference_Label_RF_Shuffled'].values)),
                                 'Average_Difference_Label_RF_Ordered': numpy.nanmean(list(predictions_df['Difference_Label_RF_Ordered'].values)),
                                 'Average_Difference_Label_RF_Ordered_No_Bootstrap': numpy.nanmean(list(predictions_df['Difference_Label_RF_Ordered_No_Boot'].values)),
                                 'Average_Difference_Label_Ada_Shuffled': numpy.nanmean(list(predictions_df['Difference_Label_Ada_Shuffled'].values)),
                                 'Average_Difference_Label_Ada_Ordered': numpy.nanmean(list(predictions_df['Difference_Label_Ada_Ordered'].values)),
                                 'Average_Difference_Label_Gradient_Boosting_Shuffled': numpy.nanmean(list(predictions_df['Difference_Label_Gradient_Boosting_Shuffled'].values)),
                                 'Average_Difference_Label_Gradient_Boosting_Ordered': numpy.nanmean(list(predictions_df['Difference_Label_Gradient_Boosting_Ordered'].values)),
                                 'Absolute_Average_Difference_Label_Average': numpy.nanmean(list(abs(predictions_df['Difference_Label_Average'].values))),
                                 'Absolute_Average_Difference_Label_RF_Shuffled': numpy.nanmean(list(abs(predictions_df['Difference_Label_RF_Shuffled'].values))),
                                 'Absolute_Average_Difference_Labelt_RF_Ordered': numpy.nanmean(list(abs(predictions_df['Difference_Label_RF_Ordered'].values))),
                                 'Absolute_Average_Difference_Label_RF_Ordered_No_Bootstrap': numpy.nanmean(list(abs(predictions_df['Difference_Label_RF_Ordered_No_Boot'].values))),
                                 'Absolute_Average_Difference_Label_Ada_Shuffled': numpy.nanmean(list(abs(predictions_df['Difference_Label_Ada_Shuffled'].values))),
                                 'Absolute_Average_Difference_Label_Ada_Ordered': numpy.nanmean(list(abs(predictions_df['Difference_Label_Ada_Ordered'].values))),
                                 'Absolute_Average_Difference_Label_Gradient_Boosting_Shuffled': numpy.nanmean(list(abs(predictions_df['Difference_Label_Gradient_Boosting_Shuffled'].values))),
                                 'Absolute_Average_Difference_Label_Gradient_Boosting_Ordered': numpy.nanmean(list(abs(predictions_df['Difference_Label_Gradient_Boosting_Ordered'].values))),
                                 'STD_Difference_Label_Average': numpy.nanstd(list(predictions_df['Difference_Label_Average'].values)),
                                 'STD_Difference_Label_RF_Shuffled': numpy.nanstd(list(predictions_df['Difference_Label_RF_Shuffled'].values)),
                                 'STD_Difference_Label_RF_Ordered': numpy.nanstd(list(predictions_df['Difference_Label_RF_Ordered'].values)),
                                 'STD_Difference_Label_RF_Ordered_No_Bootstrap': numpy.nanstd(list(predictions_df['Difference_Label_RF_Ordered_No_Boot'].values)),
                                 'STD_Difference_Label_Ada_Shuffled': numpy.nanstd(list(predictions_df['Difference_Label_Ada_Shuffled'].values)),
                                 'STD_Difference_Label_Ada_Ordered': numpy.nanstd(list(predictions_df['Difference_Label_Ada_Ordered'].values)),
                                 'STD_Difference_Label_Gradient_Boosting_Shuffled': numpy.nanstd(list(predictions_df['Difference_Label_Gradient_Boosting_Shuffled'].values)),
                                 'STD_Difference_Label_Gradient_Boosting_Ordered': numpy.nanstd(list(predictions_df['Difference_Label_Gradient_Boosting_Ordered'].values)),
                                 'Accuracy_Average': numpy.nanmean([best_random_forest_accuracy_score_shuffled,best_random_forest_accuracy_score_ordered,best_random_forest_accuracy_score_ordered_no_boot,best_ada_forest_accuracy_score_shuffled,best_ada_accuracy_score_ordered,best_gradient_boosting_accuracy_score_shuffled,best_gradient_boosting_accuracy_score_ordered]),
                                 'Accuracy_RF_Shuffled': best_random_forest_accuracy_score_shuffled,
                                 'Accuracy_RF_Ordered': best_random_forest_accuracy_score_ordered,
                                 'Accuracy_RF_Ordered_No_Bootstrap': best_random_forest_accuracy_score_ordered_no_boot,
                                 'Accuracy_Ada_Shuffled': best_ada_forest_accuracy_score_shuffled,
                                 'Accuracy_Ada_Ordered': best_ada_accuracy_score_ordered,
                                 'Accuracy_Gradient_Boosting_Shuffled': best_gradient_boosting_accuracy_score_shuffled,
                                 'Accuracy_Gradient_Boosting_Ordered': best_gradient_boosting_accuracy_score_ordered}, index=[symbol])

    if not os.path.exists(os.getcwd() + '\\Prediction Analysis\\'):
        os.makedirs(os.getcwd() + '\\Prediction Analysis\\')

    predictions_df.to_csv(os.getcwd() + '\\Prediction Analysis\\'  + symbol + ' - ' + name + ' - Prediction Analysis.csv')
    stats_df.to_csv(os.getcwd() + '\\Prediction Analysis\\'  + symbol + ' - ' + name + ' - Prediction Statistics.csv')
    features_df.to_csv(os.getcwd() + '\\Prediction Analysis\\'  + symbol + ' - ' + name + ' - Feature Importance.csv')
    
    return stats_df, features_df

In [None]:
### Looping:
### Run prediction_analysis for every (Symbol, Description) row of the stock list and
### merge the per-stock statistics / feature-importance frames into two overall CSV files.
stocks_and_names_with_indices = pandas.read_csv('merged_NYSE_AMEX_removed_intercept_pattern.csv')

### Collect the per-stock frames in lists and concatenate once at the end.
### This avoids depending on the first row succeeding to create the accumulators
### (a failure on row 0 used to make every later row fail with a NameError) and
### avoids the quadratic cost of calling pandas.concat inside the loop.
stats_frames = []
features_frames = []

for _, stock_row in stocks_and_names_with_indices.iterrows():
    try:
        stats_temp_df, features_temp_df = prediction_analysis(symbol=stock_row['Symbol'], name=stock_row['Description'])

    except Exception as error:
        ### Best effort: report which stock failed (and why), then move on to the next one.
        print('Error: ', stock_row['Symbol'], stock_row['Description'], '-', repr(error))
        continue

    stats_frames.append(stats_temp_df)
    features_frames.append(features_temp_df)

### Nothing to combine means every stock failed (or the stock list is empty);
### fail with an explicit message instead of an unrelated NameError/ValueError.
if not stats_frames:
    raise RuntimeError('prediction_analysis did not succeed for any stock; no overall files were written.')

combined_stats_df = pandas.concat(stats_frames).drop_duplicates()
combined_features_df = pandas.concat(features_frames).drop_duplicates()

### The output folder is created by prediction_analysis on its first successful run.
combined_stats_df.to_csv(os.getcwd() + '\\Prediction Analysis\\Overall Statistics.csv')
combined_features_df.to_csv(os.getcwd() + '\\Prediction Analysis\\Overall Feature Importance.csv')