In [None]:
import pandas, json, numpy, requests, os, datetime, statistics, math, pytz, tweepy, sqlite3, time, re, random, matplotlib.pyplot as plt, sklearn, statsmodels.api as sm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

This notebook fits regression models to predict each stock's next closing price.
The models used are random forest, AdaBoost, and gradient boosting.

In [None]:
# Ordinal encoding applied to the categorical 'Trend' column before modelling.
_TREND_CODES = {
    'High Uptrend': 3,
    'Uptrend': 2,
    'Slight Uptrend': 1,
    'No Trend': 0,
    'Slight Downtrend': -1,
    'Downtrend': -2,
    'High Downtrend': -3,
}

# The first 5 rows do not have the 5, 7, or 10 day slopes, so they are left out
# of the modelling data (most stocks have plenty of data points left).
_WARMUP_ROWS = 5


def _fit_best_and_predict(make_model, X_train, X_test, y_train, y_test, X, y,
                          last_row, tree_counts, cv_folds):
    """Pick the tree count with the best mean CV score and predict `last_row`.

    For every value in `tree_counts` a model is built with `make_model(n)`,
    fitted on the train split, cross-validated on the full `X`/`y` and scored
    on the test split. The candidate with the highest mean cross-validation
    score above 0 wins; if no candidate scores above 0, the last candidate
    tried is used instead.

    Returns:
        tuple: (predicted value for `last_row` as float, number of trees,
        test-split score of that model, mean cross-validation score).

    Raises:
        ValueError: if `tree_counts` is empty.
    """
    best = None
    best_cv = 0
    candidate = None

    for n_trees in tree_counts:
        model = make_model(n_trees)
        model.fit(X_train, y_train)
        # cross_val_score works on clones of the estimator (see scikit-learn
        # docs), so the model fitted above is still the one scored below.
        avg_cv = numpy.average(cross_val_score(model, X, y, cv=cv_folds))
        candidate = (n_trees, model.score(X_test, y_test), avg_cv, model)

        if avg_cv > best_cv:
            best_cv = avg_cv
            best = candidate

    if candidate is None:
        raise ValueError('tree_counts must contain at least one value')

    if best is None:
        # No candidate had a positive mean CV score: fall back to the last one.
        best = candidate

    n_trees, accuracy, avg_cv, model = best
    # The very model that produced the reported scores makes the prediction.
    return float(model.predict(last_row)[0]), n_trees, accuracy, avg_cv


def predictions(symbol, name, tree_counts=range(64, 129), cv_folds=5, random_state=None):
    """Fit tree ensembles for one stock and record the predicted next close.

    Reads '<cwd>/Final DataFrame/<symbol> - <name> - Final DataFrame.csv',
    labels every row with the 'Close' of the following row, fits random forest,
    AdaBoost and gradient boosting regressors on shuffled and date-ordered
    splits, predicts the label of the final (unlabelled) row with each of them
    and stores one summary row in
    '<cwd>/Predictions/<symbol> - <name> - Prediction Summary.csv'.
    If that file already exists, the new row is appended unless its
    'Predicted_Date' equals the date of the file's last row.

    Args:
        symbol: Ticker symbol, used in the file names and the summary.
        name: Company name/description, used in the file names and the summary.
        tree_counts: Iterable of `n_estimators` values to try for every model.
            The default 64-128 was chosen by the original author as the best
            range for the number of trees.
        cv_folds: Number of cross-validation folds.
        random_state: Optional seed forwarded to the shuffled split and to the
            estimators; None keeps the runs non-deterministic.

    Returns:
        None. The result is written to the summary CSV.
    """
    tree_counts = list(tree_counts)  # allow one-shot iterables; reused per model
    base_name = symbol + ' - ' + name
    working_dir = os.getcwd()

    final_df = pandas.read_csv(
        os.path.join(working_dir, 'Final DataFrame', base_name + ' - Final DataFrame.csv'),
        index_col='Date', parse_dates=True,
    ).sort_index(ascending=True)
    final_df = final_df.drop(['Unnamed: 0', 'Symbol', 'Name'], axis=1)

    print(symbol, name)

    # Encode the trend names; values that are not in the table are left as is.
    final_df['Trend'] = final_df['Trend'].map(lambda trend: _TREND_CODES.get(trend, trend))
    # The label of each day is the close of the next row; the last row gets NaN.
    final_df['Label'] = final_df['Close'].shift(-1)

    ##### Because the last day on the df will not be labeled, it is split off
    ##### and used as the input of the final prediction.
    final_df_last_row = final_df.iloc[[-1]].drop('Label', axis=1)
    final_df = final_df.iloc[:-1]

    ## The reordering of the index is to include the newest dates, which has more data on the current market (coronavirus), when
    ## splitting with shuffle = False--forcing the train data to include the newest dates.
    ## This should not affect if shuffle = True.
    labelled = final_df.iloc[_WARMUP_ROWS:].sort_index(ascending=False)
    y = labelled['Label']
    X = labelled.drop('Label', axis=1)

    ### Splitting the data: test_size = default = 0.25
    shuffled_split = train_test_split(X, y, random_state=random_state)
    ordered_split = [part.sort_index(ascending=True)
                     for part in train_test_split(X, y, shuffle=False)]

    #### Random forest uses max_features = 'sqrt'; per the original author's
    #### note, considering all features at every split would be bagging
    #### rather than a random forest.
    def random_forest(n_trees):
        return RandomForestRegressor(n_estimators=n_trees, max_features='sqrt',
                                     random_state=random_state)

    def random_forest_no_boot(n_trees):
        return RandomForestRegressor(n_estimators=n_trees, max_features='sqrt',
                                     bootstrap=False, random_state=random_state)

    def ada(n_trees):
        return AdaBoostRegressor(n_estimators=n_trees, random_state=random_state)

    def gradient_boosting(n_trees):
        return GradientBoostingRegressor(n_estimators=n_trees, random_state=random_state)

    # (progress marker, [(summary column prefix, model factory, split), ...])
    model_families = [
        ('r', [('Random_Forest_Shuffled', random_forest, shuffled_split),
               ('Random_Forest_Ordered', random_forest, ordered_split),
               ('Random_Forest_Ordered_No_Bootstrap', random_forest_no_boot, ordered_split)]),
        ('a', [('AdaBoost_Shuffled', ada, shuffled_split),
               ('AdaBoost_Ordered', ada, ordered_split)]),
        ('g', [('Gradient_Boosting_Shuffled', gradient_boosting, shuffled_split),
               ('Gradient_Boosting_Ordered', gradient_boosting, ordered_split)]),
    ]

    ## The best average cross validation score determines the number of trees.
    ## The reported accuracy is not the accuracy of the cv model but the score
    ## of the model fitted on the 75% train / 25% test split.
    results = []
    for marker, runs in model_families:
        for prefix, make_model, (X_train, X_test, y_train, y_test) in runs:
            outcome = _fit_best_and_predict(make_model, X_train, X_test, y_train, y_test,
                                            X, y, final_df_last_row, tree_counts, cv_folds)
            results.append((prefix,) + outcome)
        print(marker)

    ### Creating a summary DataFrame to export:
    # NOTE(review): the last row was already split off above, so this is the
    # close one row before the row used for the prediction -- confirm that this
    # is the intended reference price.
    previous_day_price = final_df.iloc[-1]['Close']
    predicted_date = str(pandas.to_datetime(final_df_last_row.index, format='%Y/%m/%d').date[0])
    average_predicted_price = numpy.average([row[1] for row in results])
    predicted_difference = average_predicted_price - previous_day_price

    summary = {
        'Symbol': symbol,
        'Name': name,
        'Predicted_Date': predicted_date,
        'Price_Of_Previous_Day': previous_day_price,
        'Average_Predicted_Price': average_predicted_price,
        'Predicted_Difference': predicted_difference,
    }
    for prefix, predicted_price, n_trees, accuracy, avg_cv in results:
        summary[prefix + '_Predicted_Price'] = predicted_price
        summary[prefix + '_Best_Trees'] = n_trees
        summary[prefix + '_Best_Accuracy'] = accuracy
        summary[prefix + '_Best_Cross_Validation_Accuracy'] = avg_cv

    predictions_dir = os.path.join(working_dir, 'Predictions')
    os.makedirs(predictions_dir, exist_ok=True)
    summary_path = os.path.join(predictions_dir, base_name + ' - Prediction Summary.csv')

    if os.path.exists(summary_path):
        old_df = pandas.read_csv(summary_path, index_col=0)
        summary_df = pandas.DataFrame(summary, index=[len(old_df)])

        if old_df.empty:
            summary_df.to_csv(summary_path)
        elif str(old_df['Predicted_Date'].iloc[-1]) != predicted_date:
            # Append and persist the new row (positional lookup of the last
            # date, so the stored index values do not matter).
            pandas.concat([old_df, summary_df]).to_csv(summary_path)

    else:
        pandas.DataFrame(summary, index=[0]).to_csv(summary_path)

In [None]:
### Looping:
# Run the prediction pipeline once for every (Symbol, Description) pair in the
# stock list.
stocks_and_names_with_indices = pandas.read_csv('merged_NYSE_AMEX_removed_intercept_pattern.csv')

for symbol, description in zip(stocks_and_names_with_indices['Symbol'],
                               stocks_and_names_with_indices['Description']):
    try:
        predictions(symbol=symbol, name=description)

    except Exception as error:
        # Best effort: one failing stock must not stop the batch, but the cause
        # is reported so failures can be diagnosed afterwards.
        print('Error: ', symbol, description, '-', repr(error))