# In [None]:
import pandas, json, numpy, requests, os, datetime, pytz, tweepy, sqlite3, time, re, random, matplotlib.pyplot as plt, sklearn, statsmodels.api as sm
from bs4 import BeautifulSoup
from scipy.signal import find_peaks

### This script runs a statistical analysis of the moving-average intercept pattern: it measures how often the price
### follows the pattern and, when it does, how long the trend continues (by fitting a linear regression).

#### See bottom of script for single example

#_________________________Mostly Copied from Patterns_Finder.py________________________________________
# Load the tab-separated NYSE and AMEX listings, indexed by ticker symbol.
NYSE_csv = pandas.read_csv('NYSE.txt', sep="\t", header=0).set_index('Symbol')

AMEX_csv = pandas.read_csv('AMEX.txt', sep="\t", header=0).set_index('Symbol')

# With no `on=` argument pandas merges on every column the two tables share;
# how='outer' keeps the rows of both exchanges.
stock_exchange_ticks_and_names = pandas.merge(NYSE_csv.reset_index(), AMEX_csv.reset_index(), how='outer')
stock_exchange_ticks_and_names.to_csv('merged_NYSE_AMEX.csv')
stock_exchange_ticks_and_names_copy = stock_exchange_ticks_and_names.copy().dropna()

# Raw string so that `\|` and `\]` reach the regex engine untouched instead of
# being (invalid) string escapes; the compiled pattern is unchanged.
regex1 = re.compile(r'[@_!#$%^&*()<>?/\|}{~:[\].]')
regex2 = re.compile('Cl ')


def _contains_pattern(series, pattern):
    """Return a boolean Series that is True where `pattern` is found in the entry."""
    return series.map(lambda text: pattern.search(text) is not None).astype(bool)


# A listing is dropped when its description contains a special character or
# the text 'Cl ', or when its symbol contains a special character.
_descriptions = stock_exchange_ticks_and_names_copy['Description']
_symbols = stock_exchange_ticks_and_names_copy['Symbol']
_excluded = (
    _contains_pattern(_descriptions, regex1)
    | _contains_pattern(_descriptions, regex2)
    | _contains_pattern(_symbols, regex1)
)

# Boolean-mask selection keeps the original index labels and yields the same
# two columns the former row-by-row `.loc` assignment produced.
stock_exchange_ticks_and_names_removed = stock_exchange_ticks_and_names_copy.loc[~_excluded, ['Symbol', 'Description']]

stock_indices_df = pandas.DataFrame({'Description': ['S&P', 'Dow', "Nasdaq"], 'Symbol': ['.INX', '.DJI', ".IXIC"]})

stocks_and_names_with_indices = pandas.concat([stock_exchange_ticks_and_names_removed, stock_indices_df])
# set_index followed by reset_index moves 'Symbol' to the first column and
# replaces the concatenated index with a fresh 0..n-1 range.
stocks_and_names_with_indices = stocks_and_names_with_indices.set_index('Symbol')
stocks_and_names_with_indices = stocks_and_names_with_indices.reset_index()
stocks_and_names_with_indices.to_csv('merged_NYSE_AMEX_removed_stats_linear.csv')

##_____________________________Definition Start____________________________________________________________

def _finite_mean(values):
    """Return the mean of the finite entries of `values`, or NaN when there are none."""
    finite = numpy.asarray(list(values), dtype=float)
    finite = finite[numpy.isfinite(finite)]
    return float(finite.mean()) if finite.size else float('nan')


def _linear_fit(prices):
    """Least-squares line through `prices` plotted against 0, 1, ..., len(prices) - 1.

    Returns `(slope, y_intercept, r_squared)`. `prices` must hold at least two
    values. `r_squared` is NaN when every price is identical, because the
    total sum of squares is then zero and the ratio is undefined.
    """
    y_values = numpy.asarray(prices, dtype=float)
    x_values = numpy.arange(len(y_values), dtype=float)

    denominator = x_values.dot(x_values) - x_values.mean() * x_values.sum()
    slope = (x_values.dot(y_values) - y_values.mean() * x_values.sum()) / denominator
    y_intercept = (y_values.mean() * x_values.dot(x_values) - x_values.mean() * x_values.dot(y_values)) / denominator

    residuals = y_values - (slope * x_values + y_intercept)
    deviations = y_values - y_values.mean()
    total_sum_of_squares = deviations.dot(deviations)
    if total_sum_of_squares > 0:
        r_squared = 1 - residuals.dot(residuals) / total_sum_of_squares
    else:
        r_squared = float('nan')

    return float(slope), float(y_intercept), float(r_squared)


def _window_prices(price_df, intercept_dates, days):
    """Return, per intercept date, the closing prices of that row and up to `days` rows after it.

    `price_df` must carry a 0..n-1 index. A window that runs past the end of
    the price history is simply shorter. Raises IndexError when an intercept
    date has no matching row in `price_df`.
    """
    windows = []
    for intercept_date in intercept_dates:
        matching_rows = price_df.index[price_df['Date'] == intercept_date]
        if len(matching_rows) == 0:
            raise IndexError('intercept date ' + str(intercept_date) + ' not found in price data')
        start = matching_rows[0]
        windows.append(price_df['Close'].iloc[start:start + days + 1].to_numpy(dtype=float))
    return windows


def _summarise_windows(windows, follows_pattern):
    """Aggregate price change, pattern frequency and linear-fit quality over `windows`.

    `follows_pattern` receives a window's mean price change relative to its
    first (intercept) price and returns True when the window moved in the
    expected direction. Windows with no row after the intercept are ignored.
    Every aggregate is NaN when no usable window exists.
    """
    mean_changes = []
    slopes = []
    r_squares = []

    for prices in windows:
        if len(prices) < 2:
            continue  # nothing after the intercept row: no change, no fit
        mean_changes.append(float(numpy.mean(prices[1:] - prices[0])))
        slope, _y_intercept, r_squared = _linear_fit(prices)
        slopes.append(slope)
        r_squares.append(r_squared)

    usable_changes = [change for change in mean_changes if numpy.isfinite(change)]
    if usable_changes:
        frequency = sum(1 for change in usable_changes if follows_pattern(change)) / len(usable_changes)
    else:
        frequency = float('nan')

    return {
        'difference': _finite_mean(mean_changes),
        'frequency': frequency,
        'r_squared': _finite_mean(r_squares),
        'slope': _finite_mean(slopes),
    }


def stats_linear(symbol, name, days=30):
    """Summarise how the price behaves in the `days` rows after each moving-average intercept.

    Reads `<cwd>\\Daily Stock Prices\\<symbol> - <name>.csv` (columns `Date`
    and `Close`) and `<cwd>\\Patterns\\<symbol> - <name> - Patterns.json`
    (keys `Moving_Average_30_Down_Before_Intercept_Dates` and
    `Moving_Average_30_Up_Before_Intercept_Dates`).

    For every intercept date a window is taken: the intercept row plus up to
    `days` following rows. Per window the function computes the mean of
    (close - close at intercept) over the following rows and a least-squares
    line through the whole window. Per pattern these are averaged over all
    windows, and the frequency is the share of windows whose mean change is
    positive ("Down" pattern) or negative ("Up" pattern).

    Returns a dict with the keys `Symbol`, `Name`, `Days`,
    `Average_Down_Difference`, `Average_Up_Difference`,
    `Down_Percentage_Frequency`, `Up_Percentage_Frequency`,
    `Down_Linear_R_Sqaure`, `Up_Linear_R_Sqaure`, `Down_Linear_Slope` and
    `Up_Linear_Slope`. Statistics of a pattern with no usable window are NaN.

    Raises whatever opening or parsing the two files raises, KeyError when a
    pattern key is missing, and IndexError when an intercept date is absent
    from the price data.
    """
    price_df = pandas.read_csv(os.getcwd() + '\\Daily Stock Prices\\' + symbol + ' - ' + name + '.csv', index_col='Date', parse_dates=True).reset_index().sort_values('Date', ignore_index=True)
    with open(os.getcwd() + '\\Patterns\\' + symbol + ' - ' + name + ' - Patterns.json', 'r') as json_reader:
        patterns_json = json.load(json_reader)

    ## SMA-30 is approaching SMA-60 from below. After intercept, SMA-30 is above SMA-60
    down_windows = _window_prices(price_df, patterns_json['Moving_Average_30_Down_Before_Intercept_Dates'], days)

    ## SMA-30 is approaching SMA-60 from above. After intercept, SMA-30 is below SMA-60
    up_windows = _window_prices(price_df, patterns_json['Moving_Average_30_Up_Before_Intercept_Dates'], days)

    # The "Down" pattern counts as followed when the price rose afterwards,
    # the "Up" pattern when it fell.
    down_stats = _summarise_windows(down_windows, lambda change: change > 0)
    up_stats = _summarise_windows(up_windows, lambda change: change < 0)

    summary_dict = {
        'Symbol': symbol,
        'Name': name,
        'Days': days,
        'Average_Down_Difference': down_stats['difference'],
        'Average_Up_Difference': up_stats['difference'],
        'Down_Percentage_Frequency': down_stats['frequency'],
        'Up_Percentage_Frequency': up_stats['frequency'],
        'Down_Linear_R_Sqaure': down_stats['r_squared'],
        'Up_Linear_R_Sqaure': up_stats['r_squared'],
        'Down_Linear_Slope': down_stats['slope'],
        'Up_Linear_Slope': up_stats['slope'],
    }

    return summary_dict




### _________________________________________________________________Looping Begins:_________________________

# Window lengths (in rows after the intercept) evaluated for every stock.
days_list = list(range(1, 31))

# Column order of each per-stock CSV written to the 'Linear Fits' folder.
summary_columns = [
    'Symbol',
    'Name',
    'Days',
    'Average_Down_Difference',
    'Average_Up_Difference',
    'Down_Percentage_Frequency',
    'Up_Percentage_Frequency',
    'Down_Linear_R_Sqaure',
    'Up_Linear_R_Sqaure',
    'Down_Linear_Slope',
    'Up_Linear_Slope',
]

for x, y in stocks_and_names_with_indices.iterrows():
    # Reset first so a stock that fails never leaves the previous stock's table in this name.
    combined_df = pandas.DataFrame()

    print(y['Symbol'], y['Description'])
    try:
        # One summary row per window length; any failure skips the whole stock.
        summary_rows = [stats_linear(symbol=y['Symbol'], name=y['Description'], days=u) for u in days_list]

        combined_df = pandas.DataFrame(summary_rows, columns=summary_columns)

        if not os.path.exists(os.getcwd() + '\\Linear Fits\\'):
            os.makedirs(os.getcwd() + '\\Linear Fits\\')

        combined_df.to_csv(os.getcwd() + '\\Linear Fits\\' + y['Symbol'] + ' - ' + y['Description'] + '.csv')

        print(1)
    except Exception as error:
        # Best effort: keep going with the next stock, but say why this one was skipped.
        print('Skipped', y['Symbol'], '-', type(error).__name__ + ':', error)
# Overall best length of how long the effects of the pattern lasts: (RUN TWO SETS OF CODE BELOW INSTEAD)

#-----best_days_summary = {'R': numpy.average(days_of_best_average_R), 'Down_Price_Difference': numpy.average(days_of_best_average_down_difference), 'Up_Price_Difference': numpy.average(days_of_best_average_up_difference), 'Down_Frequency': numpy.average(days_of_best_average_down_frequency), 'Up_Frequency': numpy.average(days_of_best_average_up_frequency)}

#-----best_days_summary_df = pandas.DataFrame(best_days_summary, index=[0])
#-----best_days_summary_df.to_csv(os.getcwd() + '\\Linear Fits\\Overall Averages.csv')





# For best days/time range
days_of_best_average_down_R = []
days_of_best_average_down_difference = []
days_of_best_average_up_difference = []
days_of_best_average_down_frequency = []
days_of_best_average_up_frequency = []
days_of_best_average_up_slope = []
days_of_best_average_down_slope = []
days_of_best_average_up_R = []

for x, y in stocks_and_names_with_indices.iterrows():
    try:
        difference_df = pandas.read_csv(os.getcwd() + '\\Linear Fits\\' + y['Symbol'] + ' - ' + y['Description'] + '.csv')
        print(y['Symbol'], y['Description'])
        days_of_best_average_down_R.append(numpy.nanmean(difference_df.Days[difference_df['Down_Linear_R_Sqaure'] == max(difference_df.loc[1:,'Down_Linear_R_Sqaure'])]))
        days_of_best_average_up_R.append(numpy.nanmean(difference_df.Days[difference_df['Up_Linear_R_Sqaure'] == max(difference_df.loc[1:,'Up_Linear_R_Sqaure'])]))
        days_of_best_average_down_difference.append(numpy.nanmean(difference_df.Days[difference_df['Average_Down_Difference'] == max(difference_df['Average_Down_Difference'])]))
        days_of_best_average_up_difference.append(numpy.nanmean(difference_df.Days[difference_df['Average_Up_Difference'] == min(difference_df['Average_Up_Difference'])]))
        days_of_best_average_up_frequency.append(numpy.nanmean(difference_df.Days[difference_df['Up_Percentage_Frequency'] == max(difference_df['Up_Percentage_Frequency'])]))
        days_of_best_average_down_frequency.append(numpy.nanmean(difference_df.Days[difference_df['Down_Percentage_Frequency'] == max(difference_df['Down_Percentage_Frequency'])]))
        days_of_best_average_up_slope.append(numpy.nanmean(difference_df.Days[difference_df['Up_Linear_Slope'] == min(difference_df['Up_Linear_Slope'])]))
        days_of_best_average_down_slope.append(numpy.nanmean(difference_df.Days[difference_df['Down_Linear_Slope'] == max(difference_df['Down_Linear_Slope'])]))        
        print(7)
    except Exception:
        pass


best_days_summary = {'Below_R': numpy.nanmean(days_of_best_average_down_R), 'Above_R': numpy.nanmean(days_of_best_average_up_R), 'Down_Price_Difference': numpy.nanmean(days_of_best_average_down_difference), 'Up_Price_Difference': numpy.nanmean(days_of_best_average_up_difference), 'Down_Frequency': numpy.nanmean(days_of_best_average_down_frequency), 'Up_Frequency': numpy.nanmean(days_of_best_average_up_frequency), 'Down_Slope': numpy.nanmean(days_of_best_average_down_slope), 'Up_Slope': numpy.nanmean(days_of_best_average_up_slope)}

best_days_summary_df = pandas.DataFrame(best_days_summary, index=['Days'])

# For the stats for the best days
days_of_best_average_down_R = []
days_of_best_average_down_difference = []
days_of_best_average_up_difference = []
days_of_best_average_down_frequency = []
days_of_best_average_up_frequency = []
days_of_best_average_up_slope = []
days_of_best_average_down_slope = []
days_of_best_average_up_R = []

for x, y in stocks_and_names_with_indices.iterrows():
    try:
        difference_df = pandas.read_csv(os.getcwd() + '\\Linear Fits\\' + y['Symbol'] + ' - ' + y['Description'] + '.csv')
        print(y['Symbol'], y['Description'])
        days_of_best_average_down_R.append(numpy.nanmean(difference_df['Down_Linear_R_Sqaure'][difference_df['Down_Linear_R_Sqaure'] == max(difference_df.loc[1:, 'Down_Linear_R_Sqaure'])]))
        days_of_best_average_up_R.append(numpy.nanmean(difference_df['Up_Linear_R_Sqaure'][difference_df['Up_Linear_R_Sqaure'] == max(difference_df.loc[1:,'Up_Linear_R_Sqaure'])]))
        days_of_best_average_down_difference.append(numpy.nanmean(difference_df['Average_Down_Difference'][difference_df['Average_Down_Difference'] == max(difference_df['Average_Down_Difference'])]))
        days_of_best_average_up_difference.append(numpy.nanmean(difference_df['Average_Up_Difference'][difference_df['Average_Up_Difference'] == min(difference_df['Average_Up_Difference'])]))
        days_of_best_average_up_frequency.append(numpy.nanmean(difference_df['Up_Percentage_Frequency'][difference_df['Up_Percentage_Frequency'] == max(difference_df['Up_Percentage_Frequency'])]))
        days_of_best_average_down_frequency.append(numpy.nanmean(difference_df['Down_Percentage_Frequency'][difference_df['Down_Percentage_Frequency'] == max(difference_df['Down_Percentage_Frequency'])]))
        days_of_best_average_up_slope.append(numpy.nanmean(difference_df['Up_Linear_Slope'][difference_df['Up_Linear_Slope'] == min(difference_df['Up_Linear_Slope'])]))
        days_of_best_average_down_slope.append(numpy.nanmean(difference_df['Down_Linear_Slope'][difference_df['Down_Linear_Slope'] == max(difference_df['Down_Linear_Slope'])]))
        print(9)
    except Exception:
        pass

new_difference_df = []

for p in days_of_best_average_down_R:
    if p != float('-inf'):
        new_difference_df.append(p)

best_days_stats = {'Below_R': numpy.nanmean(new_difference_df), 'Above_R': numpy.nanmean(days_of_best_average_up_R), 'Down_Price_Difference': numpy.nanmean(days_of_best_average_down_difference), 'Up_Price_Difference': numpy.nanmean(days_of_best_average_up_difference), 'Down_Frequency': numpy.nanmean(days_of_best_average_down_frequency), 'Up_Frequency': numpy.nanmean(days_of_best_average_up_frequency), 'Down_Slope': numpy.nanmean(days_of_best_average_down_slope), 'Up_Slope': numpy.nanmean(days_of_best_average_up_slope)}

best_days_stats_summary_df = pandas.DataFrame(best_days_stats, index=['Stats'])

best_summary_df = pandas.concat([best_days_summary_df, best_days_stats_summary_df])
best_summary_df.to_csv(os.getcwd() + '\\Linear Fits\\Overall Averages.csv')

aggg = pandas.read_csv(os.getcwd() + '\\Linear Fits\\Overall Averages.csv')