In [6]:
import os
import pandas as pd
from glob import glob

# Directory containing the input CSV files.
path = 'C:/Users/atlas/OneDrive/Desktop/usstock/'

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame stable from one run to the next.
all_csv_files = sorted(glob(os.path.join(path, '*.csv')))

# Fail early with a readable message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not all_csv_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read every file, then stack them into one DataFrame with a fresh 0..n-1 index.
dfs = [pd.read_csv(csv_file) for csv_file in all_csv_files]
daily_price = pd.concat(dfs, ignore_index=True)

  daily_price = pd.concat(dfs, ignore_index=True)


In [15]:
# Keep only the columns the analysis needs, under lowercase names.
# Renaming *before* selecting makes this cell safe to re-run: on a second run
# the rename is a no-op and the selection still finds 'time'/'close'/'ticker',
# whereas selecting ['Date', 'Close', 'Ticker'] first raises a KeyError.
# The trailing .copy() gives an independent frame so later column assignments
# do not trigger SettingWithCopyWarning.
daily_price = (
    daily_price
    .rename(columns={'Date':'time','Close':'close','Ticker':'ticker'})
    [['time','close','ticker']]
    .copy()
)

In [16]:
# Preview the first rows to check the column selection and renaming above.
daily_price.head()

Unnamed: 0,time,close,ticker
0,1999-11-18,31.473534,A
1,1999-11-19,28.880545,A
2,1999-11-22,31.473534,A
3,1999-11-23,28.612303,A
4,1999-11-24,29.372318,A


In [17]:
import pandas as pd
import numpy as np
from vnstock import *
from datetime import datetime, timedelta
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta



def yearly_price(df):
    """Reduce daily prices to one row per ticker and year, with forward returns.

    For every (ticker, calendar year) pair the row with the latest ``time`` is
    kept. Each kept row is then paired with the *next kept row of the same
    ticker* to obtain the forward price, return and date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``time`` (anything ``pd.to_datetime`` accepts),
        ``ticker`` and ``close``. The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``ticker``, ``close``, ``year``, ``next year price``,
        ``return`` and ``next date``. Rows keep their original index labels.
        The last row of each ticker has NaN/NaT in the three forward columns.

    Notes
    -----
    "Next" means the next year that has data for the ticker; if a ticker has a
    gap in its history the return spans more than one calendar year.
    """
    # Work on a narrow copy so the caller's frame is not modified (the previous
    # version overwrote its 'time' column and added a 'year' column in place).
    prices = df[['time', 'ticker', 'close']].copy()
    prices['time'] = pd.to_datetime(prices['time'])
    prices['year'] = prices['time'].dt.year

    # Index label of the latest observation in each (ticker, year) group.
    # The groupby result is ordered by ticker, then year, which is the order
    # the shift(-1) calls below rely on.
    last_row_labels = prices.groupby(['ticker', 'year'])['time'].idxmax()
    yearly = prices.loc[last_row_labels, ['time', 'ticker', 'close', 'year']].copy()

    yearly['next year price'] = yearly.groupby('ticker')['close'].shift(-1)
    yearly['return'] = yearly['next year price'] / yearly['close'] - 1
    yearly['next date'] = yearly.groupby('ticker')['time'].shift(-1)

    return yearly


def ranking(df):
    """Rank tickers by return within each year and lay the ranks out by date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time``, ``ticker``, ``year`` and ``return`` (the output
        of ``yearly_price`` has this shape). The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``time`` (formatted as ``'YYYY-MM-DD'`` strings in
        the first column, ``time``) and one numeric column per ticker holding
        that ticker's rank. Rank 1 is the highest return of the year; ties get
        the average rank (pandas default). Date/ticker combinations without an
        observation are filled with 0.

    Notes
    -----
    NOTE(review): ranks are computed per *year* but the table is keyed by the
    exact *date*, so tickers whose last observation of a year falls on
    different days end up on different rows. Confirm this is intended.
    """
    # assign() returns a new frame, so the caller's data does not gain a
    # 'ranking' column as a side effect.
    ranked = df.assign(
        ranking=df.groupby('year')['return'].rank(ascending=False)
    )

    # Pivot straight into the target orientation (dates down, tickers across).
    # The previous pivot -> transpose -> header shuffle produced the same cells
    # but turned every ticker column into object dtype.
    table = pd.pivot_table(
        data=ranked,
        values='ranking',
        index='time',
        columns='ticker',
        aggfunc='sum',
        fill_value=0,
    )
    table = table.reset_index()
    table.columns.name = None

    table['time'] = pd.to_datetime(table['time']).dt.strftime('%Y-%m-%d')

    return table



def markov(df, n_rankings=400):
    """Produce a per-ticker distribution over rankings for every date with history.

    For each distinct value of ``df['time']`` that has at least one earlier
    date in ``df``, one row per ticker column is emitted containing a
    probability distribution over the rankings ``1..n_rankings``.

    WARNING: the distribution is a PLACEHOLDER. It is the product of a random
    start distribution and a random row-stochastic transition matrix drawn
    from NumPy's global random state; it does not depend on the ranking
    history. The previous version also built an empirical matrix per ticker
    (``pd.crosstab(s.shift(), s, normalize='index')`` reindexed to
    ``1..n_rankings``) but never used it, so that dead computation was removed.
    Wire an empirical matrix in here once the model is settled.

    Parameters
    ----------
    df : pandas.DataFrame
        A ``time`` column (anything ``pd.to_datetime`` accepts) plus one column
        per ticker. The frame is not modified.
    n_rankings : int, default 400
        Number of ranking states, i.e. the size of the square transition matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Ticker'``, the integers ``1..n_rankings`` and ``'time'``.
        ``time`` is the latest date strictly before the threshold date, i.e.
        the reporting date, not the forecast date (the forecast date should be
        the date after it). The frame is empty when no date has any history or
        when ``df`` has no ticker columns; previously this raised ValueError.

    Notes
    -----
    Cost grows as dates x tickers x n_rankings**2 because a fresh matrix is
    drawn for every ticker on every date; on a full universe this is very slow.
    """
    rank_labels = list(range(1, n_rankings + 1))
    output_columns = ['Ticker', *rank_labels, 'time']

    ticker_columns = [col for col in df.columns if col != 'time']
    if not ticker_columns:
        return pd.DataFrame(columns=output_columns)

    # Loop-invariant: normalise the dates once instead of once per threshold.
    # ISO 'YYYY-MM-DD' strings sort chronologically, so the string comparison
    # below is a date comparison.
    iso_times = pd.to_datetime(df['time']).dt.strftime('%Y-%m-%d')

    per_date_frames = []
    for threshold_date in df['time'].unique():
        threshold_iso = pd.Timestamp(threshold_date).strftime('%Y-%m-%d')
        history_times = iso_times[iso_times < threshold_iso]

        # Nothing earlier than this date: skip before doing any per-ticker
        # work (the previous version computed everything and then threw it away).
        if history_times.empty:
            print("Skipping threshold date:", threshold_date, "because maximum date is NaN")
            continue

        # This is the reporting date, not the forecast date.
        max_time = pd.Timestamp(history_times.max())

        rows = []
        for _ in ticker_columns:
            # Random row-stochastic transition matrix (each row sums to 1).
            transition = np.random.rand(n_rankings, n_rankings)
            transition /= transition.sum(axis=1, keepdims=True)

            # Random start distribution (sums to 1).
            start = np.random.rand(n_rankings)
            start /= start.sum()

            # One step of the chain: start (1 x n) times transition (n x n).
            rows.append(start @ transition)

        forecast = pd.DataFrame(np.vstack(rows), columns=rank_labels)
        forecast.insert(0, 'Ticker', ticker_columns)
        forecast['time'] = max_time
        per_date_frames.append(forecast)

    if not per_date_frames:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(per_date_frames, ignore_index=True)


# Output directory for the CSV files written below.
path = 'C:/Users/atlas/OneDrive/Desktop/usstock2/'
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(path, exist_ok=True)

# NOTE: the assignments below rebind `yearly_price` and `ranking` from the
# functions to their results. The names are kept because later cells may rely
# on them; re-run the function definitions before running these lines again.

# yearly price
yearly_price = yearly_price(daily_price)
# os.path.join avoids the doubled separator that `path + '/...'` produced.
yearly_price.to_csv(os.path.join(path, 'yearly_price.csv'), index=False)

# ranking
ranking = ranking(yearly_price)
ranking.to_csv(os.path.join(path, 'ranking.csv'), index=False)


# markov
df = markov(ranking)
df.to_csv(os.path.join(path, 'df.csv'), index=False)


Skipping threshold date: 1962-12-31 because maximum date is NaN



KeyboardInterrupt

