In [None]:
import pandas as pd
import numpy as np
from vnstock import *
from datetime import datetime, timedelta
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta

def yearly_price(csv_file_path):
    """Build a per-ticker, per-year closing-price table from a daily price CSV.

    The CSV is expected to provide ``time``, ``ticker`` and ``close`` columns.
    For every (ticker, calendar year) pair only the row with the latest
    ``time`` is kept.

    Args:
        csv_file_path: Anything ``pandas.read_csv`` accepts (path or buffer).

    Returns:
        DataFrame with columns ``time``, ``ticker``, ``close``, ``year``,
        ``next year price``, ``return`` and ``next date``. The last three are
        derived from the following kept row of the same ticker and are
        missing on each ticker's final row.
    """
    daily = pd.read_csv(csv_file_path)
    daily['time'] = pd.to_datetime(daily['time'])
    daily['year'] = daily['time'].dt.year

    # Index labels of the latest observation for each ticker and year.
    last_labels = daily.groupby(['ticker', 'year'])['time'].idxmax()
    year_end = daily.loc[last_labels, ['time', 'ticker', 'close', 'year']].copy()

    # Look one kept row ahead within each ticker.
    next_close = year_end.groupby('ticker')['close'].shift(-1)
    next_time = year_end.groupby('ticker')['time'].shift(-1)

    year_end['next year price'] = next_close
    year_end['return'] = next_close / year_end['close'] - 1
    year_end['next date'] = next_time
    return year_end

def ranking(df):
    """Rank returns within each year and reshape to one row per date.

    Within every ``year`` the ``return`` values are ranked in descending
    order (highest return gets rank 1, ties share the average rank). The ranks
    are then pivoted so that each distinct ``time`` becomes a row and each
    ticker becomes a column; (ticker, time) combinations without data are
    filled with 0.

    Args:
        df: DataFrame with at least ``time``, ``ticker``, ``year`` and
            ``return`` columns. It is not modified.

    Returns:
        DataFrame whose first column ``time`` holds ``'%Y-%m-%d'`` strings and
        whose remaining columns are named after the tickers and hold the
        summed rank for that date.
    """
    # Work on a copy so the caller's frame does not silently gain a
    # 'ranking' column.
    ranked = df.copy()
    ranked['ranking'] = ranked.groupby('year')['return'].rank(ascending=False)

    table = pd.pivot_table(data=ranked, values="ranking", index="ticker",
                           columns="time", aggfunc='sum', fill_value=0)

    # Flip the table so dates run down the rows. The ticker labels travel
    # through the transpose as the first row and are then promoted to the
    # header; this keeps the output layout callers already depend on.
    table = table.reset_index(drop=False)
    table = table.T
    table = table.reset_index(drop=False)
    table.columns = table.iloc[0]
    table = table.drop(0)
    table = table.reset_index(drop=True)

    # The header cell above the date column still reads 'ticker'.
    table = table.rename(columns={'ticker': 'time'})
    table['time'] = pd.to_datetime(table['time']).dt.strftime('%Y-%m-%d')
    return table

def markov(df, threshold_dates=None, matrix_size=400):
    """Produce a probability row over ranks 1..matrix_size for every ticker.

    For each threshold date the latest ``time`` strictly before the threshold
    is looked up. If one exists, one output row is generated per ticker column
    (every column after the first) and stamped with that date.

    NOTE(review): each row is the product of a randomly drawn probability
    vector and a randomly drawn row-normalised matrix (``np.random.rand``).
    The ranking values in ``df`` do not influence the numbers. The original
    code also built an empirical transition matrix with ``pd.crosstab`` but
    never used it, so that dead computation has been removed. Confirm whether
    the random placeholder is intended.

    Args:
        df: Table with a ``time`` column first (a column named ``ticker`` is
            treated as ``time``) followed by one column per ticker.
        threshold_dates: Iterable of cut-off dates; defaults to
            ``['2023-12-31']``.
        matrix_size: Number of rank states; defaults to 400.

    Returns:
        DataFrame with columns ``Ticker``, ``1..matrix_size`` and ``time``.
        An empty DataFrame is returned when no threshold yields any rows
        (previously this raised ``ValueError`` from ``pd.concat``).
    """
    if threshold_dates is None:
        threshold_dates = ['2023-12-31']

    frame = df.rename(columns={'ticker': 'time'})
    # Only the calendar day matters for the cut-off comparison.
    times = pd.to_datetime(frame['time']).dt.normalize()
    # Assumes the date column is first, as produced by ranking().
    tickers = list(frame.columns[1:])
    states = range(1, matrix_size + 1)

    per_threshold = []
    for threshold_date in threshold_dates:
        cutoff = pd.Timestamp(threshold_date).normalize()
        max_time = times[times < cutoff].max()

        if pd.isnull(max_time):
            print("Skipping threshold date:", threshold_date, "because maximum date is NaN")
            continue
        if not tickers:
            # Nothing to predict for; avoid stacking an empty list.
            continue

        rows = []
        for _ in tickers:
            # Draw order (matrix first, then vector) is kept so that results
            # under a fixed np.random seed are unchanged.
            transition = np.random.rand(matrix_size, matrix_size)
            transition = transition / transition.sum(axis=1, keepdims=True)
            start = np.random.rand(matrix_size, 1)
            start = start / start.sum()
            rows.append(np.dot(start.T, transition)[0])

        result = pd.DataFrame(np.vstack(rows), columns=states)
        result.insert(0, 'Ticker', tickers)
        result['time'] = max_time
        per_threshold.append(result)

    if not per_threshold:
        return pd.DataFrame()
    return pd.concat(per_threshold, ignore_index=True)

def run_program(input_path, output_path_prefix):
    """Run the three pipeline stages and write each result to a CSV file.

    Reads ``daily_price.csv`` from ``input_path``, then writes, in order,
    ``<prefix>_yearly_price.csv``, ``<prefix>_ranking.csv`` and
    ``<prefix>_df.csv`` where ``<prefix>`` is ``output_path_prefix``.
    All files are written without the index column.
    """
    def _save(frame, suffix):
        # Every stage output shares the same prefix and CSV options.
        frame.to_csv(output_path_prefix + suffix, index=False)

    prices_by_year = yearly_price(input_path + '/daily_price.csv')
    _save(prices_by_year, '_yearly_price.csv')

    rank_table = ranking(prices_by_year)
    _save(rank_table, '_ranking.csv')

    prediction = markov(rank_table)
    _save(prediction, '_df.csv')

# Execute the pipeline three times; each pass writes its CSV files under a
# distinct prefix (df1, df2, df3) inside the same folder.
input_path = 'C:/Users/atlas/OneDrive/Desktop/vnstock'
for i in (1, 2, 3):
    output_path_prefix = f'C:/Users/atlas/OneDrive/Desktop/vnstock/df{i}'
    run_program(input_path, output_path_prefix)
