In [15]:
from pandas_datareader import data as web
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [14]:
def get_stock(ticker, start_date, end_date, s_window, l_window):
    try:
        df = web.get_data_yahoo(ticker, start=start_date, end=end_date)
        df['Return'] = df['Adj Close'].pct_change()
        df['Return'].fillna(0, inplace = True)
        df['Date'] = df.index
        df['Date'] = pd.to_datetime(df['Date'])
        df['Month'] = df['Date'].dt.month
        df['Year'] = df['Date'].dt.year 
        df['Day'] = df['Date'].dt.day
        for col in ['Open', 'High', 'Low', 'Close', 'Adj Close']:
            df[col] = df[col].round(2)
#        df['Weekday'] = df['Date'].dt.weekday_name  
        df['Weekday'] = df['Date'].dt.day_name       
        df['Week_Number'] = df['Date'].dt.strftime('%U')
        df['Year_Week'] = df['Date'].dt.strftime('%Y-%U')
        df['Short_MA'] = df['Adj Close'].rolling(window=s_window, min_periods=1).mean()
        df['Long_MA'] = df['Adj Close'].rolling(window=l_window, min_periods=1).mean()        
        col_list = ['Date', 'Year', 'Month', 'Day', 'Weekday', 
                    'Week_Number', 'Year_Week', 'Open', 
                    'High', 'Low', 'Close', 'Volume', 'Adj Close',
                    'Return', 'Short_MA', 'Long_MA']
        num_lines = len(df)
        df = df[col_list]
        print('read ', num_lines, ' lines of data for ticker: ' , ticker)
        return df
    except Exception as error:
        print(error)
        return None
    
try:
    ticker='LVMUY'
    input_dir = r'D:\systemdefalt\desktop\CS677\assignment2'
    output_file = os.path.join(input_dir, ticker + '.csv')
    df = get_stock(ticker, start_date='2015-01-01', end_date='2019-12-31', 
               s_window=14, l_window=50)
    df.to_csv(output_file, index=False)
    print('wrote ' + str(len(df)) + ' lines to file: ' + output_file)
except Exception as e:
    print(e)
    print('failed to get Yahoo stock data for ticker: ', ticker)

read  1258  lines of data for ticker:  LVMUY
wrote 1258 lines to file: D:\systemdefalt\desktop\CS677\assignment2\LVMUY.csv


In [16]:
    def weekly_return_volatility(data, start_date, end_date):
        """
        calculate the weekly mean return and volatility
        & create a new file to contain these infor
        """
        try:
            df_2 = data[data['Date'] >= start_date]
            df_2 = df_2[df_2['Date'] <= end_date]
            df_2 = df_2[['Year', 'Week_Number', 'Open', 'Adj Close', 'Return']]
            df_2.index = range(len(df_2))
            df_grouped = df_2.groupby(['Year', 'Week_Number'])['Return'].agg([np.mean, np.std])
            df_grouped.reset_index(['Year', 'Week_Number'], inplace=True)
            df_grouped.rename(columns={'mean': 'mean_return', 'std':'volatility'}, inplace=True)
            df_grouped.fillna(0, inplace=True)
            df_grouped["Open"] = df_2.groupby(["Year", "Week_Number"])["Open"].head(1).\
                                 reset_index(drop = True).copy()
            df_grouped["Adj Close"] = df_2.groupby(["Year", "Week_Number"])["Adj Close"].tail(1).\
                                      reset_index(drop = True).copy()
            return df_grouped
        except Exception as error:
            print(error)
            return None

In [21]:
    try:
        df_weekly = weekly_return_volatility(df, start_date='2018-01-01', end_date='2019-12-31')
    except Exception as e:
        print("Error in weekly_return_volatility: ", end = " ")
        print(e)

In [22]:
    def weekly_label(data, year):
        try:
            df_label = data[data["Year"] == year].copy()
            mean_return_percent50 = np.percentile(df_label["mean_return"], 50)
            volatility_percent50 = np.percentile(df_label["volatility"], 50)      
            df_label["True Label"] = np.where((df_label["mean_return"] >= mean_return_percent50) & \
                                              (df_label["volatility"] <= volatility_percent50), "Green", "Red")
            return df_label
        except Exception as error:
            print(error)
            return None

In [24]:
pip install tabulate

Collecting tabulate
  Downloading tabulate-0.8.10-py3-none-any.whl (29 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.8.10
Note: you may need to restart the kernel to use updated packages.


In [25]:
from tabulate import tabulate

In [26]:
    try:
        df_labeling = pd.DataFrame()
        for year in [2018, 2019]:
            df_year_label = weekly_label(df_weekly, year)
            label_count = df_year_label.groupby("True Label")["True Label"].size().to_frame(name = "Freq")
            print("Label Count for Year {0}".format(year))
            print(tabulate(label_count, headers = "keys", numalign = "right"), end = "\n\n")         
            df_labeling = df_labeling.append(df_year_label, ignore_index = True)
        df_labeling["Week_Number"] = df_labeling["Week_Number"].astype(int)
    except Exception as e:
        print("Error in weekly_label:", end = " ")
        print(e)

Label Count for Year 2018
True Label      Freq
------------  ------
Green             15
Red               38

Label Count for Year 2019
True Label      Freq
------------  ------
Green             14
Red               39



In [27]:
    def Logistic_Reg(train_data, test_data, predictor):
        # train the logistic regression model by stock data in year 1
        train_X = train_data[predictor].values
        scaler = StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        train_Y = train_data["True Label"].values
        classifier = LogisticRegression(solver = "lbfgs")
        classifier.fit(train_X, train_Y)
        
        # predict the labels in year 2
        test_X = test_data[predictor].values
        test_X = scaler.transform(test_X)
        pred_Y = classifier.predict(test_X)
        return pred_Y

In [28]:
    def kNN(train_data, test_data, predictor, num_neighbors):
        # train the kNN model by stock data in year 1
        train_X = train_data[predictor].values  
        scaler = StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        train_Y = train_data["True Label"].values
        classifier = KNeighborsClassifier(n_neighbors = num_neighbors, p = 2)
        classifier.fit(train_X, train_Y)
        
        # predict the labels in year 2
        test_X = test_data[predictor].values
        test_X = scaler.transform(test_X)
        pred_Y = classifier.predict(test_X)
        return pred_Y

In [29]:
    def Linear_model(train_data, test_data, predictor):
        # train the linear model by stock data in year 1
        train_X = train_data[predictor].values
        train_Y = train_data["Adj Close"].values
        lin_reg = LinearRegression(fit_intercept = True)
        lin_reg.fit(train_X, train_Y)
        
        # predict the labels in year 2
        predicted_labels = []
        prev_label = "None"
        for i in range(len(test_data)):
            test_X = np.array(test_data.iloc[i][predictor]).reshape(1, -1)
            pred_Y = lin_reg.predict(test_X)
            if i == 0:
                prev_price = train_data.iloc[-1]["Adj Close"]
            else:
                prev_price = test_data.iloc[i - 1]["Adj Close"]
            if pred_Y > prev_price:
                prev_label = "Green"
                predicted_labels.append(prev_label)
            elif pred_Y < prev_price:
                prev_label = "Red"
                predicted_labels.append(prev_label)
            else:
                if prev_label == "None":
                    predicted_labels.append(train_data.iloc[-1]["True Label"])
                else:
                    predicted_labels.append(prev_label)
        return np.asarray(predicted_labels)

In [30]:
    def AccuracyCal(actual, pred):
        cm = confusion_matrix(actual, pred)
        diagonal_sum = cm.trace()
        sum_of_all_elements = cm.sum()
        accuracy = diagonal_sum / sum_of_all_elements
        return accuracy

In [33]:
    print("\n"+ " Q1 " +"\n")
    try:
        df_2018 = df_labeling.loc[df_labeling["Year"] == 2018].copy().reset_index(drop = True)
        df_2019 = df_labeling.loc[df_labeling["Year"] == 2019].copy().reset_index(drop = True)
        accuracy_table = pd.DataFrame(columns = ["Accuracy", "Accuracy\n(w/o μ)", "Accuracy\n(w/o σ)",
                                                 "Delta_1\n(w/o μ)", "Delta_2\n(w/o σ)"],
                                      index = ["Log. Reg", "k-NN", "Linear Model"])
        actual_labels = df_2019["True Label"].values
        
        # different models' accuracy with different predictors
        predictor_list = [["mean_return", "volatility"], ["mean_return"], ["volatility"]]
        num_neighbors = 3
        for i in range(len(predictor_list)):
            features = predictor_list[i]
            log_pred = Logistic_Reg(df_2018, df_2019, features)
            kNN_pred = kNN(df_2018, df_2019, features, num_neighbors)
            lin_pred = Linear_model(df_2018, df_2019, features)
            pred_table = [log_pred, kNN_pred, lin_pred]
            accuracy_list = []
            for ele in pred_table:
                accuracy_list.append(AccuracyCal(actual_labels, ele))
            accuracy_table.iloc[:, i] = accuracy_list
        accuracy_table["Delta_1\n(w/o μ)"] = accuracy_table["Accuracy"] - accuracy_table["Accuracy\n(w/o μ)"]
        accuracy_table["Delta_2\n(w/o σ)"] = accuracy_table["Accuracy"] - accuracy_table["Accuracy\n(w/o σ)"]
        
        print("\n" + " " * 10 + "Different Models\' Accuracy with Different Predictors\n")
        print(tabulate(accuracy_table.round(3), headers = "keys", numalign = "right"), end = "\n\n") 
        
    
    except Exception as e:
        print("Error in Question 1:", end = " ")
        print(e)
    


 Q1 


          Different Models' Accuracy with Different Predictors

                Accuracy    Accuracy    Accuracy    Delta_1    Delta_2
                             (w/o μ)     (w/o σ)    (w/o μ)    (w/o σ)
------------  ----------  ----------  ----------  ---------  ---------
Log. Reg           0.792       0.717       0.774      0.075      0.019
k-NN               0.755       0.774       0.623     -0.019      0.132
Linear Model       0.679       0.679       0.679          0          0

