In [1]:
import pandas as pd
import numpy as np
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


In [2]:
def read_csv(csv_name):  
    # Loading data
    file_path = Path("../input_data/aapl_input.csv")
    df = pd.read_csv(file_path)
    df.head()
    
    return df

In [3]:
def adjusted_df(df):
    # Dataframe with Date, Adj close,  Volume, ts_polarity, twitter_volume of APPL
    df = df[["Date", "Adj Close", "Volume", "compound", "Count"]]
    df.head()
    
    return df

In [4]:
def set_index(df):
    # Setting Index as Date
    df = df.dropna()
    df.set_index("Date", inplace = True)
    
    return df

In [5]:
def assign_values(df):   
    # Sorting ts_polarity into Positive, Negative and Neutral sentiment

    sentiment = [] 
    for score in df['compound']:
        if score >= 0.05 :
              sentiment.append("Positive") 
        elif score <= - 0.05 : 
              sentiment.append("Negative")        
        else : 
            sentiment.append("Neutral")   

    df["Sentiment"] = sentiment
    
    return df

In [6]:
def set_trend(df):   
    #Stock Trend based on difference between current price to previous day price and coverting them to '0' as fall and '1' as rise in stock price
    df['Price Diff'] = df['Adj Close'].diff()
    df.dropna(inplace = True)
    df['Trend'] = np.where(
        df['Price Diff'] > 0 , 1, 0)

    df.head()
    
    return df

In [7]:
def binary_encoding(df):  
    # Binary encoding Sentiment column
    trend = df[["Adj Close", "Volume", 'Count', "Sentiment", "Trend"]]
    trend = pd.get_dummies(trend, columns=["Sentiment"])
    trend.head()
    
    return trend

In [8]:
def define_features(trend):    
    # Defining features set
    X = trend.copy()
    X.drop("Trend", axis=1, inplace=True)
    X.head()
    
    return X


In [9]:
def define_target(trend):   
    # Defining target vector
    y = trend["Trend"].values.reshape(-1, 1)
    y[:5]
    
    return y

In [10]:
def split_data(X, y): 
    # Splitting into Train and Test sets
    split = int(0.7 * len(X))

    X_train = X[: split]
    X_test = X[split:]

    y_train = y[: split]
    y_test = y[split:]
    
    return X_train, X_test, y_train, y_test

In [11]:
def scale_data(X_train, X_test):    
    # Using StandardScaler to scale features data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled

In [12]:
def fit_model(X_train_scaled, y_train):   
    # Create RFClassifier model
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

    # Fit the model
    rf_model = rf_model.fit(X_train_scaled, y_train.ravel())  
    
    return rf_model

In [29]:
def make_prediction(model, X_test_scaled, y_test, stock_name):   
    # Make predictions
    predictions = model.predict(X_test_scaled)
    pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()}).head(20)

    # Generate accuracy score for predictions using y_test
    acc_score = accuracy_score(y_test, predictions)
    print(f"Accuracy Score : {acc_score} for " + stock_name)
    
    return predictions

In [14]:
def generate_conf_matrix(y_test, predictions):    
    # Generating the confusion matrix
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"],
        columns=["Predicted 0", "Predicted 1"]
    )

    # Displaying results
    display(cm_df)

In [15]:
aapl_df = read_csv("../input_data/aapl_input.csv")
btc_df = read_csv("../input_data/btc_input.csv")
nflx_df = read_csv("../input_data/nflx_input.csv")
pfe_df = read_csv("../input_data/pfe_input.csv")
msft_df = read_csv("../input_data/msft_input.csv")
tsla_df = read_csv("../input_data/tsla_input.csv")

In [16]:
aapl_df = adjusted_df(aapl_df)
btc_df = adjusted_df(btc_df)
nflx_df = adjusted_df(nflx_df)
pfe_df = adjusted_df(pfe_df)
msft_df = adjusted_df(msft_df)
tsla_df = adjusted_df(tsla_df)

In [17]:
aapl_df = set_index(aapl_df)
btc_df = set_index(btc_df)
nflx_df = set_index(nflx_df)
pfe_df = set_index(pfe_df)
msft_df = set_index(msft_df)
tsla_df = set_index(tsla_df)

In [18]:
aapl_df = assign_values(aapl_df)
btc_df = assign_values(btc_df)
nflx_df = assign_values(nflx_df)
pfe_df = assign_values(pfe_df)
msft_df = assign_values(msft_df)
tsla_df = assign_values(tsla_df)

In [19]:
aapl_df = set_trend(aapl_df)
btc_df = set_trend(btc_df)
nflx_df = set_trend(nflx_df)
pfe_df = set_trend(pfe_df)
msft_df = set_trend(msft_df)
tsla_df = set_trend(tsla_df)

In [20]:
aapl_trend = binary_encoding(aapl_df)
btc_trend = binary_encoding(btc_df)
nflx_trend = binary_encoding(nflx_df)
pfe_trend = binary_encoding(pfe_df)
msft_trend = binary_encoding(msft_df)
tsla_trend = binary_encoding(tsla_df)

In [21]:
X_aapl = define_features(aapl_trend)
X_btc = define_features(btc_trend)
X_nflx = define_features(nflx_trend)
X_pfe = define_features(pfe_trend)
X_msft = define_features(msft_trend)
X_tsla = define_features(tsla_trend)

In [22]:
y_aapl = define_target(aapl_trend)
y_btc = define_target(btc_trend)
y_nflx = define_target(nflx_trend)
y_pfe = define_target(pfe_trend)
y_msft = define_target(msft_trend)
y_tsla = define_target(tsla_trend)

In [30]:
X_train_aapl, X_test_aapl, y_train_aapl, y_test_aapl = split_data(X_aapl, y_aapl)
X_train_btc, X_test_btc, y_train_btc, y_test_btc = split_data(X_btc, y_btc)
X_train_nflx, X_test_nflx, y_train_nflx, y_test_nflx = split_data(X_nflx, y_nflx)
X_train_pfe, X_test_pfe, y_train_pfe, y_test_pfe = split_data(X_pfe, y_pfe)
X_train_msft, X_test_msft, y_train_msft, y_test_msft = split_data(X_msft, y_msft)
X_train_tsla, X_test_tsla, y_train_tsla, y_test_tsla = split_data(X_tsla, y_tsla)

In [31]:
X_train_scaled_aapl, X_test_scaled_aapl = scale_data(X_train_aapl, X_test_aapl)
X_train_scaled_btc, X_test_scaled_btc = scale_data(X_train_btc, X_test_btc)
X_train_scaled_nflx, X_test_scaled_nflx = scale_data(X_train_nflx, X_test_nflx)
X_train_scaled_pfe, X_test_scaled_pfe = scale_data(X_train_pfe, X_test_pfe)
X_train_scaled_msft, X_test_scaled_msft = scale_data(X_train_msft, X_test_msft)
X_train_scaled_tsla, X_test_scaled_tsla = scale_data(X_train_tsla, X_test_tsla)

In [32]:
model_aapl = fit_model(X_train_scaled_aapl, y_train_aapl)
model_btc = fit_model(X_train_scaled_btc, y_train_btc)
model_nflx = fit_model(X_train_scaled_nflx, y_train_nflx)
model_pfe = fit_model(X_train_scaled_pfe, y_train_pfe)
model_msft = fit_model(X_train_scaled_msft, y_train_msft)
model_tsla = fit_model(X_train_scaled_tsla, y_train_tsla)

In [33]:
prediction_aapl = make_prediction(model_aapl, X_test_scaled_aapl, y_test_aapl, "Apple")
prediction_btc = make_prediction(model_btc, X_test_scaled_btc, y_test_btc, "Bitcoin")
prediction_nflx = make_prediction(model_nflx, X_test_scaled_nflx, y_test_nflx, "Netflix")
prediction_pfe = make_prediction(model_pfe, X_test_scaled_pfe, y_test_pfe, "Pfizer")
prediction_msft = make_prediction(model_msft, X_test_scaled_msft, y_test_msft, "Microsoft")
prediction_tsla = make_prediction(model_tsla, X_test_scaled_tsla, y_test_tsla, "Tesla")

Accuracy Score : 0.4933920704845815 for Apple
Accuracy Score : 0.4933920704845815 for Bitcoin
Accuracy Score : 0.4933920704845815 for Netflix
Accuracy Score : 0.4933920704845815 for Pfizer
Accuracy Score : 0.4933920704845815 for Microsoft
Accuracy Score : 0.4933920704845815 for Tesla


In [34]:
generate_conf_matrix(y_test_aapl, prediction_aapl)
generate_conf_matrix(y_test_btc, prediction_btc)
generate_conf_matrix(y_test_nflx, prediction_nflx)
generate_conf_matrix(y_test_pfe, prediction_pfe)
generate_conf_matrix(y_test_msft, prediction_msft)
generate_conf_matrix(y_test_tsla, prediction_tsla)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,103
Actual 1,12,98


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,103
Actual 1,12,98


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,103
Actual 1,12,98


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,103
Actual 1,12,98


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,103
Actual 1,12,98


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,103
Actual 1,12,98
