In [1]:
import pandas as pd 
import numpy as np 
from path import Path 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
def read_csv(csv_name):
    #Read TSLA.csv contains open, high, low, close, Adj close, Volume of Apple stock with twitter polarity scores and twitter volumes
    file_path = Path(csv_name)
    df = pd.read_csv(file_path)
    #Drop null values
    df.dropna(inplace=True)
    
    return df

In [3]:
def adjusted_df(df):
    #Dataframe with Adj close, ts_polarity, twitter_volume
    df = df[["Date", "Close", "Volume", "compound", "Count"]]
    
    #pct change based on Adj close value
    df["Pct_change"] = df["Close"].pct_change()
    
    df.set_index("Date", inplace=True)

    #Drop null values 
    df.dropna(inplace=True)
    df.head()
    
    return df

In [4]:
#Sorting compound into Positive, Negative and Neutral sentiment
def assign_sentiments(df):
    sentiment = []
    for score in df["compound"]: 
        if score >= 0.05:
            sentiment.append("Positive")
        elif score <= -0.05: 
            sentiment.append("Negative")
        else: 
            sentiment.append("Neutral")

    df["Sentiment"] = sentiment
    df.head()
    
    return df

In [5]:
def sentiment_count(df):
    df["Sentiment"].value_counts()
    
    return df

In [6]:
#Stock Trend based on difference between current price to previous day price and converting them to "0" as fall and "1"
#as rise in stock price
def get_stock_trend(df):
    
    df["Price Difference"] = df["Close"].diff()
    df.dropna(inplace = True)
    df["Trend"] = np.where(df["Price Difference"] > 0, 1, 0)
    
    df.drop("compound", axis=1, inplace=True)
    df.head()
    
    return df

In [7]:
#Binary encoding Sentiment column
def binary_encoding(df):   
    trend = df[["Close", "Volume", "Count", "Sentiment", "Trend"]]
    trend = pd.get_dummies(trend, columns=["Sentiment"])
    trend.head()
    
    return trend

In [8]:
#Defining features set
def define_feature(trend):   
    X = trend.copy()
    X.drop("Trend", axis=1, inplace=True)
    X.head()
    
    return X

In [9]:
#Defining target vector
def define_target(trend):
    y = trend["Trend"].values.reshape(-1,1)
    y[:5]
    return y

In [10]:
#Splitting into Train and Test data
def split_data(X, y):    
    split = int(0.7 * len(X))

    X_train = X[: split]
    X_test = X[split:]

    y_train = y[: split]
    y_test = y[split:]
    
    return X_train, X_test, y_train, y_test

In [11]:
#Using StandardScaler to scale features data
def scale_feature(X_train, X_test):  
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled

In [12]:
def fit_model(X_train_scaled, y_train):   
    
    #Create RFClassifier model
    rf_model = RandomForestClassifier(n_estimators = 500, random_state=78)

    #Fit the model
    rf_model = rf_model.fit(X_train_scaled, y_train.ravel())
    
    return rf_model

In [13]:
#Make predictions 
def make_prediction(rf_model, X_test_scaled):
    predictions = rf_model.predict(X_test_scaled)
    pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()}).head(20)

    #Generate accuracy score for predictions using y_test
    acc_score = accuracy_score(y_test, predictions)
    print(f"Accuracy Score: {acc_score}")
    
    return predictions, acc_score

In [14]:
#Generating the confusion matrix
def generate_confusion_matrix(y_test, predictions):
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"],
        columns=["Predicted 0", "Predicted 1"]
    )

    #Displaying results
    display(cm_df)

In [15]:
#Generating classification report 
def generate_report(y_test, predictions):  
    print("Classification Report")
    print(classification_report(y_test, predictions))

In [16]:
aapl_df = read_csv("../input_data/aapl_input.csv")
btc_df = read_csv("../input_data/btc_input.csv")
jnj_df = read_csv("../input_data/jnj_input.csv")
msft_df = read_csv("../input_data/msft_input.csv")
nflx_df = read_csv("../input_data/nflx_input.csv")
pfe_df = read_csv("../input_data/pfe_input.csv")
tsla_df = read_csv("../input_data/tsla_input.csv")

In [17]:
aapl_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,compound,Count
0,2019-10-28,61.855,62.3125,61.68,62.262501,60.91547,96572800.0,0.196256,443
1,2019-10-29,62.2425,62.4375,60.642502,60.822498,59.506626,142839600.0,0.235016,449
2,2019-10-30,61.189999,61.325001,60.302502,60.814999,59.499287,124522000.0,0.204393,404
3,2019-10-31,61.810001,62.2925,59.314999,62.189999,60.84454,139162000.0,0.146388,435
4,2019-11-01,62.384998,63.982498,62.290001,63.955002,62.571358,151125200.0,0.162248,430


In [18]:
aapl_df = adjusted_df(aapl_df)
btc_df = adjusted_df(btc_df)
jnj_df = adjusted_df(jnj_df)
msft_df = adjusted_df(msft_df)
nflx_df = adjusted_df(nflx_df)
pfe_df = adjusted_df(pfe_df)
tsla_df = adjusted_df(tsla_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [19]:
aapl_df.head()

Unnamed: 0_level_0,Close,Volume,compound,Count,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-29,60.822498,142839600.0,0.235016,449,-0.023128
2019-10-30,60.814999,124522000.0,0.204393,404,-0.000123
2019-10-31,62.189999,139162000.0,0.146388,435,0.02261
2019-11-01,63.955002,151125200.0,0.162248,430,0.028381
2019-11-04,64.375,103272000.0,0.18278,443,0.006567


In [20]:
aapl_df = assign_sentiments(aapl_df)
btc_df = assign_sentiments(btc_df)
jnj_df = assign_sentiments(jnj_df)
msft_df = assign_sentiments(msft_df)
nflx_df = assign_sentiments(nflx_df)
pfe_df = assign_sentiments(pfe_df)
tsla_df = assign_sentiments(tsla_df)

In [21]:
aapl_df.head()

Unnamed: 0_level_0,Close,Volume,compound,Count,Pct_change,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-10-29,60.822498,142839600.0,0.235016,449,-0.023128,Positive
2019-10-30,60.814999,124522000.0,0.204393,404,-0.000123,Positive
2019-10-31,62.189999,139162000.0,0.146388,435,0.02261,Positive
2019-11-01,63.955002,151125200.0,0.162248,430,0.028381,Positive
2019-11-04,64.375,103272000.0,0.18278,443,0.006567,Positive


In [22]:
aapl_df = get_stock_trend(aapl_df)
btc_df = get_stock_trend(btc_df)
jnj_df = get_stock_trend(jnj_df)
msft_df = get_stock_trend(msft_df)
nflx_df = get_stock_trend(nflx_df)
pfe_df = get_stock_trend(pfe_df)
tsla_df = get_stock_trend(tsla_df)

In [23]:
aapl_df.head()

Unnamed: 0_level_0,Close,Volume,Count,Pct_change,Sentiment,Price Difference,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-10-30,60.814999,124522000.0,404,-0.000123,Positive,-0.0075,0
2019-10-31,62.189999,139162000.0,435,0.02261,Positive,1.375,1
2019-11-01,63.955002,151125200.0,430,0.028381,Positive,1.765003,1
2019-11-04,64.375,103272000.0,443,0.006567,Positive,0.419998,1
2019-11-05,64.282501,79897600.0,433,-0.001437,Positive,-0.092499,0


In [24]:
trend_aapl = binary_encoding(aapl_df)
trend_btc = binary_encoding(btc_df)
trend_jnj = binary_encoding(jnj_df)
trend_msft = binary_encoding(msft_df)
trend_nflx = binary_encoding(nflx_df)
trend_pfe = binary_encoding(pfe_df)
trend_tsla = binary_encoding(tsla_df)

In [25]:
trend_aapl.head()

Unnamed: 0_level_0,Close,Volume,Count,Trend,Sentiment_Negative,Sentiment_Neutral,Sentiment_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-10-30,60.814999,124522000.0,404,0,0,0,1
2019-10-31,62.189999,139162000.0,435,1,0,0,1
2019-11-01,63.955002,151125200.0,430,1,0,0,1
2019-11-04,64.375,103272000.0,443,1,0,0,1
2019-11-05,64.282501,79897600.0,433,0,0,0,1


In [26]:
X_aapl = define_feature(trend_aapl)
X_btc = define_feature(trend_btc)
X_jnj = define_feature(trend_jnj)
X_msft = define_feature(trend_msft)
X_nflx = define_feature(trend_nflx)
X_pfe = define_feature(trend_pfe)
X_tsla = define_feature(trend_tsla)

In [27]:
X_aapl.head()

Unnamed: 0_level_0,Close,Volume,Count,Sentiment_Negative,Sentiment_Neutral,Sentiment_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-10-30,60.814999,124522000.0,404,0,0,1
2019-10-31,62.189999,139162000.0,435,0,0,1
2019-11-01,63.955002,151125200.0,430,0,0,1
2019-11-04,64.375,103272000.0,443,0,0,1
2019-11-05,64.282501,79897600.0,433,0,0,1


In [28]:
y_aapl = define_target(trend_aapl)
y_btc = define_target(trend_btc)
y_jnj = define_target(trend_jnj)
y_msft = define_target(trend_msft)
y_nflx = define_target(trend_nflx)
y_pfe = define_target(trend_pfe)
y_tsla = define_target(trend_tsla)

In [29]:
X_train_aapl, X_test_aapl, y_train_aapl, y_test_aapl = split_data(X_aapl, y_aapl)
X_train_btc, X_test_btc, y_train_btc, y_test_btc = split_data(X_btc, y_btc)
X_train_jnj, X_test_jnj, y_train_jnj, y_test_jnj = split_data(X_jnj, y_jnj)
X_train_msft, X_test_msft, y_train_msft, y_test_msft = split_data(X_msft, y_msft)
X_train_nflx, X_test_nflx, y_train_nflx, y_test_nflx = split_data(X_nflx, y_nflx)
X_train_pfe, X_test_pfe, y_train_pfe, y_test_pfe = split_data(X_pfe, y_pfe)
X_train_tsla, X_test_tsla, y_train_tsla, y_test_tsla = split_data(X_tsla, y_tsla)

In [30]:
X_train_aapl

Unnamed: 0_level_0,Close,Volume,Count,Sentiment_Negative,Sentiment_Neutral,Sentiment_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-10-30,60.814999,124522000.0,404,0,0,1
2019-10-31,62.189999,139162000.0,435,0,0,1
2019-11-01,63.955002,151125200.0,430,0,0,1
2019-11-04,64.375000,103272000.0,443,0,0,1
2019-11-05,64.282501,79897600.0,433,0,0,1
...,...,...,...,...,...,...
2021-11-24,161.940002,69463600.0,339,0,0,1
2021-11-26,156.809998,76959800.0,385,0,0,1
2021-11-29,160.240005,88748200.0,399,0,0,1
2021-11-30,165.300003,174048100.0,404,0,0,1


In [31]:
y_train_aapl.ravel()

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,

In [33]:
X_train_scaled_aapl = scale_feature(X_train_aapl, X_test_aapl)
X_train_scaled_btc = scale_feature(X_train_btc, X_test_btc)
X_train_scaled_jnj = scale_feature(X_train_jnj, X_test_jnj)
X_train_scaled_msft = scale_feature(X_train_msft, X_test_msft)
X_train_scaled_pfe = scale_feature(X_train_pfe, X_test_pfe)
X_train_scaled_tsla = scale_feature(X_train_tsla, X_test_tsla)

In [34]:
X_train_scaled_aapl

(array([[-1.72111866,  0.02411223,  0.4656897 , -0.06172134, -0.15264656,
          0.16519821],
        [-1.67464611,  0.25842738,  1.16902451, -0.06172134, -0.15264656,
          0.16519821],
        [-1.61499213,  0.44989999,  1.05558341, -0.06172134, -0.15264656,
          0.16519821],
        ...,
        [ 1.63927003, -0.54845222,  0.3522486 , -0.06172134, -0.15264656,
          0.16519821],
        [ 1.81028896,  0.81678407,  0.4656897 , -0.06172134, -0.15264656,
          0.16519821],
        [ 1.79237594,  0.46474156,  0.01192531, -0.06172134, -0.15264656,
          0.16519821]]),
 array([[ 1.75823941,  0.21965014,  0.55644258, -0.06172134, -0.15264656,
          0.16519821],
        [ 1.69334688, -0.07990354, -0.12420401, -0.06172134, -0.15264656,
          0.16519821],
        [ 1.81096507, -0.24837517,  0.3522486 , -0.06172134, -0.15264656,
          0.16519821],
        ...,
        [ 1.2745869 , -0.75277856, -0.35108621, -0.06172134, -0.15264656,
          0.16519821],
  

In [35]:
model_aapl = fit_model(X_train_scaled_aapl, y_train_aapl)
model_btc = fit_model(X_train_scaled_btc, y_train_btc)
model_jnj = fit_model(X_train_scaled_jnj, y_train_jnj)
model_msft = fit_model(X_train_scaled_msft, y_train_msft)
model_pfe = fit_model(X_train_scaled_pfe, y_train_pfe)
model_tsla = fit_model(X_train_scaled_tsla, y_train_tsla)

ValueError: setting an array element with a sequence.