In [None]:
import numpy as np 
import pandas as pd 

In [None]:
def read_goog_sp500_dataframe(data_dir=None):
    """Return period-over-period price changes for Google and the S&P 500.

    Reads ``GOOG.csv`` and ``SP_500.csv``, keeping column 0 (the date) and
    column 5 (the price; presumably the adjusted close -- confirm against the
    CSV layout) of each file. The two price series are paired by date, the
    rows are sorted chronologically and ``DataFrame.pct_change`` is applied.

    Args:
        data_dir: Directory that contains the two CSV files. When ``None``
            (the default) the original hard-coded location is used, so
            existing zero-argument callers behave as before.

    Returns:
        A DataFrame with the columns ``Goog`` and ``SP500`` holding the
        fractional change relative to the previous row. The first row is NaN
        because it has no predecessor.
    """
    import os

    default_dir = "E:\\Eskills-Academy-projects\\LearningTensorFlow\\data\\"
    if data_dir is None:
        # Plain concatenation keeps the historical default path byte-for-byte.
        goog_file = default_dir + "GOOG.csv"
        sp_file = default_dir + "SP_500.csv"
    else:
        goog_file = os.path.join(data_dir, "GOOG.csv")
        sp_file = os.path.join(data_dir, "SP_500.csv")

    goog = pd.read_csv(goog_file, sep=",", usecols=[0, 5], names=['Date', 'Goog'], header=0)
    sp = pd.read_csv(sp_file, sep=",", usecols=[0, 5], names=['Date', 'SP500'], header=0)

    # Convert the date strings to real dates in both frames so they can be
    # matched and sorted chronologically.
    goog['Date'] = pd.to_datetime(goog['Date'], format='%Y-%m-%d')
    sp['Date'] = pd.to_datetime(sp['Date'], format='%Y-%m-%d')

    # Pair the prices by date rather than by row position: a positional
    # assignment silently mismatches rows when the files are ordered
    # differently. A left merge keeps exactly the Google rows.
    prices = goog.merge(sp, on='Date', how='left')
    prices = prices.sort_values('Date')

    # Only the numeric price columns take part in the change computation;
    # the Date column is dropped here.
    returns = prices.select_dtypes(include=['float64', 'int64']).pct_change()

    return returns


In [None]:
def read_goog_sp500_data():
    """Provide the S&P 500 and Google return series as a pair of 1-D arrays.

    The result is the tuple ``(xData, yData)``: the S&P 500 returns come
    first and the Google returns second.
    """
    frame = read_goog_sp500_dataframe()

    # Row 0 has no return value (nothing precedes it), so it is skipped.
    sp500_returns = np.array(frame["SP500"])[1:]
    goog_returns = np.array(frame["Goog"])[1:]

    return (sp500_returns, goog_returns)


In [None]:
from sklearn import linear_model

# Load the two return series: xData holds the S&P 500 returns and yData the
# Google returns (that is the order read_goog_sp500_data returns them in).
xData, yData = read_goog_sp500_data()


In [None]:
# Set up a linear model that predicts the Google returns (yData) from the
# S&P 500 returns (xData). Both 1-D arrays are reshaped into single-column
# 2-D arrays before being handed to fit().
googModel = linear_model.LinearRegression()
googModel.fit(xData.reshape(-1,1), yData.reshape(-1,1))


In [None]:
# Show the fitted slope (coef_) and intercept (intercept_) of the linear model.
# Both are printed as arrays because the target was passed as a 2-D column.
print(googModel.coef_)
print(googModel.intercept_)


In [None]:
import matplotlib.pyplot as plt

# Scatter of the observed returns: xData (S&P 500) on the horizontal axis and
# yData (Google) on the vertical axis.
plt.scatter(xData, yData, alpha=0.5, s=60)

# Evaluate the fitted line on 100 evenly spaced points between the smallest
# and largest S&P 500 return, each scaled by `margin`.
# The arrays are deliberately not called x / y: those names are used for the
# TensorFlow placeholder and prediction further down, and re-running this cell
# must not overwrite them.
margin = 1.1
line_x = np.linspace(margin * xData.min(), margin * xData.max(), 100)
line_y = googModel.coef_.item() * line_x + googModel.intercept_.item()
plt.plot(line_x, line_y, color="k", lw=2.5)

plt.title("Google vs S&P 500 stock prices change in percentage")
# The horizontal axis carries the S&P 500 returns and the vertical axis the
# Google returns, so label them accordingly.
plt.xlabel("S&P 500 [percent change]")
plt.ylabel("Google [percent change]")
plt.show()

In [None]:
import tensorflow as tf 
# Switch to graph mode; the placeholders and sessions used below are part of
# the TensorFlow v1 compatibility API.
tf.compat.v1.disable_eager_execution()
# Model linear regression y = W*x + b
# W is a 1x1 weight matrix and b a one-element bias, both starting at zero.
W = tf.Variable(tf.zeros([1,1]), name="W")
b = tf.Variable(tf.zeros([1]), name="b")
# Input placeholder: any number of rows, one feature per row.
x = tf.compat.v1.placeholder(tf.float32, [None, 1], name="x")

In [None]:
# x has many rows and 1 column, W is a 1x1 matrix, so the matrix product
# Wx has one predicted value per input row.
Wx = tf.matmul(x, W)
# Prediction of the model: add the bias to every row.
y = Wx + b

In [None]:
y_ = tf.compat.v1.placeholder(tf.float32, [None, 1], name="y_")  # Actual value
# Cost is the mean of the squared differences between actual and predicted values.
cost = tf.reduce_mean(tf.square(y_ - y))
# Training op: one FTRL optimizer step (constructed with 1.0 as its first argument) that minimizes the cost.
train_step_ftrl = tf.compat.v1.train.FtrlOptimizer(1.0).minimize(cost)

# Total number of points for our x values
dataset_size = len(xData)

In [None]:
def trainWithMultiplePointsPerEpoch(steps, train_step, batch_size, log_every=500):
    """Train the single-feature regression graph in batches and log progress.

    The function reads several module-level names at call time: the data
    arrays ``xData`` / ``yData``, ``dataset_size``, the placeholders ``x`` and
    ``y_``, the variables ``W`` and ``b`` and the ``cost`` tensor.

    Args:
        steps: Number of training steps to run.
        train_step: TensorFlow op that is run once per step.
        batch_size: Number of consecutive data points fed per step. Must not
            exceed ``dataset_size``.
        log_every: Print W, b and the cost of the current batch every this
            many steps. A falsy value (0 or None) disables the logging.

    Raises:
        ValueError: If ``batch_size`` is larger than ``dataset_size``.
    """
    # The size check does not depend on the step, so it is done once up front
    # instead of on every iteration.
    if dataset_size < batch_size:
        raise ValueError("data set size: %d, must be greater than the batch_size: %d"%(dataset_size, batch_size))

    init = tf.compat.v1.global_variables_initializer()

    with tf.compat.v1.Session() as sess:
        sess.run(init)
        for step in range(steps):

            # A batch that covers the whole data set always starts at 0;
            # otherwise the start index walks through the data and wraps around.
            if dataset_size == batch_size:
                batch_start_idx = 0
            else:
                batch_start_idx = (step * batch_size) % dataset_size

            batch_end_idx = batch_start_idx + batch_size

            # Access the x and y values in batches. Slicing past the end of
            # the array simply yields a shorter final batch.
            batch_xs = xData[batch_start_idx : batch_end_idx]
            batch_ys = yData[batch_start_idx : batch_end_idx]

            # Reshape the 1-D arrays as 2-D feature vectors with many rows and 1 column
            feed = {x: batch_xs.reshape(-1, 1), y_: batch_ys.reshape(-1, 1)}
            sess.run(train_step, feed_dict=feed)

            # Print the result to screen every `log_every` iterations
            if log_every and (step + 1) % log_every == 0:
                w_val, b_val, cost_val = sess.run([W, b, cost], feed_dict=feed)
                # step is zero-based, so step + 1 iterations have completed.
                print("After %d iterations" % (step + 1))
                # W and b come back as one-element arrays; .item() extracts the
                # Python scalar instead of relying on implicit array-to-float
                # conversion, which NumPy has deprecated.
                print("W: %f" % w_val.item())
                print("b: %f" % b_val.item())
                print("cost: %f" % cost_val)
                


In [None]:
# Run 5000 training steps; batch_size equals dataset_size, so every step
# feeds the whole data set.
trainWithMultiplePointsPerEpoch(steps=5000, train_step=train_step_ftrl, batch_size=dataset_size)

In [None]:
def read_xom_oil_nasdaq_data(data_dir=None):
    """Load the return series for Nasdaq, oil (USO) and Exxon Mobil (XOM).

    Each of ``NASDAQ.csv``, ``USO.csv`` and ``XOM.csv`` is read independently:
    column 0 (date) and column 5 (price) are kept, the rows are sorted by
    date and the change relative to the previous row is computed.

    NOTE(review): the three series are not matched against each other by
    date; the files are presumably expected to cover the same dates -- confirm.

    Args:
        data_dir: Directory that contains the three CSV files. When ``None``
            (the default) the original hard-coded location is used, so
            existing zero-argument callers behave as before.

    Returns:
        A tuple ``(nasdaqData, oilData, xomData)`` of 1-D arrays, in that
        order, with the leading NaN row of each series removed.
    """
    import os

    default_dir = "E:\\Eskills-Academy-projects\\LearningTensorFlow\\data\\"

    def resolve(name):
        # Plain concatenation keeps the historical default path byte-for-byte.
        if data_dir is None:
            return default_dir + name
        return os.path.join(data_dir, name)

    def readFile(fileName):
        # Only read in the date and price at columns 0 and 5
        data = pd.read_csv(fileName, sep=",", usecols=[0, 5], names=["Date", "Price"], header=0)

        # Sort the data in ascending order of date so returns can be computed.
        # Date is left as text here, so the ordering is lexicographic; that
        # matches chronological order only for YYYY-MM-DD style dates --
        # TODO confirm the CSV date format.
        data = data.sort_values(["Date"], ascending=[True])

        # Exclude the date from the percentage change calculation
        returns = data.select_dtypes(include=['float64', 'int64']).pct_change()

        # Filter out the very first row which is NaN
        return np.array(returns['Price'])[1:]

    nasdaqData = readFile(resolve("NASDAQ.csv"))
    oilData = readFile(resolve("USO.csv"))
    xomData = readFile(resolve("XOM.csv"))

    return (nasdaqData, oilData, xomData)

In [None]:
import numpy as np
from sklearn import datasets, linear_model

In [None]:
# Load the three return series (Nasdaq, oil, Exxon Mobil).
nasdaqData, oilData, xomData = read_xom_oil_nasdaq_data()
# Stack the two feature series as rows and transpose, so that each row of
# `combined` is one sample with the columns (nasdaq, oil).
combined = np.vstack((nasdaqData, oilData)).T


In [None]:
# Fit a linear model that predicts the Exxon Mobil returns from the Nasdaq and
# oil returns, then print its score on the same data it was fitted on.
xomNasdaqOilModel = linear_model.LinearRegression()
xomNasdaqOilModel.fit(combined, xomData)
print("score:", xomNasdaqOilModel.score(combined, xomData))


In [None]:
# Fitted weights (one per column of `combined`: nasdaq, oil) and the intercept.
print("W:", xomNasdaqOilModel.coef_)
print("b:", xomNasdaqOilModel.intercept_)

In [None]:
# Model linear regression y = W1*x1 + W2*x2 + b
# NOTE: this cell rebinds b, y, y_ and cost, names that the single-feature
# cells and trainWithMultiplePointsPerEpoch also use.
# One 1x1 weight per feature and a one-element bias, all starting at zero.
nasdaq_W = tf.Variable(tf.zeros([1,1]), name="nasdaq_W")
oil_W = tf.Variable(tf.zeros([1,1]), name="oil_W")
b = tf.Variable(tf.zeros([1]), name="b")
# One single-column input placeholder per feature.
nasdaq_x = tf.compat.v1.placeholder(tf.float32, [None, 1], name="nasdaq_x")
oil_x = tf.compat.v1.placeholder(tf.float32, [None, 1], name="oil_x")

# Contribution of each feature to the prediction.
nasdaq_Wx = tf.matmul(nasdaq_x, nasdaq_W)
oil_Wx = tf.matmul(oil_x, oil_W)

# Prediction, placeholder for the actual values, and the mean squared
# difference between the two as the cost.
y = nasdaq_Wx + oil_Wx + b
y_ = tf.compat.v1.placeholder(tf.float32, [None, 1])
cost = tf.reduce_mean(tf.square(y_ - y))

In [None]:
# Training op for the two-feature cost; this rebinds train_step_ftrl.
train_step_ftrl = tf.compat.v1.train.FtrlOptimizer(1.0).minimize(cost)

# Reshape every 1-D series into a single-column 2-D array, the shape the
# placeholders are declared with.
all_x_nasdaq = nasdaqData.reshape(-1,1)
all_x_oil = oilData.reshape(-1,1)
all_ys = xomData.reshape(-1,1)

In [None]:
# Number of data points in the two-feature data set; this rebinds the
# dataset_size that the training functions read at call time.
dataset_size = len(oilData)

In [None]:
def trainWithMultiplePointsPerEpoch2(steps, train_step, batch_size, log_every=500):
    """Train the two-feature regression graph in batches and log progress.

    The function reads several module-level names at call time: the
    single-column arrays ``all_x_nasdaq`` / ``all_x_oil`` / ``all_ys``,
    ``dataset_size``, the placeholders ``nasdaq_x`` / ``oil_x`` / ``y_``, the
    variables ``nasdaq_W`` / ``oil_W`` / ``b`` and the ``cost`` tensor.

    Args:
        steps: Number of training steps to run.
        train_step: TensorFlow op that is run once per step.
        batch_size: Number of consecutive data points fed per step. Must not
            exceed ``dataset_size``.
        log_every: Print the weights, bias and the cost of the current batch
            every this many steps. A falsy value (0 or None) disables the
            logging.

    Raises:
        ValueError: If ``batch_size`` is larger than ``dataset_size``.
    """
    # The size check does not depend on the step, so it is done once up front
    # instead of on every iteration.
    if dataset_size < batch_size:
        raise ValueError("data set size: %d, must be greater than the batch_size: %d"%(dataset_size, batch_size))

    init = tf.compat.v1.global_variables_initializer()

    with tf.compat.v1.Session() as sess:
        sess.run(init)
        for epoch in range(steps):

            # A batch that covers the whole data set always starts at 0;
            # otherwise the start index walks through the data and wraps around.
            if dataset_size == batch_size:
                batch_start_idx = 0
            else:
                batch_start_idx = (epoch * batch_size) % dataset_size

            batch_end_idx = batch_start_idx + batch_size

            # Access the x and y values in batches. Slicing past the end of
            # the array simply yields a shorter final batch.
            batch_x_nasdaq = all_x_nasdaq[batch_start_idx : batch_end_idx]
            batch_x_oil = all_x_oil[batch_start_idx : batch_end_idx]
            batch_ys = all_ys[batch_start_idx : batch_end_idx]

            # The all_* arrays are already single-column 2-D arrays, so the
            # slices can be fed to the placeholders as they are.
            feed = {nasdaq_x: batch_x_nasdaq, oil_x: batch_x_oil, y_: batch_ys}
            sess.run(train_step, feed_dict=feed)

            # Print the result to screen every `log_every` iterations
            if log_every and (epoch + 1) % log_every == 0:
                w1_val, w2_val, b_val, cost_val = sess.run(
                    [nasdaq_W, oil_W, b, cost], feed_dict=feed)
                # epoch is zero-based, so epoch + 1 iterations have completed.
                print("After %d iterations" % (epoch + 1))
                # The variables come back as one-element arrays; .item()
                # extracts the Python scalar instead of relying on implicit
                # array-to-float conversion, which NumPy has deprecated.
                print("W1: %f" % w1_val.item())
                print("W2: %f" % w2_val.item())
                print("b: %f" % b_val.item())
                print("cost: %f" % cost_val)

In [None]:
# Run 5000 training steps with a batch size of len(oilData), the same value
# dataset_size was set to, so every step feeds the whole data set.
trainWithMultiplePointsPerEpoch2(5000, train_step_ftrl, len(oilData))