In [2]:
#  Importing libraries
from __future__ import print_function
from pprint import pprint
from json import dumps
import time, json
from datetime import datetime
# for API requests
import requests

In [3]:
#  For usability: Bulk-Downloads are only available once a minute
key1 = "OjMxMTlhMWRjMDQ5NDUxZTRhNjg0NDM5M2UwYzc3Njdk"
key2 = "OjIwMWQ4MWU5ZGE5ZDcxMjRlY2NjOWE0N2ZlODI4Yzlj"


#  API request function
def get_hist(identifier, pages, key, stdate, enddate):
    url = "https://api-v2.intrinio.com/securities/"  + str(identifier) + "/prices?start_date=" + str(stdate) + "-01&end_date=" + str(enddate) + "&frequency=daily&page_size=" + str(pages) + "&api_key=" + str(key)
    resp = requests.get(url).json()
    #resp_json = json.loads(resp, encoding="utf-8")
    return resp

In [4]:
aapl1980 = get_hist("AAPL", 10000, key2,"1980-01-01", "2020-01-31")

In [5]:
aapl1980["stock_prices"]

[{'date': '2020-01-31',
  'intraperiod': False,
  'frequency': 'daily',
  'open': 320.93,
  'high': 322.68,
  'low': 308.29,
  'close': 309.51,
  'volume': 49897096.0,
  'adj_open': 320.170133759724,
  'adj_high': 321.915990283202,
  'adj_low': 307.560061498724,
  'adj_close': 308.777172903662,
  'adj_volume': 49897096.0},
 {'date': '2020-01-30',
  'intraperiod': False,
  'frequency': 'daily',
  'open': 320.5435,
  'high': 324.09,
  'low': 318.75,
  'close': 323.87,
  'volume': 31685808.0,
  'adj_open': 319.784548876111,
  'adj_high': 323.322651824975,
  'adj_low': 317.995295347622,
  'adj_close': 323.103172719166,
  'adj_volume': 31685808.0},
 {'date': '2020-01-29',
  'intraperiod': False,
  'frequency': 'daily',
  'open': 324.45,
  'high': 327.85,
  'low': 321.38,
  'close': 324.34,
  'volume': 54149928.0,
  'adj_open': 323.681799452661,
  'adj_high': 327.073749269703,
  'adj_low': 320.619068294333,
  'adj_close': 323.572059899757,
  'adj_volume': 54149928.0},
 {'date': '2020-01-28',

In [6]:
#  Exporting history data
with open('data1.txt', 'w') as outfile:
    json.dump(aapl1980["stock_prices"], outfile)

In [7]:
import os, sys
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 --jars /usr/local/jar_files/spark-streaming-kafka-0-8-assembly_2.11-2.3.2.jar pyspark-shell'

In [8]:
# Import findspark 
import findspark

# Initialize and provide path
findspark.init("/usr/share/spark/spark-2.3.2-bin-hadoop2.7/")

# Or use this alternative
findspark.init()

In [9]:
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    json parsing
import json
#    Import pyspark functions
import pyspark.sql.functions as functions

In [10]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build the SparkSession
spark = SparkSession.builder \
   .master("local") \
   .appName("LinearRegressionModel") \
   .getOrCreate()

# Create SparkContext    
sc = spark.sparkContext
path = "./data1.txt"
# Load JSON to SparkDataframe
data = spark.read.json(path)

In [11]:
##  Taking a first look into the data
# Data types
data.printSchema()
data.show()
#dropping columns not providing any additional information
data = data.drop("date", "intraperiod", "frequency", "close", "low", "high", "open", "volume")

root
 |-- adj_close: double (nullable = true)
 |-- adj_high: double (nullable = true)
 |-- adj_low: double (nullable = true)
 |-- adj_open: double (nullable = true)
 |-- adj_volume: double (nullable = true)
 |-- close: double (nullable = true)
 |-- date: string (nullable = true)
 |-- frequency: string (nullable = true)
 |-- high: double (nullable = true)
 |-- intraperiod: boolean (nullable = true)
 |-- low: double (nullable = true)
 |-- open: double (nullable = true)
 |-- volume: double (nullable = true)

+----------------+----------------+----------------+----------------+-----------+------+----------+---------+--------+-----------+--------+--------+-----------+
|       adj_close|        adj_high|         adj_low|        adj_open| adj_volume| close|      date|frequency|    high|intraperiod|     low|    open|     volume|
+----------------+----------------+----------------+----------------+-----------+------+----------+---------+--------+-----------+--------+--------+-----------+
|308.7

In [12]:
## determine categorical and numerical columns
categorical_cols = [item[0] for item in data.dtypes if item[1].startswith('string')]
print(categorical_cols)

numerical_cols = [item[0] for item in data.dtypes if item[1].startswith('int') | item[1].startswith('double')][:-1]
print(numerical_cols)

print("The data consists of " + str(len(categorical_cols)) + ' categorical features')
print("The data consists of " + str(len(numerical_cols)) + ' numerical features')

[]
['adj_close', 'adj_high', 'adj_low', 'adj_open']
The data consists of 0 categorical features
The data consists of 4 numerical features


In [13]:
# Import `DenseVector`
from pyspark.ml.linalg import DenseVector

# Define the `input_data` 
input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Creating new dataframe with label (adj_close) and feature-vector
df = spark.createDataFrame(input_data, ["label", "features"])

In [14]:
#  Import StandardScaler 
from pyspark.ml.feature import StandardScaler

#  Initialize the standardScaler
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

#  Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

#  Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)

In [15]:
# Split the data into train and test sets
train_data, test_data = scaled_df.randomSplit([.8,.2],seed=1)

# Import LinearRegression from pyspark.ml.regression
from pyspark.ml.regression import LinearRegression

# train the Linear Regression Model 
lr = LinearRegression(labelCol="label", maxIter=10000, regParam=0.01, elasticNetParam=0.4)

# Fit the data to the model
lrModel = lr.fit(train_data)

In [16]:
# Generate predictions
predicted = lrModel.transform(test_data)

# Extract the predictions and the "known" correct labels 
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()

# Print out first 5 instances of `predictionAndLabel` 
predictionAndLabel[:5]

[(0.16854485387290377, 0.164995414131856),
 (0.18166298394863112, 0.177338149324867),
 (0.1863872095806624, 0.182587358544883),
 (0.18733277137446325, 0.182587358544883),
 (0.18984055502822214, 0.18627599205084)]

In [21]:
#  Insert individual accuracy
for i in range(0, len(predictionAndLabel)):
    predictionAndLabel[i] = predictionAndLabel[i] + (1-abs((predictionAndLabel[i][1]-predictionAndLabel[i][0])/predictionAndLabel[i][0]),)
#  Calculate average accuracy
import statistics as stat
sumAcc = []
x = 0
for i in range(0, len(predictionAndLabel)):
    x = x + predictionAndLabel[i][2]
avgAcc = x / len(predictionAndLabel)
print("Assuming the lowest and highest price of a trading day is already reached, the model predicts \
the closing price of this day with an accuracy of " + str(round(avgAcc*100,4)) + "%")

Assuming the lowest and highest price of a trading day is already reached, the model predicts the closing price of this day with an accuracy of 98.9273%


In [18]:
# Taking a look into the model parameters
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set
trainingSummary = lrModel.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
print("numIterations: %d" % trainingSummary.totalIterations)
trainingSummary.residuals.show()
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))

Coefficients: [0.4156679726541322,0.4880211808552577,0.09702150558981235,0.0]
Intercept: 0.0027245318742287997
RMSE: 0.435191
r2: 0.999935
numIterations: 945
+--------------------+
|           residuals|
+--------------------+
|-0.00370826187854...|
|-0.00360578941128...|
|-0.00360710009176...|
|-0.00378603847992...|
|-0.00379107955870...|
|-0.00372086457549...|
|-0.00361587156884...|
|-0.00361587156884...|
|-0.00379480995700...|
|-0.00372469579536...|
|-0.00356073175751...|
|-0.00356073175751...|
|-0.00372590565427...|
|-0.00372590565427...|
|-0.00362091264762...|
|-0.00362091264762...|
| -0.0037998510357794|
|-0.00380116171626...|
|-0.00380116171626...|
|-0.00356456297738...|
+--------------------+
only showing top 20 rows

objectiveHistory: [0.5, 0.3342325795143886, 0.08416331582105427, 0.0003713417345797951, 0.00024413028553591138, 0.00013593909855555064, 0.00013439615767872696, 0.0001343955693783578, 0.00013439124376068997, 0.00013439066806800165, 0.00013438634537057788, 0.0001343

In [19]:
#pip install mlflow

In [20]:
# Saving the model
import mlflow.spark
mlflow.spark.save_model(lrModel, "./model")