In [None]:
# All imported libraries are listed here for readability

import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
from zlib import crc32


# Data pipeline, scaling, normalizing, etc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Transforming & Manipulating data
from sklearn.compose import ColumnTransformer


# The Linear Regression model
from sklearn.linear_model import LinearRegression



In [None]:
# Parameters for data retrieval: the instrument to query and the start/end
# timestamps of the requested time window
INSTRUMENT_ID: str = "1"
START: str = '2021-01-20T00:00'
END: str = '2021-01-21T00:00'


In [None]:
def load_olo_data(instrument_id, start, end, header_rows=18,
                  base_url='http://tzvolcano.chordsrt.com/api/v1/data'):
    """Download one instrument's measurements as a pandas DataFrame.

    Args:
        instrument_id: Identifier of the instrument; becomes the file name
            part of the request URL (``<base_url>/<instrument_id>.csv``).
        start: Start of the requested time window, sent as the ``start``
            query parameter.
        end: End of the requested time window, sent as the ``end`` query
            parameter.
        header_rows: Value forwarded to ``pd.read_csv(header=...)``, i.e. the
            row that holds the column names. Defaults to 18, the value this
            notebook has always used.
        base_url: Endpoint prefix the instrument id is appended to.

    Returns:
        The DataFrame parsed from the CSV response, with ``Time`` passed to
        ``parse_dates``.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote

    # Percent-encode every URL component. ':' is kept literal so ordinary
    # timestamps yield exactly the same URL as before, while characters such
    # as '+' (time-zone offsets) or spaces no longer corrupt the query string.
    path_id = quote(str(instrument_id), safe='')
    start_param = quote(str(start), safe=':')
    end_param = quote(str(end), safe=':')
    url = f'{base_url}/{path_id}.csv?start={start_param}&end={end_param}'
    print(url)

    return pd.read_csv(url, parse_dates=['Time'], header=header_rows)


# original_data.info()
# original_data.describe()
# original_data.head()
    


In [None]:
# Fetch the measurements for the configured instrument and time window
original_data = load_olo_data(instrument_id=INSTRUMENT_ID, start=START, end=END)

In [None]:
# Calculate the seconds_since_epoch variable.
# Casting a datetime column straight to int64 exposes its internal tick count
# (nanoseconds for datetime64[ns]) rather than seconds, so derive whole
# seconds explicitly from the offset to the Unix epoch.
time_utc = pd.to_datetime(original_data["Time"], utc=True)
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
original_data["seconds_since_epoch"] = (time_utc - unix_epoch) // pd.Timedelta(seconds=1)

# Show all the keys (variable names) in the loaded data set
original_data.keys()

In [None]:
# Draw a 50-bin histogram for each column on one 20x15 figure
original_data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Create scatter plots for Longitude, Latitude & Height.
# Scatter plots are useful to identify patterns between the variables
# and to see which ones will be useful for machine learning.
# A single loop replaces three copy-pasted calls, and every figure gets a
# title so it can be read on its own.
scatter_pairs = [
    ("Longitude", "Latitude"),
    ("Height", "Longitude"),
    ("Height", "Latitude"),
]
for x_col, y_col in scatter_pairs:
    original_data.plot(kind="scatter", x=x_col, y=y_col, alpha=0.1,
                       title=f"{y_col} vs. {x_col}")
plt.show()

In [None]:

# Plot each position component against time. One loop replaces three
# near-identical calls, and each figure is titled.
for column in ("Height", "Longitude", "Latitude"):
    original_data.plot(x="seconds_since_epoch", y=column,
                       title=f"{column} over time")
plt.show()

In [None]:
# Show correlations between the numeric variables.
# The frame still holds the datetime "Time" column, and pandas >= 2.0 no
# longer drops non-numeric columns silently inside corr(), so restrict the
# frame to numeric columns explicitly (works on older pandas as well).
original_data.select_dtypes(include="number").corr()

In [None]:
# Show an overview of the data: DataFrame.info() prints a summary of the
# columns (names, non-null counts, dtypes) and the frame's memory usage
original_data.info()

In [None]:
# List the column labels of the loaded data (now including seconds_since_epoch)
original_data.columns

In [None]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (a pandas DataFrame in this notebook).
        test_ratio: Fraction of the rows, between 0 and 1, that goes into the
            test part; the resulting row count is truncated to an integer.
        seed: Seed for the shuffle. The default of 42 reproduces the split
            this function has always produced.

    Returns:
        Tuple ``(train, test)`` of row subsets of ``data``.

    Raises:
        ValueError: If ``test_ratio`` is outside the range [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A private generator yields the same permutation as seeding the global
    # one, but does not reset NumPy's global random state for every other
    # cell as a side effect.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    return (
        data.iloc[shuffled_indices[test_set_size:]],
        data.iloc[shuffled_indices[:test_set_size]],
    )

# Split the original data into two sets: one for training and one for testing
# the generated model. test_ratio=0.2 means 20% of the total data set becomes
# test data.
train_data, test_data = split_train_test(original_data, test_ratio=0.2)

# Show how many entries are in the total, training and test sets (just for confirmation)
for subset in (original_data, train_data, test_data):
    print(len(subset))

In [None]:
# List the column labels of the training set
train_data.columns

In [None]:
# Remove the "Time" variable, as it is not useful for the model.
# Start from train_data rather than original_data: otherwise the train/test
# split made earlier is never used, and the held-out test rows leak into
# everything that is fitted on this frame.
numerical_data = train_data.drop(columns=["Time"])

# Print out the remaining variable names, just to see we did remove "Time"
numerical_data.keys()

In [None]:
# Pipeline that cleans numerical data: fill missing values with the column
# median, then apply StandardScaler
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Smoke-test the pipeline by fitting and transforming the numerical data
original_transformed = num_pipeline.fit_transform(numerical_data)

In [None]:
# Define the pipeline for ALL the data
# (right now we only have numerical data, but this is not always the case)

# Every column of numerical_data is routed through the numerical pipeline
num_attributes = numerical_data.columns.tolist()

full_pipeline = ColumnTransformer(
    transformers=[("num", num_pipeline, num_attributes)],
)

# Prepare the full set of training data
training_prepared = full_pipeline.fit_transform(numerical_data)

In [None]:
# Build and train the Linear Regression Model
lin_reg = LinearRegression()
# NOTE(review): the same array is passed as both the features and the targets,
# so the model is fitted to reproduce its own (pipeline-transformed) inputs.
# Presumably a separate set of target columns was intended — TODO confirm
# which variable(s) should be predicted and split features from labels.
lin_reg.fit(training_prepared,training_prepared)

In [None]:
# Run a few predictions
some_data = numerical_data.iloc[:5]
# some_labels = numerical_data.iloc[:5].drop('Latitude', axis=1).drop('Longitude', axis=1).drop('Height', axis=1)
some_labels = numerical_data.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions", lin_reg.predict(some_data_prepared))
# list(<DataFrame>) yields only the column names, so print the label values
# instead. They go through the same pipeline because the model was fitted on
# pipeline-transformed targets; raw values would not be comparable with the
# predictions above.
print("Labels", full_pipeline.transform(some_labels))