In [1]:
"""
    Data visualization and multiple linear regression
    Copyright (c) 2023, natdosan
    Author: [natdosan]
    Date Created: 210224
    Last updated: 230305
"""

import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import plotly.express as px
import matplotlib.pyplot as pyplot
import pickle
from matplotlib import style

In [2]:
data = pd.read_csv("../../datasets/formatted.csv", sep=",")

# what you want to predict
predict = "Total"

# return new data frame without our prediction value as training data
# design matrix
X = data.drop(columns=[predict, 'ID'], axis=1)
# prediction vector
y = data[predict]

In [3]:
# find the best model
best = 0
accuracies = [0]
accuracy_threshold = 8

n = 1000
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    multiple_model = LinearRegression()
    multiple_model.fit(X_train, y_train)

    # get metrics
    y_pred = multiple_model.predict(X_test)
    r_squared = multiple_model.score(X_test, y_pred)
    coeffs = multiple_model.coef_
    print('Coefficients: ', coeffs)

    # accuracy 
    accuracy = multiple_model.score(X_test, y_test)
    print('Accuracy: ', accuracy)
    accuracies.append(accuracy)

    if accuracy > best:
        best = accuracy
    if best > accuracy_threshold:
        with open("multiple_model.pickle", "wb") as file:
            pickle.dump(multiple_model, file)
        break

Coefficients:  [ 0.29648223 -0.11358351  0.74484109 -0.77133385 -0.19669288]
Accuracy:  0.48440786873794417
Coefficients:  [ 0.13083586  0.06410792  0.60640074 -0.92650597 -0.114807  ]
Accuracy:  0.48775271789023045
Coefficients:  [ 0.1352023  -0.11712836  0.71354371 -0.87224049 -0.31342696]
Accuracy:  0.22552836657085074
Coefficients:  [ 0.31677791 -0.37885664  0.99465139 -0.54235015 -0.4543112 ]
Accuracy:  0.19342890537944646
Coefficients:  [ 0.18147078 -0.164805    0.85316567 -0.70256606 -0.54626087]
Accuracy:  0.34943789193314134
Coefficients:  [ 0.15562133 -0.00444437  0.78310813 -0.5533815  -0.15556676]
Accuracy:  0.42936782630212
Coefficients:  [ 0.01583764 -0.09984279  0.7985565  -0.78726532 -0.18744307]
Accuracy:  0.42864185184265746
Coefficients:  [ 0.27685453 -0.00212803  0.56680037 -1.15417117 -0.21390829]
Accuracy:  0.3071184279264677
Coefficients:  [ 0.45543559  0.02282679  0.71009017 -1.02147204 -0.36172833]
Accuracy:  -0.4027882412788808
Coefficients:  [ 0.13335133 -0.1

In [6]:
average = sum(accuracies) / len(accuracies) - 1

print('Best accuracy: ', best)
print('Average accuracy: ', average)

Best accuracy:  0.9114326222594271
Average accuracy:  -0.6474602942480983


In [8]:
pickle_in = open("../../models/multiple_model.pickle", "rb")
linear = pickle.load(pickle_in)

print("Coefficients of ", X.shape[1], " dimensions:\n", linear.coef_)
print("Intercept: \n", linear.intercept_)

# predictions
predictions = linear.predict(X_test)

Coefficients of  5  dimensions:
 [ 0.20742586 -0.2484355   0.82525031 -1.25729598 -0.78618164]
Intercept: 
 2.1858080886286526
