In [11]:
"""
    Data visualization and multiple linear regression
    Copyright (c) 2023, natdosan
    Author: [natdosan]
    Date Created: 210224
    Last updated: 230305
"""

import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import plotly.express as px
import matplotlib.pyplot as pyplot
import pickle
from matplotlib import style

In [12]:
data = pd.read_csv("../../datasets/formatted.csv", sep=",")

# what you want to predict
predict = "Total"

# return new data frame without our prediction value as training data
# design matrix
X = data.drop(columns=[predict, 'ID'], axis=1)
# prediction vector
y = data[predict]

In [13]:
# find the best model
best = 0
accuracies = [0]
accuracy_threshold = 8

n = 1000
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    multiple_model = LinearRegression()
    multiple_model.fit(X_train, y_train)

    # get metrics
    y_pred = multiple_model.predict(X_test)
    r_squared = multiple_model.score(X_test, y_pred)
    coeffs = multiple_model.coef_
    print('Coefficients: ', coeffs)

    # accuracy 
    accuracy = multiple_model.score(X_test, y_test)
    print('Accuracy: ', accuracy)
    accuracies.append(accuracy)

    if accuracy > best:
        best = accuracy
    if best > accuracy_threshold:
        with open("multiple_model.pickle", "wb") as file:
            pickle.dump(multiple_model, file)
        break

Coefficients:  [ 0.06276476 -0.1862345   0.69486223 -1.17117485 -0.16359644]
Accuracy:  0.3543364615670662
Coefficients:  [ 0.11361135 -0.24634473  0.79400095 -1.07825847 -0.3892132 ]
Accuracy:  0.5047671203728913
Coefficients:  [ 0.22525264 -0.201888    0.90195806 -0.93193821 -0.36559896]
Accuracy:  0.5109493995904812
Coefficients:  [ 0.22642016 -0.19256698  0.7623932  -0.95594076 -0.28987058]
Accuracy:  0.5239617171892171
Coefficients:  [ 0.21229546 -0.19086024  0.75268441 -0.99833641 -0.32474083]
Accuracy:  0.646600656174259
Coefficients:  [ 0.10680605 -0.0062649   0.70608917 -0.85946443 -0.31083278]
Accuracy:  0.6922095328956194
Coefficients:  [ 0.28938999 -0.09762323  0.83405556 -0.87785775 -0.42388192]
Accuracy:  0.22383761825403548
Coefficients:  [ 0.10329748 -0.09930027  0.65008967 -1.01775855 -0.13448688]
Accuracy:  0.5921322676897802
Coefficients:  [ 0.15544095 -0.06799438  0.67116853 -0.90018639 -0.11107502]
Accuracy:  0.7778610781522071
Coefficients:  [ 0.07995602 -0.106287

In [16]:
average = sum(accuracies) / len(accuracies) - 1

print('Best accuracy: ', best)
print('Average accuracy: ', average)

Best accuracy:  0.8917699271355081
Average accuracy:  -0.6623389443210177


In [32]:
pickle_in = open("../../models/multiple_model.pickle", "rb")
linear = pickle.load(pickle_in)

print("Coefficients of ", X.shape[1], " dimensions:\n", linear.coef_)
print("Intercept: \n", linear.intercept_)

# predictions
predictions = linear.predict(X_test)
result = linear.score(X_test, y_test)

#print(X_test, y_test)
for i in range(len(predictions)):
    print(X_test.iloc[i], end = '\n')
    print(predictions[i], y_test.iloc[i], end = '\n')
    print('\n')
print('Overall Accuracy: ', result)


Coefficients of  5  dimensions:
 [ 0.20742586 -0.2484355   0.82525031 -1.25729598 -0.78618164]
Intercept: 
 2.1858080886286526
Visits         2
Items          2
Age            4
Solo?          1
Duplicates?    0
Name: 29, dtype: int64
4.147494059152265 5


Visits         3
Items          2
Age            3
Solo?          0
Duplicates?    0
Name: 26, dtype: int64
4.78696559811213 5


Visits         1
Items          1
Age            2
Solo?          1
Duplicates?    0
Name: 2, dtype: int64
2.5380030825076902 2


Visits         4
Items          2
Age            2
Solo?          0
Duplicates?    0
Name: 14, dtype: int64
4.169141154306457 5


Visits         2
Items          2
Age            2
Solo?          1
Duplicates?    0
Name: 4, dtype: int64
2.4969934426359934 2


Visits         2
Items          1
Age            2
Solo?          1
Duplicates?    0
Name: 3, dtype: int64
2.7454289469601534 2


Visits         1
Items          1
Age            2
Solo?          1
Duplicates?    0
Name: 15,