# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.csv

## Importing the dataset

In [2]:
dataset = pd.read_csv('datapoint-task-test-mini.csv', low_memory=False)
#dataset = pd.read_csv('datapoint-task-full.csv', low_memory=False)

# sanity check
if list(dataset.columns[0:4]) != ['AVAILABLE NODES COUNT', 'TASK ID', 'CPU REQUIRED', 'MEMORY REQUIRED']:
    raise Exception('illegal columns')

# remove task duplicates
dataset = dataset.drop_duplicates(subset='TASK ID', keep='last')

X = dataset.iloc[:, 2:].values # from 3rd column
y = dataset.iloc[:, 0].values # first column

features_count = len(X[0])
print(f"Features count = {features_count}")

task_ids = dataset.iloc[:, 1].values # first column
print(f"Tasks count = {len(task_ids)}")

# fill in missing values with zeros
#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
#imputer.fit(X)
#X = imputer.transform(X)

print(X)


Features count = 5
Tasks count = 27
[[1 0.0003975 nan nan nan]
 [2 0.0003975 nan nan '${D} == ']
 [3 0.0003975 nan nan '${D} == ']
 [4 0.0003975 nan nan nan]
 [5 0.0003975 nan nan '${D} == ']
 [6 0.009924 '${B} <> 0' nan '${D} == ']
 [7 0.009924 '${B} <> 0' nan '${D} == ']
 [8 0.001398 '${B} <> 0, 1' nan '${D} == ']
 [9 0.001398 '${B} <> 0, 1' nan '${D} == ']
 [10 0.00159 nan nan '${D} == ']
 [11 0.009192 '${B} <> 0' nan '${D} == ']
 [12 0.004773 nan nan nan]
 [13 0.001088 nan nan nan]
 [14 0.008813 '${B} <> 0' nan '${D} == ']
 [15 0.008813 '${B} <> 0' nan '${D} == ']
 [16 0.01041 nan nan nan]
 [17 0.001193 nan nan nan]
 [18 0.002081 nan nan nan]
 [19 0.0006218 nan nan nan]
 [20 0.004816 '${B} <> 0' nan nan]
 [21 0.004816 nan nan nan]
 [22 0.004816 '${B} <> 0' nan nan]
 [23 0.004816 nan nan nan]
 [24 0.009937 '${B} <> 0' nan '${D} == ']
 [25 0.009937 '${B} <> 0' nan '${D} == ']
 [26 0.009937 '${B} <> 0' nan '${D} == ']
 [27 0.009937 '${B} <> 0' nan '${D} == ']]


## Encoding categorical data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
columns_list = [i for i in range(2, features_count)]
#print(f'Columns: {columns_list}')
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), columns_list)], 
    remainder='passthrough', 
    sparse_threshold=0, 
    n_jobs=-1,
    verbose=False,
    verbose_feature_names_out=False
)
X_cat_encoded = np.array(ct.fit_transform(X))
print(ct.get_feature_names_out())
print(f"Encoded columns count: {len(X_cat_encoded[0])}")
print(X_cat_encoded)

['x2_${B} <> 0' 'x2_${B} <> 0, 1' 'x2_nan' 'x3_nan' 'x4_${D} == ' 'x4_nan'
 'x0' 'x1']
Encoded columns count: 8
[[0.0 0.0 1.0 1.0 0.0 1.0 1 0.0003975]
 [0.0 0.0 1.0 1.0 1.0 0.0 2 0.0003975]
 [0.0 0.0 1.0 1.0 1.0 0.0 3 0.0003975]
 [0.0 0.0 1.0 1.0 0.0 1.0 4 0.0003975]
 [0.0 0.0 1.0 1.0 1.0 0.0 5 0.0003975]
 [1.0 0.0 0.0 1.0 1.0 0.0 6 0.009924]
 [1.0 0.0 0.0 1.0 1.0 0.0 7 0.009924]
 [0.0 1.0 0.0 1.0 1.0 0.0 8 0.001398]
 [0.0 1.0 0.0 1.0 1.0 0.0 9 0.001398]
 [0.0 0.0 1.0 1.0 1.0 0.0 10 0.00159]
 [1.0 0.0 0.0 1.0 1.0 0.0 11 0.009192]
 [0.0 0.0 1.0 1.0 0.0 1.0 12 0.004773]
 [0.0 0.0 1.0 1.0 0.0 1.0 13 0.001088]
 [1.0 0.0 0.0 1.0 1.0 0.0 14 0.008813]
 [1.0 0.0 0.0 1.0 1.0 0.0 15 0.008813]
 [0.0 0.0 1.0 1.0 0.0 1.0 16 0.01041]
 [0.0 0.0 1.0 1.0 0.0 1.0 17 0.001193]
 [0.0 0.0 1.0 1.0 0.0 1.0 18 0.002081]
 [0.0 0.0 1.0 1.0 0.0 1.0 19 0.0006218]
 [1.0 0.0 0.0 1.0 0.0 1.0 20 0.004816]
 [0.0 0.0 1.0 1.0 0.0 1.0 21 0.004816]
 [1.0 0.0 0.0 1.0 0.0 1.0 22 0.004816]
 [0.0 0.0 1.0 1.0 0.0 1.0 23 0.0048

## Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X_cat_encoded, y, test_size = 0.35, random_state = 0)

In [5]:
print(y_train)

[12360 12360 12360 12475 12360 12475 12360 12360 12475 12475]


## Training the Multiple Linear Regression model on the Training set

In [13]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [16]:
y_test_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_test_pred.reshape(len(y_test_pred),1), y_test.reshape(len(y_test),1)),1))

[[12475. 12475.]
 [12360. 12360.]
 [12360. 12360.]
 [12360. 12360.]
 [12360. 12360.]
 [12360. 11580.]
 [12360. 12360.]
 [12360. 12360.]
 [12475. 12475.]
 [12360. 12360.]
 [12360. 11580.]
 [12475. 12475.]
 [12475. 12475.]
 [12475. 12475.]
 [12475. 12475.]
 [12475. 12475.]
 [12475. 12475.]]


## Save category encoded csv

In [17]:
y_pred = regressor.predict(X_cat_encoded)
dataset_cat_encoded = pd.DataFrame(X_cat_encoded, columns=ct.get_feature_names_out())
dataset_cat_encoded.insert(0, "TASK ID", task_ids)
dataset_cat_encoded.insert(1, "AVAILABLE NODES COUNT", y)
dataset_cat_encoded.insert(2, "PREDICTED AVAILABLE NODES COUNT", y_pred)
dataset_cat_encoded.to_csv('datapoint-task-category-encoded.csv', index=False)