# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.csv

## Importing the dataset

In [2]:
#dataset = pd.read_csv('datapoint-task-test-mini.csv', low_memory=False)
dataset = pd.read_csv('datapoint-task-full.csv', low_memory=False)

# sanity check
if list(dataset.columns[0:4]) != ['AVAILABLE NODES COUNT', 'TASK ID', 'CPU REQUIRED', 'MEMORY REQUIRED']:
    raise Exception('illegal columns')

# remove task duplicates
dataset = dataset.drop_duplicates(subset='TASK ID', keep='last')

X = dataset.iloc[:, 2:].values # from 3rd column
y = dataset.iloc[:, 0].values # first column

features_count = len(X[0])
print(f"Features count = {features_count}")

task_ids = dataset.iloc[:, 1].values # first column
print(f"Tasks count = {len(task_ids)}")

# fill in missing values with zeros
#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
#imputer.fit(X)
#X = imputer.transform(X)

print(X)


Features count = 69
Tasks count = 145184
[[0.004376 0.0003975 nan ... nan nan nan]
 [0.004376 0.0003975 nan ... nan nan nan]
 [0.004376 0.0003975 nan ... nan nan nan]
 ...
 [0.0006874 0.0003109 nan ... nan nan nan]
 [0.0006874 0.0003109 nan ... nan nan nan]
 [0.0006874 0.0003109 nan ... nan nan nan]]


## Encoding categorical data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

columns_list = [i for i in range(2, features_count)]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), columns_list)], 
    remainder='passthrough', 
    sparse_threshold=0, 
    n_jobs=-1
)
X_cat_encoded = np.array(ct.fit_transform(X))

print(f"Encoded columns count: {len(X_cat_encoded[0])}")
print(ct.get_feature_names_out())
print(X_cat_encoded)

Encoded columns count: 100
['encoder__x2_nan' 'encoder__x3_nan' 'encoder__x4_nan' 'encoder__x5_nan'
 'encoder__x6_nan' 'encoder__x7_nan' 'encoder__x8_${AG} <> 0'
 'encoder__x8_nan' 'encoder__x9_${AH} == 2' 'encoder__x9_${AH} >= 1'
 'encoder__x9_nan' 'encoder__x10_nan' 'encoder__x11_nan'
 "encoder__x12_${AK} <> 'hp'"
 "encoder__x12_${AK} <> 'wh', 'wi', 'wj', 'wk'" 'encoder__x12_nan'
 'encoder__x13_nan' 'encoder__x14_${AM} >= 2' 'encoder__x14_${AM} >= 5'
 'encoder__x14_nan' 'encoder__x15_${AN} == 2' 'encoder__x15_nan'
 'encoder__x16_nan' 'encoder__x17_nan' 'encoder__x18_nan'
 'encoder__x19_nan' 'encoder__x20_${AS} == ' 'encoder__x20_nan'
 'encoder__x21_nan' 'encoder__x22_nan' 'encoder__x23_nan'
 'encoder__x24_nan' 'encoder__x25_nan' 'encoder__x26_nan'
 'encoder__x27_nan' 'encoder__x28_nan' 'encoder__x29_nan'
 'encoder__x30_nan' 'encoder__x31_nan' 'encoder__x32_nan'
 'encoder__x33_nan' 'encoder__x34_nan' 'encoder__x35_nan'
 'encoder__x36_nan' 'encoder__x37_nan' 'encoder__x38_nan'
 'encode

## Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X_cat_encoded, y, test_size = 0.9, random_state = 0)

In [5]:
print(y_train)

[12475 12475 12360 ... 12475 12475 12475]


## Training the Multiple Linear Regression model on the Training set

In [6]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [9]:
y_test_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_test_pred.reshape(len(y_test_pred),1), y_test.reshape(len(y_test),1)),1))

[[12474.98 12475.  ]
 [12475.   12475.  ]
 [12474.99 12475.  ]
 ...
 [12474.99 12475.  ]
 [12475.   12475.  ]
 [12475.   12475.  ]]


## Save category encoded csv

In [19]:
y_pred = regressor.predict(X_cat_encoded)
y_pred = np.around(y_pred, 6)

dataset_cat_encoded = pd.DataFrame(X_cat_encoded, columns=ct.get_feature_names_out())

dataset_cat_encoded.insert(0, "TASK ID", task_ids)
dataset_cat_encoded.insert(1, "AVAILABLE NODES COUNT", y)
dataset_cat_encoded.insert(2, "PREDICTED AVAILABLE NODES COUNT", y_pred)
dataset_cat_encoded.to_csv('datapoint-task-category-encoded.csv', index=False)