# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.csv

## Importing the dataset

In [2]:
#dataset = pd.read_csv('datapoint-task-test-mini.csv', low_memory=False)
dataset = pd.read_csv('datapoint-task-full.csv', low_memory=False)

# sanity check
if list(dataset.columns[0:4]) != ['AVAILABLE NODES COUNT', 'TASK ID', 'CPU REQUIRED', 'MEMORY REQUIRED']:
    raise Exception('illegal columns')

# remove task duplicates
dataset = dataset.drop_duplicates(subset='TASK ID', keep='last')

X = dataset.iloc[:, 2:].values # from 3rd column
y = dataset.iloc[:, 0].values # first column

features_count = len(X[0])
print(f"Features count = {features_count}")

task_ids = dataset.iloc[:, 1].values # first column
print(f"Tasks count = {len(task_ids)}")

# fill in missing values with zeros
#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
#imputer.fit(X)
#X = imputer.transform(X)

print(X)


Features count = 69
Tasks count = 145184
[[0.004376 0.0003975 nan ... nan nan nan]
 [0.004376 0.0003975 nan ... nan nan nan]
 [0.004376 0.0003975 nan ... nan nan nan]
 ...
 [0.0006874 0.0003109 nan ... nan nan nan]
 [0.0006874 0.0003109 nan ... nan nan nan]
 [0.0006874 0.0003109 nan ... nan nan nan]]


## Encoding categorical data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
columns_list = [i for i in range(2, features_count)]
#print(f'Columns: {columns_list}')
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), columns_list)], 
    remainder='passthrough', 
    sparse_threshold=0, 
    n_jobs=-1,
    verbose=False,
    verbose_feature_names_out=False
)
X_cat_encoded = np.array(ct.fit_transform(X))
print(ct.get_feature_names_out())
print(f"Encoded columns count: {len(X_cat_encoded[0])}")
print(X_cat_encoded)

['x2_nan' 'x3_nan' 'x4_nan' 'x5_nan' 'x6_nan' 'x7_nan' 'x8_${AG} <> 0'
 'x8_nan' 'x9_${AH} == 2' 'x9_${AH} >= 1' 'x9_nan' 'x10_nan' 'x11_nan'
 "x12_${AK} <> 'hp'" "x12_${AK} <> 'wh', 'wi', 'wj', 'wk'" 'x12_nan'
 'x13_nan' 'x14_${AM} >= 2' 'x14_${AM} >= 5' 'x14_nan' 'x15_${AN} == 2'
 'x15_nan' 'x16_nan' 'x17_nan' 'x18_nan' 'x19_nan' 'x20_${AS} == '
 'x20_nan' 'x21_nan' 'x22_nan' 'x23_nan' 'x24_nan' 'x25_nan' 'x26_nan'
 'x27_nan' 'x28_nan' 'x29_nan' 'x30_nan' 'x31_nan' 'x32_nan' 'x33_nan'
 'x34_nan' 'x35_nan' 'x36_nan' 'x37_nan' 'x38_nan' 'x39_nan' 'x40_nan'
 'x41_nan' 'x42_nan' 'x43_nan' 'x44_${B} <> 0' 'x44_${B} <> 0, 1'
 'x44_nan' 'x45_nan' 'x46_${D} == ' 'x46_nan' 'x47_${E} <> 0'
 'x47_${E} >= 0' 'x47_${E} >= 2' 'x47_nan' 'x48_nan' 'x49_${G} <> 0'
 'x49_${G} == 2' 'x49_nan' 'x50_nan' 'x51_${I} == 4' 'x51_${I} >= 1'
 'x51_nan' 'x52_nan' 'x53_nan' 'x54_nan' 'x55_nan'
 "x56_${N} <> 'hs', 'ht', 'hu'" 'x56_nan' 'x57_nan' 'x58_nan' 'x59_nan'
 'x60_${R} >= 0' 'x60_nan' 'x61_nan' 'x62_nan' '

## Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X_cat_encoded, y, test_size = 0.2, random_state = 0)

In [5]:
print(y_train)

[12475 12475 12360 ... 12475 12475 12360]


## Training the Multiple Linear Regression model on the Training set

In [6]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [7]:
y_test_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_test_pred.reshape(len(y_test_pred),1), y_test.reshape(len(y_test),1)),1))

[[12359.21 12360.  ]
 [12475.3  12475.  ]
 [12474.94 12475.  ]
 ...
 [12474.94 12475.  ]
 [12475.01 12475.  ]
 [12475.   12475.  ]]


## Save category encoded csv

In [8]:
y_pred = regressor.predict(X_cat_encoded)
dataset_cat_encoded = pd.DataFrame(X_cat_encoded, columns=ct.get_feature_names_out())
dataset_cat_encoded.insert(0, "TASK ID", task_ids)
dataset_cat_encoded.insert(1, "AVAILABLE NODES COUNT", y)
dataset_cat_encoded.insert(2, "PREDICTED AVAILABLE NODES COUNT", y_pred)
dataset_cat_encoded.to_csv('datapoint-task-category-encoded.csv', index=False)