# Code for implementing a prediction model that forecasts service time in a repair service company

### Author: Mohammad Mosaffa, mohammadmosaffa@ubc.ca
Most of this code is written in Python; a small part, which selects the data from the dataset, is written in SQL Server.


### Section 1, Feature Engineering:
The following code calculates, for each record, the number of laptops that were waiting in the queue to be repaired. This feature can significantly influence the service time, and its effectiveness will be discussed comprehensively.

Using the entry date and time of each device, two new variables were created:

The first is the cumulative number of devices that are in the queue.

The second is the total number of devices received on the day when a specific device arrives.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the exported repair records; cells holding a single blank are read as NaN.
df = pd.read_excel('C:/Users/moham/OneDrive/Desktop/Article/New folder (2)/Final_Df.xlsx', na_values=' ')

# Convert both date columns element by element.  Series.map is used instead of
# Series.agg because the element-wise fallback of Series.agg is deprecated.
df['Enter_Date'] = df['Enter_Date'].map(pd.Timestamp)
df['Exit_Date'] = df['Exit_Date'].map(pd.Timestamp)
df['Exit_Date'] - df['Enter_Date']  # notebook display only; the value is not stored

# Time each device spent in the shop, plus the two counters filled in by the
# queue loop of this section.
df['TotalDays'] = df['Exit_Date'] - df['Enter_Date']
df["Received_in_Day"] = 0
df["In_Queue"] = 0

# For every record i, walk forward through the following records while the
# value in column 7 of record i is still >= the value in column 1 of the later
# record, and bump one of two counters on that later record:
#   * column 19 when the two values are equal,
#   * column 18 otherwise.
# NOTE(review): the columns are addressed by position; presumably 7/1 are the
# exit/entry dates and 18/19 the In_Queue/Received_in_Day counters — confirm
# against the actual column order of Final_Df.xlsx.
n_rows = len(df)
for i in range(n_rows):
    print(i)  # progress indicator
    counter = i + 1
    # The bound is checked *before* indexing: the original evaluated
    # df.iloc[counter, 1] first and raised IndexError on the last record.
    while counter < n_rows and df.iloc[i, 7] >= df.iloc[counter, 1]:
        if df.iloc[i, 7] == df.iloc[counter, 1]:
            df.iloc[counter, 19] = df.iloc[counter, 19] + 1
        else:
            df.iloc[counter, 18] = df.iloc[counter, 18] + 1
        counter = counter + 1
        
# Persist the frame, including the two counters added in this section.
df.to_excel('C:/Users/moham/OneDrive/Desktop/Article/New folder (2)/saved_file.xlsx')
# NOTE(review): the left-hand side is a difference of two date columns and is
# compared with the bare integer 90 — confirm this comparison behaves as
# intended on the pandas version in use (a Timedelta bound may be needed).
df.loc[(df['Exit_Date']-df['Enter_Date'])<90]

# Overwrites TotalDays with column 6 divided by the number of seconds in a day,
# rounded.  Presumably column 6 holds a duration in seconds — TODO confirm.
df['TotalDays'] = round(df.iloc[:,6]/(3600*24))

# Notebook display only: column 20 of the records whose Enter_Year is 2018.
df[df['Enter_Year']==2018].iloc[:,20]

With the following code, the day name of each entry date was obtained, so that a date can be split into the name of the day, the name of the month, and the year.

In [None]:
# Parse both date columns element by element (Series.map; the element-wise
# fallback of Series.agg is deprecated).
df['Enter_Date'] = df['Enter_Date'].map(pd.Timestamp)
df['Exit_Date'] = df['Exit_Date'].map(pd.Timestamp)
df['Enter_Date'].iloc[0].day_name()  # notebook display: day name of the first record

# Weekday name of every entry date in one vectorised call.  The original looped
# over range(len(df)) with df.loc[i, ...], which is slow and only correct when
# the index is exactly 0..n-1.
df['Enter_Day_Name'] = pd.to_datetime(df['Enter_Date']).dt.day_name()

### Section 2, Obtaining the distribution of the actual service-time:

Using `Fitter` from the fitter package, several well-known statistical distributions were fitted in order to find the best-fitting distribution and its parameters.

In [None]:
# Imports come first: the original cell called sns.* before importing seaborn.
import numpy as np
import pandas as pd
import seaborn as sns
from fitter import Fitter, get_common_distributions, get_distributions

sns.set_style('white')
sns.set_context("paper", font_scale=2)

# NOTE(review): `dfff` is not created anywhere in the visible notebook; it is
# presumably a frame with a numeric TotalDays column — confirm where it is built.
short_waits = dfff.loc[dfff['TotalDays'] <= 30, 'TotalDays']
sns.displot(data=short_waits, bins=30, aspect=1.5)

# Days: (almost) all service times; Dayss: only those of at most 30 days.
# `data` is also bound, as in the original chained assignments.
Days = data = dfff.loc[dfff['TotalDays'] <= 10000, 'TotalDays']
Dayss = data = short_waits

# Fit four candidate distributions to the service times held in `Days`.
f = Fitter(Days,
           distributions=[
            'chi2',
            'expon',
            'exponpow',
            'gamma'])
f.fit()

# Notebook display: summary of the fitted candidates.
f.summary()

# Notebook display: the distribution names fitter exposes as "common".
get_common_distributions()

# Notebook display: fitted parameters and fitted pdf of the exponential candidate.
f.fitted_param['expon']
f.fitted_pdf['expon']

# Evaluate the exponential density on a coarse grid so it can be drawn.
# The original cell used `scipy`, `linspace` and `plot` without importing or
# defining them; they are now taken from scipy.stats, numpy and pyplot.
import scipy.stats

dist = scipy.stats.expon
# (loc, scale) hard-coded by the author — presumably copied from
# f.fitted_param['expon']; TODO confirm.
param = (0.0, 4.5623891469594597)
X = np.linspace(0, 30, 10)
pdf_fitted = dist.pdf(X, *param)
plt.plot(X, pdf_fitted, 'o-')

# Overlay the kernel density of the observed service times (<= 30 days) with
# the exponential pdf evaluated on the grid X.
plt.figure(figsize=(14, 10))
plt.title('Accuracy of Approximated Exponential Dist for The Actual Waiting-Time')
plt.xlabel('Days')
plt.ylabel('Density')
# `fill=` replaces the deprecated `shade=` keyword of seaborn.kdeplot.
sns.kdeplot(dfff.loc[dfff['TotalDays'] <= 30, 'TotalDays'], fill=True, color='olive')
# The original called a bare `plot`, which is not defined in this notebook.
plt.plot(X, pdf_fitted, 'o-')
plt.grid(linestyle='-', linewidth=0.5)
plt.legend(['Actual Density', 'Predicted Exponential'])
plt.show()

### Section 3, Preprocessing the dataset:
In this section, all columns except the text one were investigated in order to transform them into a form that is suitable for a predictive model.

This section's main challenge is cleaning the **Estimation** column. It must be numeric, since for each device it holds a guess of the repair cost. However, it contained text, numbers, and symbols. Also, some records gave the estimated cost as an interval; for these, the average of the interval bounds was calculated.

In [None]:
#Transforming time to the number
from datetime import *
# Turn the time stored in column 5 into a fractional hour (hour + minute / 60).
# Values are parsed element by element with pd.Timestamp (Series.map instead of
# the deprecated element-wise Series.agg), then the hour/minute parts are read
# in one vectorised step instead of a Python loop over the rows.
entry_times = pd.to_datetime(df.iloc[:, 5].map(pd.Timestamp))
Time = (entry_times.dt.hour + entry_times.dt.minute / 60).to_numpy()
df.iloc[:, 5] = Time

#Normalizing the Brand column: drop blanks and lower-case.
# The .str accessor works element-wise (no reliance on the deprecated
# element-wise fallback of Series.agg) and leaves missing values as NaN
# instead of raising on them.
df.iloc[:, 6] = df.iloc[:, 6].str.replace(' ', '', regex=False).str.lower()

#Normalizing the Model column: cast to text, drop blanks and upper-case.
df.iloc[:, 7] = df.iloc[:, 7].apply(str)
df.iloc[:, 7] = df.iloc[:, 7].str.replace(' ', '', regex=False).str.upper()

#Normalizing the Estimated column
import re  # `re` is used throughout the notebook but never imported in the visible cells


def _mean_of_integers(text):
    """Return the mean of the stand-alone integers found in *text*, or 0 if there are none.

    An interval such as "200 - 300" therefore becomes 250.0.
    NOTE(review): digit groups separated by punctuation (e.g. "1,500") are
    treated as separate numbers, exactly as in the original loop — confirm
    that this is acceptable for the price format in the data.
    """
    numbers = [int(token) for token in re.findall(r'\b\d+\b', text)]
    return sum(numbers) / len(numbers) if numbers else 0


# Text form of the raw estimates, then one numeric value per record.  Mapping
# a helper replaces the row loop that ran the same regex three times per
# record and addressed rows by label.
dfest = df['Estimated_Price'].apply(str)
dfest2 = dfest.map(_mean_of_integers)

dfest = df['Estimated_Price'] = dfest2

plt.hist(dfest2, bins=range(0, 30000000, 1000000), edgecolor="black")


#### Section 4, Preprocessing, Text:
In this section, the text was first normalized; then, by creating a matrix of word similarities, an automatic approach for correcting misspelled words was developed; and finally, after eliminating words that occur only a few times in the dataset, the remaining words were transformed into numeric variables using the Bag of Words (BOW) technique:

In [None]:
#Working on text column
# diag holds the Explanation column as text; diag2 is an independent copy of it.
diag = df['Explanation'].apply(str).copy()
diag2 = diag.copy()
# Exploratory regex checks on record 2: the results are only displayed by the
# notebook, nothing is stored.
re.sub('[^\w\s]',"",diag[2])
re.findall('\w',diag[2])

re.findall('\w\s',diag[2])#Find word characters followed by a white space
re.findall('\d+\$',diag[2])

#Eliminating all foreign characters such as Latin letters and digits
import re  # not imported in the visible cells of the notebook

_latin_or_digit = re.compile("[a-zA-Z0-9]+")
# Raw string: '\w' / '\s' in a plain literal are invalid escape sequences.
_not_word_or_space = re.compile(r'[^\w\s]')

# Strip Latin letters/digits first, then everything that is neither a word
# character nor whitespace.  Mapping over the Series replaces the loop that
# addressed rows by label with a positional counter.
diag = diag2.map(lambda text: _not_word_or_space.sub("", _latin_or_digit.sub("", text)))

#Normalizing the two kinds of 'ye' in Farsi
diag = diag.str.replace('ي', 'ی', regex=False)

#Removing the stopwords in Farsi
prstop = pd.read_excel('C:/Users/moham/OneDrive/Desktop/Article/Dataset/stopwords.xlsx')
# First column of the spreadsheet as a plain list of stopwords.
Row_list = prstop.iloc[:, 0].tolist()

# Trial run on a single explanation (record 1): print every stopword found and
# keep the remaining words.  A new list is built because removing items from
# `words` while iterating over it skips the word after each removal.
words = []
for word in diag[1].split():
    if word in Row_list:
        print(word)
    else:
        words.append(word)
words = ' '.join(words)

#Removing stopwords and a few frequent filler words from every explanation
# NOTE(review): diag3 starts from diag2 (the untouched text copy), not from the
# cleaned `diag` — confirm that this is intended.
diag3 = diag2.copy()
_unwanted = set(Row_list) | {'نميشود', 'ميشود', 'و', 'گي', 'در'}
# Filtering into a new word list fixes the original loop, which removed items
# from the list it was iterating over and therefore skipped the word following
# every removed one (so consecutive stopwords survived).
diag3 = diag3.map(lambda text: ' '.join(w for w in text.split() if w not in _unwanted))
    

#Joining technical terms that were typed as several words (part 2 of the clean-up)
# (phrase as typed, joined replacement) — applied in this order.
_PHRASE_FIXES = (
    ('ال دي', 'الايدي'),
    ('ال سي دي', 'السيدي'),
    ('مادر برد', 'مادربرد'),
    ('يو اس', 'يواسبس'),
    ('ال سي ذي', 'السيدي'),
)


def _join_split_terms(text):
    """Return *text* with every phrase of _PHRASE_FIXES replaced by its joined form.

    As in the original code, a phrase is only replaced when it occurs in the
    text as it was *before* any replacement of this pass.
    """
    present = [phrase in text for phrase, _ in _PHRASE_FIXES]
    for found, (phrase, joined) in zip(present, _PHRASE_FIXES):
        if found:
            text = text.replace(phrase, joined)
    return text


# The original loop also called re.sub('ا', 'آ', diag.iloc[i]) and discarded
# the result; that statement had no effect and is left out.
diag3 = diag3.map(_join_split_terms)

#Counting how often each word occurs in the dataset
numberofwords = diag3.str.split(expand=True).stack().value_counts()
ExpWords = numberofwords[numberofwords>=20]

#Removing stand-alone characters from the text
a = ['ا','آ','ب','پ','ت','ث','ج','چ','ح','خ','د','ذ','ر','ز','ژ','س','ش','ص','ض','ط','ظ','ع','غ','ک','گ','ل','م','ن','و','ه','ی','ئ','ي','ف','ق',' ']
_single_chars = set(a)
# Filtering into a new word list: the original removed items from the list it
# was iterating over, which skips the word after every removed one.
diag3 = diag3.map(lambda text: ' '.join(w for w in text.split() if w not in _single_chars))

# Round-trip the word counts through Excel.
# NOTE(review): the counts were computed before the single characters were
# removed above — confirm that this order is intended.
numberofwords.index
numberofwords.to_excel('C:/Users/moham/OneDrive/Desktop/Article/Dataset/matrix.xlsx')
numberofwords2=pd.read_excel('C:/Users/moham/OneDrive/Desktop/Article/Dataset/matrix.xlsx')
list(numberofwords2.iloc[9,0])[0] in list(numberofwords2.iloc[9,0])  # notebook display only

# Keep a copy of the word table as it was read, before the loop below writes into it.
numberofwords2back = numberofwords2.copy()
count = 0
# For every ordered pair (i, j): the share of characters of word i that occur
# anywhere in word j (position is ignored).
for i in range(len(numberofwords2)):
    print(i)
    word1 = list(numberofwords2.iloc[i,0])
    for j in range(len(numberofwords2)):
        word2 = list(numberofwords2.iloc[j,0])
        for k in range(len(word1)):
            if word1[k] in word2:
                count = count + 1
        # NOTE(review): the result is written to column j+1 by position, so the
        # frame read from matrix.xlsx must already contain one column per word
        # after column 0 — confirm the layout of that file.
        numberofwords2.iloc[i,j+1] = count/len(word1)
        count = 0

In the following section, the code creates a misspelling matrix that gives the similarity percentage of each pair of words. With the words sorted by how often they occur in the dataset, a word that is highly similar to a more frequent form can be replaced by that form. In this paper, words with a similarity of 90% or higher to the original form were replaced.

It should be mentioned that this approach is semi-automatic: it works better under human supervision.

In [None]:
#Dealing with misspelled words
# Copy of the word table taken before the similarity values are written below.
numberofwords2back = numberofwords2.copy()

count = 0
# For every ordered pair (i, j): the share of positions, over the length of the
# shorter word, at which both words carry the same character.
for i in range(len(numberofwords2)):
    print(i)
    word1 = list(numberofwords2.iloc[i,0])
    for j in range(len(numberofwords2)):
        word2 = list(numberofwords2.iloc[j,0])
        # lenn = length of the shorter of the two words
        if len(word1) > len(word2):
            lenn = len(word2)
        else:
            lenn = len(word1)
        for k in range(0,lenn):
            if word1[k] == word2[k]:
                count = count + 1
        # NOTE(review): written to column j+1 by position, which presupposes one
        # column per word after column 0; an empty word would make lenn zero.
        numberofwords2.iloc[i,j+1] = count/lenn
        count = 0
        
# Save the filled similarity matrix.
numberofwords2.to_excel('C:/Users/moham/OneDrive/Desktop/Article/Dataset/matrix2.xlsx')

# Drop rows that were picked out by hand and renumber the remaining ones.
# reset_index(drop=True) replaces `reset_index()` followed by `del df['index']`:
# that deleted a column of the wrong frame (df), failed on its second use, and
# left an extra 'index' column in numberofwords2back.
numberofwords2back = numberofwords2back.drop(numberofwords2back.index[[29, 40, 45, 51, 66, 76, 80, 92, 98, 109]])
numberofwords2back = numberofwords2back.reset_index(drop=True)
numberofwords2back = numberofwords2back.drop(numberofwords2back.index[[58]])
numberofwords2back = numberofwords2back.reset_index(drop=True)

# Notebook inspection only: entries equal to 1 in selected rows.
numberofwords2back.loc[58, numberofwords2back.iloc[58, :] == 1]
# Stray text that was pasted into the cell (presumably the words being
# inspected); kept as a comment so that the cell parses: برداشته برد
numberofwords2.loc[15, numberofwords2.iloc[15, :] == 1]
numberofwords2back.loc[8, numberofwords2back.iloc[8, :] == 1].index[15]

diag[49]

numberofwords2back.loc[8, numberofwords2back.iloc[8, :] == 1].index[15] in diag[49]


# Work on a copy of the cleaned explanations.
diag4 = diag3.copy()
listt = []
# For every explanation i and every row t of the similarity table, look at the
# column labels whose entry in row t equals 1.  Whenever such a label occurs as
# a substring of the explanation, the explanation is rebuilt word by word.
for i in range(len(diag4)):
    print(i)
    for t in range(len(numberofwords2back)):
        for k in range(len(numberofwords2back.loc[t,numberofwords2back.iloc[t,:] == 1])):
            if numberofwords2back.loc[t,numberofwords2back.iloc[t,:] == 1].index[k] in diag4[i]:
                for j in range(len(diag4[i].split())):
                    if diag4[i].split()[j] == numberofwords2back.loc[t,numberofwords2back.iloc[t,:] == 1].index[k]:    
                        # The word equals the label, so splitting it on the label
                        # yields two empty strings; word[1] is therefore ''.
                        # NOTE(review): confirm that appending '' plus the label
                        # is the intended replacement.
                        word = diag4[i].split()[j].split(numberofwords2back.loc[t,numberofwords2back.iloc[t,:] == 1].index[k])
                        listt.append(word[1])
                        listt.append(numberofwords2back.loc[t,numberofwords2back.iloc[t,:] == 1].index[k])
                    else:
                        # Every other word is copied unchanged.
                        listt.append(diag4[i].split()[j])
                diag4[i] = ' '.join(listt)
                listt = []

Eliminating extraneous words from the dataset and replacing suspicious words with their trusted form.

In [None]:
def intersection(lst1, lst2):
    """Return the items of *lst1* that are also contained in *lst2*.

    The order of *lst1* is kept and duplicates in *lst1* are preserved.
    """
    lst3 = []
    for value in lst1:
        if value in lst2:
            lst3.append(value)
    return lst3

# Restart from the cleaned explanations.
diag4 = diag3.copy()
listt = []
for j in range(len(diag4)):
    # Only the first 100 rows of the similarity table are considered.
    for i in range(0, 100):
        listt = []
        # Column labels whose entry in row i equals 1.  The code treats the
        # first label as the reference form and the others as its variants.
        similar = numberofwords2back.loc[i, numberofwords2back.iloc[i, :] == 1].index
        tokens = diag4[j].split()
        variants_found = intersection(tokens, similar[1:])
        # The condition is wrapped in parentheses: the original spread it over
        # three lines without any continuation, which is a syntax error.
        if (
            len(variants_found) > 0
            and len(diag4[j].split(similar[0])) > 1
            and similar[0] not in tokens
        ):
            print(variants_found)
            print(j)
            print(similar[0])
            # Cut the text at the reference form, then append that form as a
            # word of its own.
            listt = diag4[j].split(similar[0])
            print(listt)
            listt.append(similar[0])
            print(listt)
            listt = ' '.join(listt)
            print(listt)
            diag4[j] = listt

Transforming text into numeric by using Bag of Words

In [None]:
# Make sure every explanation is text.
diag4 = diag4.apply(str)

# One long string holding all explanations, each preceded by a blank and with
# its inner whitespace collapsed to single blanks.
Words = ''.join(' ' + ' '.join(text.split()) for text in diag4)

FinalWords = diag4.agg(lambda x:' '.join(x))

# Explanations that are not the text 'nan' (i.e. were not missing).
z = diag4[diag4 != 'nan']

# Occurrences of every word over those explanations; keep words seen >= 20 times.
word_counts = z.str.split(expand=True).stack().value_counts()

frequent_enough = word_counts >= 20
Final_Words = word_counts[frequent_enough]


import re  # not imported in the visible cells of the notebook

# Working copy of the data with the cleaned explanations as text.
dff = df.copy()
dff['Explanation'] = diag4.apply(str)

# Missing explanations (the text 'nan') become a single blank.  One .loc
# assignment replaces the chained assignment, which modified a temporary copy
# and had no effect, and the row loop over the hard-coded column position 8.
dff.loc[dff['Explanation'] == 'nan', 'Explanation'] = ' '

# Bag of words: one column per frequent word holding how often that word
# occurs in each explanation.  The original used len(re.search(...)), which
# raises TypeError because a match object has no length (and is None when the
# word is absent).  re.escape makes the word match literally.
allfeatures = pd.DataFrame(
    {word: dff['Explanation'].str.count(re.escape(word)) for word in Final_Words.index},
    index=dff.index,
)

# A DataFrame (unlike the original NumPy array) can be written to Excel.
allfeatures.to_excel('C:/Users/moham/OneDrive/Desktop/Article/Dataset/allfeatures.xlsx')

Concatenating all features into one dataframe

In [None]:
# Make sure the bag-of-words matrix is a DataFrame aligned with dff (it is
# created as a NumPy array earlier in the notebook).
allfeatures = pd.DataFrame(allfeatures, index=dff.index)
allfeatures  # notebook display (the original `allfeatures[]` is a syntax error)
allfeatures.iloc[:, allfeatures.iloc[0, :].values == 5]  # notebook display only
from sklearn.preprocessing import LabelEncoder

# Selected columns of dff followed by the bag-of-words columns.  `axis` must be
# passed by keyword: pandas 2 no longer accepts it positionally.
Complete_data = pd.concat([dff.iloc[:, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14]], allfeatures], axis=1)

# NOTE(review): this rebuilds Complete_data WITHOUT the bag-of-words columns and
# so discards the result of the previous statement — confirm which of the two
# versions is meant to be saved.
Complete_data = pd.concat([dff.iloc[:, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14]]], axis=1)

Complete_data.to_excel('C:/Users/moham/OneDrive/Desktop/Article/Dataset/PreparedData.xlsx')

### Section 4, Preparing the final data for predictive models:


In [None]:
Complete_data = pd.read_csv('/content/PreparedData2.csv', na_values=' ')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Distribution of the target over the first ten days.
plt.title('Without In_Queue number, Without adding weight')
plt.hist(Complete_data['DaysToFinished'], bins=range(0, 11, 1), edgecolor="black", color='purple')

# Keep only the repairs that finished within 8 days.
Complete_data2 = Complete_data[Complete_data['DaysToFinished'] <= 8].copy()

#Complete_data2 = Complete_data2[Complete_data2['Explanation'].notna()]

# Features: everything except the target and the raw text.  drop() returns a
# new frame, so the assignments below do not write into a slice of
# Complete_data2 (which triggered SettingWithCopyWarning in the original).
X = Complete_data2.drop(columns=['DaysToFinished', 'Explanation'], errors='ignore')

Y = Complete_data2['DaysToFinished']

# Replace every categorical column by integer codes.
enc = LabelEncoder()
for column in ('Enter_Month', 'Enter_Day_Name', 'Enter_Year', 'Brand', 'Model'):
    X[column] = enc.fit_transform(X[column])

# The target is passed through the same encoder, as in the original.
# NOTE(review): y holds the rank of each distinct DaysToFinished value; it
# equals the number of days only if no value is missing from the data — confirm.
y = enc.fit_transform(Y)

X.describe()

#### Variable selection and ranking:

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

#Chi2 method
X = X.loc[:, X.columns != 'Model']
n_features = X.shape[1]
# k is capped at the number of available columns: SelectKBest rejects a k
# larger than the number of features.
bestfeatures = SelectKBest(score_func=chi2, k=min(10, n_features))
fit = bestfeatures.fit(X, y)
# Feature names next to their chi2 scores, for easier reading.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10, 'Score'))  #print 10 best features


#Tree-based variable selection
model = ExtraTreesClassifier()
model.fit(X, y)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')
plt.show()

#Selecting the top 100 variables (or all of them when there are fewer)
X.shape
X_new = SelectKBest(chi2, k=min(100, n_features)).fit_transform(X, y)
X_new.shape

As we know, integer-coded categorical variables may be treated by a model as an ordered ranking. As a result, it is reasonable to transform categorical variables into 0/1 form by using the dummy-variable technique:

In [None]:
# One-hot encode the categorical columns.  get_dummies(..., columns=...) drops
# the encoded columns itself and prefixes every indicator with the name of its
# source column.  The original concatenated four unprefixed blocks (passing
# `axis` positionally, which pandas 2 rejects); since the columns had been
# integer-coded, the indicator labels 0, 1, 2, ... were repeated across
# variables and mixed with the string column names.
categorical = ['Enter_Month', 'Enter_Day_Name', 'Enter_Year', 'Brand']
X = pd.get_dummies(X, columns=[name for name in categorical if name in X.columns])
#X = pd.concat([X,pd.get_dummies(X['Model'])],axis=1)

# 'Model' is not one-hot encoded; remove it if it is still present.
X = X.drop(columns=['Model'], errors='ignore')

X.shape[1]
X

Splitting the data into training and test sets. Also, all variables were normalized by using the MinMax scaler technique.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 15% of the records for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# "_N" variants: the same split without the two queue-related features.
queue_columns = ['In_Queue', 'Received_in_Day']
X_train_N = X_train.drop(columns=queue_columns, errors='ignore')
X_test_N = X_test.drop(columns=queue_columns, errors='ignore')

# The scaler is fitted on the training part only and then applied to both
# parts.  The original re-fitted it on the test set, which scales train and
# test with different minima/maxima and leaks test statistics.
mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

# Separate scaler for the reduced feature set, so `mms` stays valid for the
# full one.
mms_N = MinMaxScaler()
X_train_N = mms_N.fit_transform(X_train_N)
X_test_N = mms_N.transform(X_test_N)

from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt

### Section 5, Applying machine learning methods (Regression):
By using cross-validation techniques, optimized hyperparameters of **RandomForestRegressor** and **GradientBoostingRegressor** were obtained.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor


# Six regression models with the hyperparameters chosen by the author.
regressor1 = LinearRegression()
regressor2 = PoissonRegressor()
regressor3 = BayesianRidge()
regressor4 = SVR(kernel='rbf')
# max_features='auto' is no longer accepted by current scikit-learn.  For
# these regressors it meant "use all features", which is spelled 1.0 for the
# random forest and None for gradient boosting.
regressor5 = RandomForestRegressor(n_estimators=500, max_features=1.0, random_state=0, max_depth=20)
regressor6 = GradientBoostingRegressor(loss='squared_error', n_estimators=200, max_depth=20, max_features=None)

# Train every model on the same training split ...
regressors = (regressor1, regressor2, regressor3, regressor4, regressor5, regressor6)
for regressor in regressors:
    regressor.fit(X_train, y_train)

# ... and predict the held-out split.
y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6 = (
    regressor.predict(X_test) for regressor in regressors
)

### Section 6, Applying deep learning method:

In [None]:
from tensorflow.keras.layers import Input, Dense, Activation,Dropout
from tensorflow.keras.models import Model

def f(a):
    """Clip negative values to zero: (|a| + a) / 2."""
    return (abs(a)+a)/2


def _build_mlp(hidden_units):
    """Build and compile a fully connected ReLU network with one linear output unit."""
    net_input = Input(shape=(X.shape[1],))
    hidden = net_input
    for units in hidden_units:
        hidden = Dense(units, activation='relu')(hidden)
    net = Model(inputs=net_input, outputs=Dense(1)(hidden))
    net.compile(loss="mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])
    return net


#Model one: hidden layers of 512, 128 and 64 units
model = _build_mlp((512, 128, 64))
history = model.fit(X_train, y_train, batch_size=256, epochs=30, verbose=1, validation_split=0.1)

# RMSE on the training split (raw predictions)
pred_train = model.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train)))

# RMSE on the test split (negative predictions clipped to zero)
pred = model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,f(pred))))

# MAE on the test split (raw predictions)
pred = model.predict(X_test)
print((mean_absolute_error(y_test,pred)))


#Model two: hidden layers of 256, 64 and 32 units
model2 = _build_mlp((256, 64, 32))
history2 = model2.fit(X_train, y_train, batch_size=256, epochs=30, verbose=1, validation_split=0.1)

# RMSE on the training split
pred_train2 = model2.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train2)))

# RMSE and MAE on the test split (raw predictions)
pred2 = model2.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred2)))
print((mean_absolute_error(y_test,(pred2))))

### Section 7, Visualization and comparing methods:

In [None]:
# MAE of the first network on the training split.
pred_train = model.predict(X_train)
print(mean_absolute_error(y_train,pred_train))

# Test predictions with negatives clipped to zero; the second network's
# predictions are additionally rounded down to whole days.
pred = f(model.predict(X_test))
pred2 = np.floor(f(model2.predict(X_test)))

# For each network: the MAE, then the MAE expressed as a percentage of the
# 8-day range, (8 - MAE) / 8 * 100.
for clipped in (pred, pred2):
    mae = mean_absolute_error(y_test,clipped)
    print(mae)
    print(((8-mae)/8)*100)

# Histogram of the gradient-boosting predictions over the actual labels.
day_bins = range(0,10,1)
plt.figure(figsize=(14, 10))
plt.title('Histogram of the GradientBoostingRegressor prediction with actual labels')
plt.xlabel('Days')
plt.ylabel('Number')
#plt.hist(pred, bins=range(0,10,1), edgecolor="black", color='red', alpha=0.5)
plt.hist(y_pred6, bins=day_bins, edgecolor="black", color='blue', alpha=0.5)
plt.hist(y_test, bins=day_bins, edgecolor="black", color='red', alpha=0.5)

plt.grid(linestyle='-', linewidth=0.5)
plt.legend(['Predicted with GradientBoostingRegressor','Actual'])
plt.show()



lenn = 50  # number of random re-splits


def f(a):
    """Clip negative values to zero: (|a| + a) / 2."""
    return (abs(a)+a)/2


def _accuracy_percent(actual, predicted):
    """Return (8 - MAE) / 8 * 100, the MAE expressed against the 8-day range."""
    return ((8 - mean_absolute_error(actual, predicted)) / 8) * 100


# NOTE(review): the models are NOT re-trained inside the loop, so every new
# test split overlaps the data they were trained on; the scores therefore
# measure stability over re-splits rather than out-of-sample accuracy — confirm
# that this is what the comparison is meant to show.
rows = []
for _ in range(lenn):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
    # Scale the test part with a scaler fitted on the training part of this
    # split; the original fitted the scaler on the test part itself.
    X_test = MinMaxScaler().fit(X_train).transform(X_test)

    pred = f(model.predict(X_test))
    pred2 = f(model2.predict(X_test))
    y_pred1 = regressor1.predict(X_test)
    y_pred2 = regressor2.predict(X_test)
    y_pred3 = regressor3.predict(X_test)
    y_pred4 = regressor4.predict(X_test)
    y_pred5 = regressor5.predict(X_test)
    y_pred6 = regressor6.predict(X_test)

    # Negative predictions are clipped to zero before scoring.
    for name, predicted in (
        ('DL_512_Nodes', pred),
        ('DL_256_Nodes', pred2),
        ('LinearRegression', y_pred1),
        ('PoissonRegressor', y_pred2),
        ('BayesianRidge', y_pred3),
        ('SVR', y_pred4),
        ('RandomForestRegressor', y_pred5),
        ('GradientBoostingRegressor', y_pred6),
    ):
        rows.append((_accuracy_percent(y_test, f(predicted)), name))

# Column 0: accuracy in percent, column 1: method name.  Building the frame
# from records keeps column 0 numeric; the original wrote the names into a
# frame pre-filled with float zeros.
l = pd.DataFrame(rows, columns=[0, 1])

sns.boxplot(x=l.iloc[:,0], y=l.iloc[:,1], linewidth=5)
plt.show()



# One histogram per method on a 5 x 2 grid; slot 9 is left empty and the actual
# labels go into slot 10.  Each entry: (grid slot, values, legend label, colour).
panels = [
    (1, pred, 'DL_512_Nodes', 'green'),
    (2, pred2, 'DL_256_Nodes', 'green'),
    (3, y_pred1, 'LinearRegression', 'red'),
    (4, y_pred2, 'PoissonRegressor', 'red'),
    (5, y_pred3, 'BayesianRidge', 'black'),
    (6, y_pred4, 'SVR', 'black'),
    (7, y_pred5, 'RandomForestRegressor', 'black'),
    (8, y_pred6, 'GradientBoostingRegressor', 'green'),
    (10, y_test, 'Actual Data', 'purple'),
]

plt.figure(figsize=(14, 10))
for slot, values, label, colour in panels:
    plt.subplot(5, 2, slot)
    plt.hist(values, bins=range(0,10,1), edgecolor="black", color=colour, alpha=0.5)
    plt.legend([label])
    plt.xlabel('Days')
    plt.ylabel('Number')
    plt.grid(linestyle='-', linewidth=0.5)

### Section 8, Statistical tests for predicted service time with actual service time:

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# The tests compare the per-split accuracy of the methods, i.e. the table `l`
# built by the resampling loop (column 0: accuracy, column 1: method name).
# The original indexed `df`, which in this notebook is the raw repair data and
# contains none of the method names used below.
# NOTE(review): if `df` was deliberately re-bound to a saved copy of that
# table in a cell that is not shown, point `scores` at it instead.
scores = l
accuracy = scores.iloc[:, 0].astype(float)
method = scores.iloc[:, 1]


def _accuracy_of(name):
    """Return the accuracy values recorded for the method called *name*."""
    return accuracy[method == name]


# Two-group check (notebook display only).
f_oneway(_accuracy_of('PoissonRegressor'), _accuracy_of('LinearRegression'))

# One-way ANOVA over all eight methods.
fvalue, pvalue = stats.f_oneway(*(
    _accuracy_of(name)
    for name in (
        'DL_512_Nodes',
        'DL_256_Nodes',
        'LinearRegression',
        'PoissonRegressor',
        'BayesianRidge',
        'SVR',
        'RandomForestRegressor',
        'GradientBoostingRegressor',
    )
))

print(fvalue, pvalue)

# Pairwise Tukey HSD comparison between all methods.
tukey = pairwise_tukeyhsd(endog=accuracy,
                          groups=method,
                          alpha=0.05)

print(tukey)

# ANOVA restricted to SVR versus the random forest.
fvalue, pvalue = stats.f_oneway(_accuracy_of('SVR'),
                                _accuracy_of('RandomForestRegressor'))

print(fvalue, pvalue)