In [6]:
### Linear Regression ###
import time
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score


# Load the cleaned listings table that the rest of this cell works on.
df = pd.read_csv('../../Dataset/preprocessing-cleaning/autos_cleaned.csv', encoding='latin-1')
# Optional price filter, currently disabled:
#df = df[df['price'] > 100]

# Shared preprocessing objects. `le` is re-fitted for every categorical
# column below, so after this section it only remembers the classes of the
# last column it saw ('notRepairedDamage').
le = LabelEncoder()
ohe = OneHotEncoder()
scaler = StandardScaler()

# Integer-encode each categorical column, in the same order as before.
# The generator is consumed left to right by the tuple unpacking, so the
# fit order (and therefore the final state of `le`) is unchanged.
vTypeArr, gearboxArr, modelArr, fTypeArr, brandArr, RepDmgArr = (
    le.fit_transform(df[column].tolist())
    for column in (
        'vehicleType',
        'gearbox',
        'model',
        'fuelType',
        'brand',
        'notRepairedDamage',
    )
)


# Assemble the feature matrix column-wise instead of iterating over
# df.iterrows(): iterrows builds a Series per row (slow on large frames and
# it up-casts every row to a common dtype), whereas stacking whole columns
# is a single vectorised operation.
#
# Column order is kept exactly as in the original row layout:
#   vehicleType, yearOfRegistration, gearbox, powerPS, model, kilometer,
#   monthOfRegistration, fuelType, brand, notRepairedDamage
X = np.column_stack([
    vTypeArr,
    df['yearOfRegistration'].values,
    gearboxArr,
    df['powerPS'].values,
    modelArr,
    df['kilometer'].values,
    df['monthOfRegistration'].values,
    fTypeArr,
    brandArr,
    RepDmgArr,
])

# Target vector: the listing price, kept as a plain Python list as before.
y = df['price'].tolist()

# Standard scaling is currently disabled; every column is one-hot encoded.
# NOTE(review): this also one-hot encodes the numeric columns (year, power,
# kilometer, month) - confirm that treating them as categories is intended.
#X = scaler.fit_transform(X)
X = ohe.fit_transform(X)

print("Finished creating X & y")
print("Now splitting to train & test data...")
# 60 % train / 40 % test, fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

print("Start training...\n")

##################################################
# Ordinary least-squares baseline.
# Fits on the training split, predicts the held-out split, reports error
# metrics on that split and then runs 5-fold cross-validation on the full
# X / y.
print("[Linear Regression]")
reg = LinearRegression(n_jobs=4)

# time.clock() was deprecated in Python 3.3 and removed in Python 3.8;
# time.perf_counter() is the supported high-resolution timer and reports
# elapsed wall-clock seconds.
# Only fit + predict are timed; scoring and cross-validation are excluded.
time_start = time.perf_counter()
print("Start fitting...")
reg.fit(X_train, y_train)

print("Start predicting...")
y_pred = reg.predict(X_test)

time_elapsed = time.perf_counter() - time_start

print("Calculating scores...")
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ev_score = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# cross_val_score is given the whole dataset (train and test rows together).
cv_score = cross_val_score(reg, X, y, cv=5)


print("y prediction: ", y_pred)
print("MSE: ", mse)
print("MAE: ", mae)
print("Explained Variance Score: ", ev_score)
print("R^2 Regression Score: ", r2)
print("Cross Validation Score: ")
print(cv_score)

print("Computation time: ", time_elapsed)
print("\n")

####################################################
# Ridge regression (alpha=0.3), evaluated on a dense copy of X.
# X.toarray() materialises the whole one-hot matrix in memory. The split
# below reuses test_size=.4 and random_state=42, i.e. the same arguments as
# the split used for the other models.
X_dense = X.toarray()
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_dense, y, test_size=.4, random_state=42)

print("[Ridge]")
reg = Ridge(alpha=0.3)

# time.clock() was deprecated in Python 3.3 and removed in Python 3.8;
# time.perf_counter() is the supported high-resolution timer and reports
# elapsed wall-clock seconds.
# Only fit + predict are timed; the densify/split above, scoring and
# cross-validation are excluded.
time_start = time.perf_counter()
print("Start fitting...")
reg.fit(X_train_d, y_train_d)

print("Start predicting...")
y_pred = reg.predict(X_test_d)

time_elapsed = time.perf_counter() - time_start

print("Calculating scores...")
mse = mean_squared_error(y_test_d, y_pred)
mae = mean_absolute_error(y_test_d, y_pred)
ev_score = explained_variance_score(y_test_d, y_pred)
r2 = r2_score(y_test_d, y_pred)
# cross_val_score is given the whole dense dataset (train and test rows together).
cv_score = cross_val_score(reg, X_dense, y, cv=5)

print("y prediction: ", y_pred)
print("MSE: ", mse)
print("MAE: ", mae)
print("Explained Variance Score: ", ev_score)
print("R^2 Regression Score: ", r2)
print("Cross Validation Score: ")
print(cv_score)

print("Computation time: ", time_elapsed)
print("\n")

####################################################
# Lasso regression (alpha=0.3) on the one-hot encoded X.
# Fits on the training split, predicts the held-out split, reports error
# metrics on that split and then runs 5-fold cross-validation on the full
# X / y.

print("[Lasso]")
reg = Lasso(alpha=0.3)

# time.clock() was deprecated in Python 3.3 and removed in Python 3.8;
# time.perf_counter() is the supported high-resolution timer and reports
# elapsed wall-clock seconds.
# Only fit + predict are timed; scoring and cross-validation are excluded.
time_start = time.perf_counter()
print("Start fitting...")
reg.fit(X_train, y_train)

print("Start predicting...")
y_pred = reg.predict(X_test)

time_elapsed = time.perf_counter() - time_start

print("Calculating scores...")
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ev_score = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# cross_val_score is given the whole dataset (train and test rows together).
cv_score = cross_val_score(reg, X, y, cv=5)

print("y prediction: ", y_pred)
print("MSE: ", mse)
print("MAE: ", mae)
print("Explained Variance Score: ", ev_score)
print("R^2 Regression Score: ", r2)
print("Cross Validation Score: ")
print(cv_score)

print("Computation time: ", time_elapsed)
print("\n")

####################################################
# ElasticNet regression (alpha=0.3) on the one-hot encoded X.
# Fits on the training split, predicts the held-out split, reports error
# metrics on that split and then runs 5-fold cross-validation on the full
# X / y.
print("[ElasticNet]")
reg = ElasticNet(alpha=0.3)

# time.clock() was deprecated in Python 3.3 and removed in Python 3.8;
# time.perf_counter() is the supported high-resolution timer and reports
# elapsed wall-clock seconds.
# Only fit + predict are timed; scoring and cross-validation are excluded.
time_start = time.perf_counter()
print("Start fitting...")
reg.fit(X_train, y_train)

print("Start predicting...")
y_pred = reg.predict(X_test)

time_elapsed = time.perf_counter() - time_start

print("Calculating scores...")
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ev_score = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# cross_val_score is given the whole dataset (train and test rows together).
cv_score = cross_val_score(reg, X, y, cv=5)

print("y prediction: ", y_pred)
print("MSE: ", mse)
print("MAE: ", mae)
print("Explained Variance Score: ", ev_score)
print("R^2 Regression Score: ", r2)
print("Cross Validation Score: ")
print(cv_score)

print("Computation time: ", time_elapsed)
print("\n")


Finished creating X & y
Now splitting to train & test data...
Start training...

[Linear Regression]
Start fitting...
Start predicting...
Calculating scores...
y prediction:  [  5725.78935446    581.52971155    714.93992773 ...,  16278.4598059
   2998.14406953  13664.69463848]
MSE:  17564999.8526
MAE:  2050.96271034
Explained Variance Score:  0.782042336383
R^2 Regression Score:  0.782040335596
Cross Validation Score: 
[ 0.77506539  0.767817    0.753228    0.76759303  0.74921984]
Computation time:  4.481158999999934


[Ridge]
Start fitting...
Start predicting...
Calculating scores...
y prediction:  [  5761.21802402    592.2912601     801.60030088 ...,  16277.36860182
   2979.68474947  13659.73921521]
MSE:  17481014.2717
MAE:  2050.7540079
Explained Variance Score:  0.78308415697
R^2 Regression Score:  0.783082491541
Cross Validation Score: 
[ 0.77689396  0.77095688  0.75450311  0.76713391  0.75143845]
Computation time:  16.414815999999973


[Lasso]
Start fitting...
Start predicting...
