In [90]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib
#Please enter data to predict a ticket price. To check accuracy, you may take data from https://venta.renfe.com/
#Example: MADRID,SEVILLA,2019-05-29 06:20:00,2019-05-29 09:16:00,   AV City,38.55,Turista,Promo 
#NOTE: the algorithm is case-sensitive, so please pay close attention to the letter case of the data you enter
# One journey to score, given as a single record whose values line up
# positionally with the column labels below.
X = pd.DataFrame(
    data=[
        ('MADRID', 'SEVILLA', '2019-08-16 08:00:00', '2019-08-16 10:32:00', 'AVE', 76.30, 'Turista', 'Flexible'),
    ],
    columns=[
        'origin',
        'destination',
        'start_date',
        'end_date',
        'train_type',
        'price',
        'train_class',
        'fare',
    ],
)
# Bare expression so the notebook renders the frame.
X

Unnamed: 0,origin,destination,start_date,end_date,train_type,price,train_class,fare
0,MADRID,SEVILLA,2019-08-16 08:00:00,2019-08-16 10:32:00,AVE,76.3,Turista,Flexible


In [91]:
file_name = "LinearRegression.sav"

# Split the target from the features: rows without a price are discarded,
# the price column becomes `y`, and `X` keeps only the predictors.
X = X.dropna(subset=['price'])
y = X['price']
X = X.drop(columns=['price'])

# Expand each timestamp into its calendar/clock components. The order in
# which the columns are created is kept exactly as before, because the
# resulting column order is what the downstream encoder/model receive.
date_cols = ['start_date', 'end_date']
for col in date_cols:
    date_col = pd.to_datetime(X[col])
    X[col] = date_col
    X[col + '_hour'] = date_col.dt.hour
    X[col + '_minute'] = date_col.dt.minute
    X[col + '_second'] = date_col.dt.second
    # `dt.weekday_name` was removed in pandas 1.0; `dt.day_name()` yields the
    # same English day names ('Monday', 'Tuesday', ...).
    X[col + '_weekday'] = date_col.dt.day_name()
    X[col + '_day'] = date_col.dt.day
    X[col + '_month'] = date_col.dt.month
    X[col + '_year'] = date_col.dt.year

# Journey length in seconds, computed with the vectorised timedelta accessor
# instead of a per-row Python lambda.
X['duration'] = (X['end_date'] - X['start_date']).dt.total_seconds()

# The raw timestamps are no longer needed once their parts are extracted.
X = X.drop(columns=date_cols)

In [92]:
from sklearn.preprocessing import OneHotEncoder

# Restore the previously fitted one-hot encoder from disk.
# NOTE: joblib.load unpickles the file, so only load files you trust.
OHE_file_path = 'OH_encoder.sav'
OH_encoder = joblib.load(OHE_file_path)

# Every object-dtype column is treated as categorical.
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

# Encode the categorical columns and give the result the same row labels as X
# so the concat below aligns row by row.
OH_object = pd.DataFrame(OH_encoder.transform(X[categorical_cols]))
OH_object.index = X.index

# Numerical features are whatever is left once the categoricals are removed.
num_X = X.drop(columns=categorical_cols)

# Final feature frame: numerical columns followed by the one-hot columns.
OH_X = pd.concat([num_X, OH_object], axis=1)

In [93]:
# Render the assembled feature frame (bare expression -> notebook rich display).
OH_X

Unnamed: 0,start_date_hour,start_date_minute,start_date_second,start_date_day,start_date_month,start_date_year,end_date_hour,end_date_minute,end_date_second,end_date_day,...,42,43,44,45,46,47,48,49,50,51
0,8,0,0,16,8,2019,10,32,0,16,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
from sklearn.metrics import mean_absolute_error
# Restore the persisted regression model.
# NOTE: joblib.load unpickles the file, so only load files you trust.
linear_regression_path = 'LinearRegression.sav'
loaded_model = joblib.load(linear_regression_path)

# Predict a price for every row of the feature frame and report how far the
# predictions are from the prices that were entered.
preds = loaded_model.predict(OH_X)
print("MAE is {}".format(mean_absolute_error(y_true=y, y_pred=preds)))

# Persist the predictions (keyed by row label) and the exact model input.
output = pd.DataFrame(
    data={
        'Id': OH_X.index,
        'SalePrice': preds,
    }
)
output.to_csv('predicted_prices.csv', index=False)
OH_X.to_csv('input.csv')

# Echo the raw prediction array.
print(preds)

MAE is 2.275097656249997
[74.02490234]


In [95]:
# Render the feature frame that was passed to the model's predict call.
OH_X

Unnamed: 0,start_date_hour,start_date_minute,start_date_second,start_date_day,start_date_month,start_date_year,end_date_hour,end_date_minute,end_date_second,end_date_day,...,42,43,44,45,46,47,48,49,50,51
0,8,0,0,16,8,2019,10,32,0,16,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
