In [None]:
import pandas as pd
from math import sqrt
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training table.
train_df = pd.read_csv('train.csv')

# Positional 80/20 hold-out: the leading rows are used for fitting and the
# trailing rows are kept back for evaluation (row order is used as-is; no
# shuffling is applied here).
train_size = int(0.8 * len(train_df))
train_data = train_df.iloc[:train_size]
test_data = train_df.iloc[train_size:]

# 'fare' is the regression target; 'total_fare' is dropped alongside it so it
# is not offered to the model as a feature.
y_train = train_data['fare']
y_test = test_data['fare']
X_train = train_data.drop(columns=['total_fare', 'fare'])
X_test = test_data.drop(columns=['total_fare', 'fare'])

In [None]:
# Candidate regressors keyed by the name used in the results report.
models = {
    'LightGBM': lgb.LGBMRegressor(
        learning_rate=0.1,
        reg_alpha=2,
        reg_lambda=0.5,
    ),
}

# model name -> {'RMSE': hold-out root-mean-squared error}
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Predictions are made non-negative by taking the absolute value before
    # scoring.
    # NOTE(review): abs() mirrors a negative prediction to the positive side
    # rather than clipping it at zero -- confirm this is intended.
    y_pred = abs(model.predict(X_test))
    rmse = sqrt(mean_squared_error(y_test, y_pred))

    results[model_name] = {'RMSE': rmse}

# The winner is the candidate with the smallest hold-out RMSE; `best_model`
# holds its name (a key of `models`), not the estimator.
best_model = min(results, key=lambda name: results[name]['RMSE'])
print('Best Model:', best_model)
print('RMSE:', results[best_model]['RMSE'])

Best Model: LightGBM
RMSE: 33.92262842364048


In [None]:
test_df = pd.read_csv('test.csv')
# Score on the same feature columns used for fitting: 'fare' and 'total_fare'
# are removed here and written back below from the model's predictions.
X_final_test = test_df.drop(['total_fare', 'fare'], axis=1)

# `best_model` arrives as the winning key of `models`; swap it for the
# estimator itself. The guard keeps this cell re-runnable: on a second run the
# name already holds the estimator, and using it to index `models` would fail.
if isinstance(best_model, str):
    best_model = models[best_model]
best_model.fit(X_train, y_train)
fare_predictions = best_model.predict(X_final_test)

# Make predictions non-negative, the same way the hold-out predictions were
# treated before scoring.
fare_predictions = abs(fare_predictions)
test_df['fare'] = fare_predictions

In [None]:
# Total fare = predicted fare + tip + miscellaneous fees, summed left to right.
test_df['total_fare'] = (
    test_df['fare']
    .add(test_df['tip'])
    .add(test_df['miscellaneous_fees'])
)

In [None]:
# Display the test frame, which now carries the predicted 'fare' and the
# recomputed 'total_fare' columns.
test_df

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
0,1076.0,4.18,1.0,97.735307,0,13.500,111.235307,0
1,429.0,1.48,4.0,48.421578,0,13.500,61.921578,0
2,856.0,4.15,1.0,86.644780,24,6.000,116.644780,0
3,622.0,3.22,1.0,69.277001,15,5.625,89.902001,0
4,507.0,3.98,1.0,109.428009,0,2.250,111.678009,0
...,...,...,...,...,...,...,...,...
89856,435.0,2.24,1.0,52.761079,13,13.700,79.461079,0
89857,519.0,2.61,1.0,59.704594,7,13.850,80.554594,0
89858,450.0,2.24,1.0,53.670001,0,26.625,80.295001,1
89859,919.0,4.12,1.0,89.520957,25,30.200,144.720957,1


In [None]:
# Keep only the recomputed total as the series to export, then display it.
final = test_df.loc[:, 'total_fare']
final

0        111.235307
1         61.921578
2        116.644780
3         89.902001
4        111.678009
            ...    
89856     79.461079
89857     80.554594
89858     80.295001
89859    144.720957
89860     92.084162
Name: total_fare, Length: 89861, dtype: float64

In [None]:
# Write the total-fare series to 'lastpred.csv'; index=False omits the row
# index from the file.
final.to_csv('lastpred.csv',index=False)

In [None]:
#The End