In [159]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression , Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Import any additional modules and start coding below

In [160]:
# pre-processing the data
rental_info = pd.read_csv("rental_info.csv")

# converting the object to datetime
rental_info["rental_date"] = pd.to_datetime(rental_info["rental_date"])
rental_info["return_date"] = pd.to_datetime(rental_info["return_date"])
number_days = rental_info["return_date"] - rental_info["rental_date"]
# Adding days to the dataframe 
rental_info["rental_length_days"] = number_days.dt.days


In [161]:
# Dealing with categorical features
rental_info["deleted_scenes"]= np.where((rental_info["special_features"].str.contains("Deleted Scenes")),1,0)

rental_info["behind_the_scenes"]= np.where((rental_info["special_features"].str.contains("Behind the Scenes")),1,0)

In [162]:
# specifying target and feature value

X = rental_info.drop(["rental_date", "return_date", "special_features", "rental_length_days"], axis = 1)
y = rental_info["rental_length_days"]

In [163]:
# spliting data into test and train
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=9)

In [164]:
# keeping appropariate features --> feature selection

#first using grid search I find the best alpha to describe the target using lasso
#then I can find most important features based on coef > 0
params = {"alpha":np.arange(0.00001, 10, 500)}
KF = KFold(n_splits= 5, random_state=9, shuffle=True)

lasso = Lasso()

grid_search = GridSearchCV(lasso,param_grid=params,cv=KF)

grid_search.fit(X,y)
print(f"best param for Lasso" ,grid_search.best_params_)


best param for Lasso {'alpha': 1e-05}


In [165]:
# then using that alpha we claculate coef of lasso to find the features importance in describing the target y
lasso1 = Lasso(alpha=0.00001)
lasso1.fit(X_train, y_train)
lasso1_coef = lasso1.coef_
# coef over 0 indicates contribution to the performance
# Storing selected features for model selection
X_lasso_train, X_lasso_test = X_train.iloc[:,lasso1_coef > 0], X_test.iloc[:, lasso1_coef > 0]

finalTest = {}
# lin_reg on lasso chosen regression
lin_reg = LinearRegression()
lin_reg.fit(X_lasso_train, y_train)
y_test_pred  = lin_reg.predict(X_lasso_test)

mse_lin_reg = mean_squared_error(y_test_pred,y_test)
finalTest["LinearRegression"] = mse_lin_reg

In [166]:
# creating pipeline for other regression models
models = {
          "DecisionTreeRegressor" : DecisionTreeRegressor(),
          "RandomForestRegressor" : RandomForestRegressor()
         }
param_grid = {
    "DecisionTreeRegressor" : {"DecisionTreeRegressor__max_depth" : range(1,10)},
    "RandomForestRegressor":{  "RandomForestRegressor__max_depth": range(1,10), 
                             "RandomForestRegressor__n_estimators": np.arange(1,101,1)}
}
param_dist = {
    "DecisionTreeRegressor" : {"DecisionTreeRegressor__max_depth" : np.arange(1,11,1)},
    "RandomForestRegressor":{  "RandomForestRegressor__max_depth": np.arange(1,11,1), 
                             "RandomForestRegressor__n_estimators": np.arange(1,101,1)}
}


model_names = {}
model_params = {}
stored_models = {}
for name, model in models.items():
    pipeline = Pipeline(steps = [("scaler", StandardScaler()), (name, model)]) 
    
    Grid_search_m = RandomizedSearchCV(pipeline, param_dist[name], cv=5,random_state=9 , scoring="neg_mean_squared_error") 
    
    stored_models[name] = Grid_search_m.fit(X_train, y_train)
    model_params[name] = Grid_search_m.best_params_
    model_names[name] = Grid_search_m
    

In [167]:
# Finding the best MSE score in models on test data

for (name, model) in models.items():
    y_test_pred = stored_models[name].predict(X_test)
    finalTest[name] = mean_squared_error(y_test_pred, y_test)
    
finalTest

{'LinearRegression': 4.846670478675528,
 'DecisionTreeRegressor': 2.4516533842954953,
 'RandomForestRegressor': 2.224122180337045}

In [168]:
best_model = min(finalTest, key= finalTest.get)
best_mse = min(finalTest)
