In [None]:
import pandas as pd
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.exceptions import ConvergenceWarning

# Ignore UserWarning messages raised from sklearn modules for the rest of the
# session so they do not clutter the cell outputs.
# NOTE(review): this filter is broad and will also hide legitimate sklearn
# warnings — consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Load the data; the first CSV column is used as the DataFrame index.
df = pd.read_csv('data/get_around_pricing_project.csv', index_col=0)

# Preview the first rows. Ending the cell with the bare expression uses the
# notebook's rich table rendering instead of print()'s plain-text dump.
df.head()

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own: wrapping it in print() appends a stray "None".
df.info()

In [None]:
# Per-column count of missing entries, expressed as a percentage of all rows.
print("Percentage of missing values: ")
display(100 * df.isna().sum() / len(df))

In [None]:
# Target: the daily rental price column.
y = df['rental_price_per_day']

# Features: every remaining column.
X = df.drop(columns='rental_price_per_day')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# Split the feature columns by dtype: int64/float64 columns are handled as
# numerical, object/bool columns as categorical.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Numerical branch: fill gaps with the column mean, then apply StandardScaler.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode while dropping the first level of each feature.
# NOTE(review): combining drop='first' with handle_unknown='ignore' appears to
# make unseen categories encode like the dropped level, and is rejected by
# older scikit-learn releases — confirm against the installed version.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("numerical_transformer", numerical_transformer, numerical_features),
    ("categorical_transformer", categorical_transformer, categorical_features),
])

# Fit on the training split only, then reuse the fitted transformer on the
# test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [None]:
# End-to-end model: the shared preprocessing step feeds a linear regression.
model = Pipeline([
    ("preprocessing", preprocessor),
    ("Regressor", LinearRegression()),
])

# Fit on the raw training features; the pipeline runs the preprocessing step
# itself before the regressor.
model.fit(X_train, y_train)

In [None]:
# Predictions on train and test set
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Print R^2 scores for both splits so they can be compared side by side.
# (A dangling, incomplete `model.` expression that used to end this cell was a
# SyntaxError and prevented the whole cell from running; it has been removed.)
print("R2 score on training set : ", r2_score(y_train, y_train_pred))
print("R2 score on test set : ", r2_score(y_test, y_test_pred))