In [22]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


In [8]:
# Location of the aggregated January trip records.  The path can be overridden
# through the TRIPS_DATA_PATH environment variable so the notebook is not tied
# to a single machine; the original local path remains the fallback.
DATA_PATH = os.environ.get(
    "TRIPS_DATA_PATH",
    "C:/Users/Owner/Downloads/tripsdata_jan_aggregated.csv",
)
df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,tip_amount,tip_ratio,trip_duration_sec,additional_charges,DailyAvgTemp,DailyAvgHumidity,AvgWindspeed,DailyPrecipitation,avg_income_borough
0,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,2,17.7,0.0,0.0,1188.0,5.0,42.2,61.8,6.6,0.0,121549
1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1,10.0,3.75,0.375,396.0,5.0,42.2,61.8,6.6,0.0,121549
2,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1,23.3,3.0,0.128755,1075.0,5.0,42.2,61.8,6.6,0.0,121549
3,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1,10.0,2.0,0.2,498.0,5.0,42.2,61.8,6.6,0.0,121549
4,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1,7.9,3.2,0.405063,366.0,5.0,42.2,61.8,6.6,0.0,121549


In [9]:
# Summarise the loaded frame (row count, column dtypes, memory usage)
# before any columns are dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713438 entries, 0 to 2713437
Data columns (total 15 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   tpep_pickup_datetime   object 
 1   tpep_dropoff_datetime  object 
 2   passenger_count        float64
 3   trip_distance          float64
 4   payment_type           int64  
 5   fare_amount            float64
 6   tip_amount             float64
 7   tip_ratio              float64
 8   trip_duration_sec      float64
 9   additional_charges     float64
 10  DailyAvgTemp           float64
 11  DailyAvgHumidity       float64
 12  AvgWindspeed           float64
 13  DailyPrecipitation     float64
 14  avg_income_borough     int64  
dtypes: float64(11), int64(2), object(2)
memory usage: 310.5+ MB


In [10]:
# Split data into features and target variable.
# tip_amount is removed because the target is derived from it (in the sample
# rows tip_ratio equals tip_amount / fare_amount), and the two timestamp
# columns are raw strings that the regressors cannot consume.
# Reassigning with errors='ignore' (instead of inplace=True) lets this cell be
# re-run without raising KeyError once the columns are already gone.
df = df.drop(
    columns=['tip_amount', 'tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    errors='ignore',
)

X = df.drop(columns=['tip_ratio'])
y = df['tip_ratio']

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Feature selection using Lasso: keep every column whose fitted coefficient
# is non-zero.
# NOTE(review): the features are not standardised before this fit, so the
# penalty acts unevenly on columns with very different scales - confirm this
# is intended.
lasso_model = Lasso(alpha=0.01).fit(X_train, y_train)
nonzero_mask = lasso_model.coef_ != 0
selected_features_lasso = X.columns[nonzero_mask]
selected_features_lasso

Index(['payment_type', 'fare_amount', 'trip_duration_sec',
       'additional_charges', 'avg_income_borough'],
      dtype='object')

In [13]:
# Alternative (disabled): select features from the Lasso coefficients with SelectFromModel
# sfm = SelectFromModel(lasso_model, threshold=0.01)
# sfm.fit(X_train, y_train)

In [14]:
# Restrict the training and testing sets to the columns Lasso retained
X_train_lasso, X_test_lasso = (
    split[selected_features_lasso] for split in (X_train, X_test)
)

In [15]:
# Linear Regression trained only on the Lasso-selected columns
linear_reg_model_selected = LinearRegression().fit(X_train_lasso, y_train)
linear_reg_model_selected

In [16]:
# Baseline: Linear Regression trained on every feature column
linear_reg_model_full = LinearRegression().fit(X_train, y_train)
linear_reg_model_full

In [40]:
# Random Forest without the Lasso pre-selection step.  The forest is still
# used as a selector here: SelectFromModel keeps the columns of the full
# feature set whose importance reaches the 0.01 threshold.
rf_model = RandomForestRegressor(n_estimators=10, random_state=42)
sfm_rf = SelectFromModel(rf_model, threshold=0.01).fit(X_train, y_train)
X_train_rf, X_test_rf = sfm_rf.transform(X_train), sfm_rf.transform(X_test)


In [41]:
# Train the Random Forest on the importance-filtered training matrix
rf_model.fit(X=X_train_rf, y=y_train)

In [35]:
# Random Forest Regression on top of the Lasso-selected features: keep only
# the Lasso columns whose forest importance reaches the 0.01 threshold.
rf_model_lasso = RandomForestRegressor(n_estimators=10, random_state=42)
# Use this cell's own estimator as the selector template rather than
# `rf_model` from a different cell, so the cell does not depend on that cell
# having been executed.  SelectFromModel fits a clone of the estimator it is
# given, so `rf_model_lasso` itself stays unfitted until it is trained
# explicitly.
sfm_rf_lasso = SelectFromModel(rf_model_lasso, threshold=0.01)
sfm_rf_lasso.fit(X_train_lasso, y_train)
X_train_rf_lasso = sfm_rf_lasso.transform(X_train_lasso)
X_test_rf_lasso = sfm_rf_lasso.transform(X_test_lasso)


In [37]:
# Train the second Random Forest on the Lasso- and importance-filtered matrix
rf_model_lasso.fit(X=X_train_rf_lasso, y=y_train)

In [101]:
# Fit a Linear Regression model using the Lasso-selected features.
# The estimator is created here because no other visible cell defines
# `linear_reg_model`; without this the cell fails on a fresh kernel.
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train_lasso, y_train)

# Evaluate on the same selected features.  RMSE is computed inline because
# `evaluate_model` is defined in a later cell and returns four values, which a
# two-name unpacking cannot receive.
train_rmse_lasso = mean_squared_error(y_train, linear_reg_model.predict(X_train_lasso)) ** 0.5
test_rmse_lasso = mean_squared_error(y_test, linear_reg_model.predict(X_test_lasso)) ** 0.5

In [23]:
# Evaluate Models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score a fitted regressor on the training and test splits.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, X_test
        Feature matrices handed to ``model.predict``.
    y_train, y_test
        True target values matching ``X_train`` / ``X_test``.

    Returns
    -------
    tuple
        ``(train_rmse, test_rmse, train_r2, test_r2)``.
    """

    def _score(features, target):
        """Return ``(rmse, r2)`` of the model's predictions for one split."""
        predicted = model.predict(features)
        # Take the square root of the MSE explicitly instead of passing
        # squared=False: that keyword is deprecated in scikit-learn 1.4 and
        # removed in 1.6.  For a single target the result is the same.
        rmse = mean_squared_error(target, predicted) ** 0.5
        return rmse, r2_score(target, predicted)

    train_rmse, train_r2 = _score(X_train, y_train)
    test_rmse, test_r2 = _score(X_test, y_test)
    return train_rmse, test_rmse, train_r2, test_r2

# Score the linear model trained on the Lasso-selected columns
train_rmse_lasso, test_rmse_lasso, train_r2_lasso, test_r2_lasso = evaluate_model(
    linear_reg_model_selected, X_train_lasso, X_test_lasso, y_train, y_test
)
print(
    "Linear Regression with Lasso Feature Selection:",
    f"Train RMSE: {train_rmse_lasso}",
    f"Test RMSE: {test_rmse_lasso}",
    f"Train R2-Score: {train_r2_lasso}",
    f"Test R2-Score: {test_r2_lasso}",
    sep="\n",
)

# Score the linear model trained on every feature column
train_rmse_full, test_rmse_full, train_r2_full, test_r2_full = evaluate_model(
    linear_reg_model_full, X_train, X_test, y_train, y_test
)
print(
    "\nLinear Regression without Feature Selection:",
    f"Train RMSE: {train_rmse_full}",
    f"Test RMSE: {test_rmse_full}",
    f"Train R2-Score: {train_r2_full}",
    f"Test R2-Score: {test_r2_full}",
    sep="\n",
)


Linear Regression with Lasso Feature Selection:
Train RMSE: 0.09806526710844032
Test RMSE: 0.09816917755189361
Train R2-Score: 0.4726056413430354
Test R2-Score: 0.4709899116217966

Linear Regression without Feature Selection:
Train RMSE: 0.09801447443335352
Test RMSE: 0.09811728022933713
Train R2-Score: 0.47315182520119714
Test R2-Score: 0.471549088158001


In [43]:
# Score the Random Forest trained without Lasso pre-selection.  Its inputs are
# the SelectFromModel-transformed matrices, and the *_full names are reused,
# so the linear-regression scores stored under them are overwritten.
train_rmse_full, test_rmse_full, train_r2_full, test_r2_full = evaluate_model(
    rf_model, X_train_rf, X_test_rf, y_train, y_test
)
print(
    "\nRandom Forest Regression without feature selection:",
    f"Train RMSE: {train_rmse_full}",
    f"Test RMSE: {test_rmse_full}",
    f"Train R2-Score: {train_r2_full}",
    f"Test R2-Score: {test_r2_full}",
    sep="\n",
)



Random Forest Regression without feature selection:
Train RMSE: 0.040936841406839036
Test RMSE: 0.09406167836542713
Train R2-Score: 0.9080961490177383
Test R2-Score: 0.5143324392699882


In [44]:
# Score the Random Forest trained on the Lasso-selected features.  The *_lasso
# names are reused, so the linear-regression scores stored under them are
# overwritten.
train_rmse_lasso, test_rmse_lasso, train_r2_lasso, test_r2_lasso = evaluate_model(
    rf_model_lasso, X_train_rf_lasso, X_test_rf_lasso, y_train, y_test
)
print(
    "\nRandom Forest Regression with feature selection:",
    f"Train RMSE: {train_rmse_lasso}",
    f"Test RMSE: {test_rmse_lasso}",
    f"Train R2-Score: {train_r2_lasso}",
    f"Test R2-Score: {test_r2_lasso}",
    sep="\n",
)



Random Forest Regression with feature selection:
Train RMSE: 0.07950652131108234
Test RMSE: 0.08795268318614785
Train R2-Score: 0.6533345560899135
Test R2-Score: 0.5753688642464024
