### Preprocessing

In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib as plt
from matplotlib import pyplot
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, Ridge
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC

In [None]:
# Read only the first 2,000,000 rows of the training CSV, then preview the top of the frame.
data = pd.read_csv('train.csv', nrows=2_000_000)
data.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [None]:
def fill_nans(data: pd.DataFrame) -> pd.DataFrame:
  """Fill missing drop-off coordinates with the per-column mean.

  Only ``dropoff_longitude`` and ``dropoff_latitude`` are imputed; every other
  column is passed through unchanged.

  Args:
    data: Frame containing the two drop-off coordinate columns.

  Returns:
    A new frame with the gaps filled. The input frame is left unmodified.
  """
  dropoff_cols = ['dropoff_longitude', 'dropoff_latitude']

  # Work on a copy: writing into the caller's frame makes the calling cell
  # non-idempotent and hides the raw values from any earlier reference to it.
  filled = data.copy()
  filled[dropoff_cols] = filled[dropoff_cols].fillna(filled[dropoff_cols].mean())

  return filled

In [None]:
from scipy import stats

def drop_outliers(data, z_threshold=3.0):
  """Remove rows with extreme fares/passenger counts or impossible coordinates.

  Two filters are applied in order:

  1. Z-score filter on ``fare_amount`` (only when that column is present) and
     ``passenger_count``: rows whose absolute z-score is not below
     ``z_threshold`` are removed. The columns are processed one after another,
     so the second z-score is computed on the rows that survived the first.
  2. Range filter: rows whose pickup or drop-off latitude lies outside
     [-90, 90], or whose longitude lies outside [-180, 180], are removed.

  Args:
    data: Frame with ``passenger_count`` and the four pickup/drop-off
      coordinate columns; ``fare_amount`` is optional.
    z_threshold: Absolute z-score at or above which a row is discarded.

  Returns:
    A new, independent frame containing the surviving rows. The input frame
    is not modified.
  """
  if 'fare_amount' in data.columns:
    cols = ['fare_amount', 'passenger_count']
  else:
    cols = ['passenger_count']

  for col in cols:
    values = data[col]
    # Population standard deviation (ddof=0); pandas skips NaN here, so one
    # missing value no longer turns every z-score into NaN.
    spread = values.std(ddof=0)
    if not spread > 0:
      # Constant (or empty) column: z-scores are undefined, so there is
      # nothing to flag. Dividing by zero would otherwise discard every row.
      continue
    z_scores = (values - values.mean()) / spread
    # Rows with a missing value get a NaN z-score, fail the comparison and
    # are dropped, since they cannot be judged.
    data = data[(z_scores.abs() < z_threshold).to_numpy()]

  def out_of_range(column, limit):
    # NaN compares False on both sides, so missing coordinates are not flagged.
    return (data[column] < -limit) | (data[column] > limit)

  invalid = (
    out_of_range('pickup_latitude', 90)
    | out_of_range('dropoff_latitude', 90)
    | out_of_range('pickup_longitude', 180)
    | out_of_range('dropoff_longitude', 180)
  )

  # Filter by position with a boolean mask instead of drop(..., inplace=True):
  # no SettingWithCopyWarning, and duplicated index labels cannot remove
  # rows that were not flagged.
  return data[~invalid.to_numpy()].copy()

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
  """Great-circle distance between two points using the haversine formula.

  All four arguments are angles in degrees and may be scalars, NumPy arrays
  or pandas Series (element-wise, with the usual broadcasting).

  Args:
    lat1: Latitude of the first point.
    lon1: Longitude of the first point.
    lat2: Latitude of the second point.
    lon2: Longitude of the second point.

  Returns:
    The distance in kilometres (sphere of radius 6371 km), rounded to three
    decimals, with the same container type as the inputs.
  """
  R = 6371.0  # mean Earth radius, km

  lat1, lon1, lat2, lon2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

  dlon = lon2 - lon1
  dlat = lat2 - lat1

  a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
  # Rounding error can push `a` fractionally outside [0, 1] (e.g. for
  # near-antipodal points), which would make sqrt(1 - a) NaN. Clamp it.
  a = np.minimum(1.0, np.maximum(0.0, a))
  c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
  distance = R * c

  # np.round handles scalars, ndarrays and Series alike.
  return np.round(distance, 3)

In [None]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
  """Derive time and distance features and drop the raw columns they replace.

  Adds ``year``, ``month`` and ``hour`` (taken from ``pickup_datetime``) and
  ``distance`` (haversine distance between the pickup and drop-off points),
  then removes ``pickup_datetime`` and the four coordinate columns.

  Args:
    data: Frame with ``pickup_datetime`` and the pickup/drop-off latitude
      and longitude columns.

  Returns:
    A new frame with the derived columns appended in the order
    ``year, month, hour, distance``. The input frame is not modified.
  """
  coordinate_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

  pickup_time = pd.to_datetime(data['pickup_datetime'])
  distance = haversine_distance(
    data['pickup_latitude'],
    data['pickup_longitude'],
    data['dropoff_latitude'],
    data['dropoff_longitude'],
  )

  # Build a new frame instead of assigning/dropping in place: the argument is
  # often a filtered slice (SettingWithCopyWarning), and mutating it would
  # break a second run of the same cell once `pickup_datetime` is gone.
  return (
    data
    .drop(columns=['pickup_datetime'] + coordinate_cols)
    .assign(
      year=pickup_time.dt.year,
      month=pickup_time.dt.month,
      hour=pickup_time.dt.hour,
      distance=distance,
    )
  )

In [None]:
def preprocessing(data: pd.DataFrame) -> pd.DataFrame:
  """Run the cleaning chain: impute gaps, drop outliers, derive features.

  Args:
    data: Raw frame as loaded from the CSV.

  Returns:
    The frame produced by the last stage.
  """
  # Each stage takes a frame and returns a frame; order matters.
  for stage in (fill_nans, drop_outliers, feature_engineering):
    data = stage(data)

  return data

In [None]:
# Run the full chain (imputation -> outlier removal -> feature engineering) on the loaded frame.
# NOTE(review): this rebinds `data` to the processed frame, so the cell is not idempotent;
# re-run the loading cell before running it again.
data = preprocessing(data)

In [None]:
# Preview the first rows of the preprocessed frame.
data.head()

Unnamed: 0,key,fare_amount,passenger_count,year,month,hour,distance
0,2009-06-15 17:26:21.0000001,4.5,1,2009,6,17,1.031
1,2010-01-05 16:52:16.0000002,16.9,1,2010,1,16,8.45
2,2011-08-18 00:35:00.00000049,5.7,2,2011,8,0,1.39
3,2012-04-21 04:30:42.0000001,7.7,1,2012,4,4,2.799
4,2010-03-09 07:51:00.000000135,5.3,1,2010,3,7,1.999


### Linear regression

In [None]:
# Target is the fare; features are every remaining column except the `key` identifier.
y = data['fare_amount']
X = data.drop(columns=['key', 'fare_amount'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: standardize the features, then fit ordinary least squares.

# NOTE(review): later cells call `pipe.fit(...)` on this same object, which overwrites the baseline fit.
pipe = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])

pipe.fit(X_train, y_train)
ypred_train = pipe.predict(X_train)
ypred_test = pipe.predict(X_test)

# RMSE on the held-out split; the print below recomputes the same quantity for test and train.
rmse_test = np.sqrt(mean_squared_error(y_test, ypred_test))
print(f'Test: {mean_squared_error(ypred_test, y_test) ** 0.5}\nTrain:{mean_squared_error(ypred_train, y_train) ** 0.5}')

Test: 6.356712257429106
Train:6.372817140701533


#### Lasso

In [None]:
# Fit a 5-fold cross-validated Lasso and keep the features whose coefficient is non-zero.
# NOTE(review): the model is fit on the raw (unscaled) columns, unlike the scaled baseline
# pipeline, so the L1 penalty depends on each feature's units — confirm this is intended.
lasso = LassoCV(cv=5).fit(X_train, y_train)

# Positional indices of the surviving features, their names and their weights.
lasso_selected_features = np.flatnonzero(lasso.coef_)
selected_features_names_lasso = X_train.columns[lasso_selected_features]
lasso_coefficients = lasso.coef_[lasso_selected_features]

print("Selected feature names (Lasso):", selected_features_names_lasso.tolist())
print("Corresponding coefficients:", lasso_coefficients.tolist())

Selected feature names (Lasso): ['passenger_count', 'year', 'month', 'hour', 'distance']
Corresponding coefficients: [0.06983401582096793, 0.44264751674439257, 0.07588097342471851, -0.02007818516737368, 7.246054274643282e-05]


In [None]:
# Restrict both splits to the columns (by position) that Lasso kept.
X_train_lasso, X_test_lasso = (
    split.iloc[:, lasso_selected_features] for split in (X_train, X_test)
)

In [None]:
# Refit the shared scaler + linear-regression pipeline on the Lasso-selected columns
# and score it on the held-out split.
y_pred_test_lasso = pipe.fit(X_train_lasso, y_train).predict(X_test_lasso)
rmse_test_lasso = np.sqrt(
    mean_squared_error(y_test, y_pred_test_lasso)
)
print("\nValidation RMSE (Lasso):", rmse_test_lasso)


Validation RMSE (Lasso): 6.356712257429106


#### ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

# Fit an ElasticNet with a fixed penalty (alpha=0.01, 5% L1 / 95% L2) and keep the
# features whose coefficient is non-zero.
# NOTE(review): fit on the raw (unscaled) columns, unlike the scaled baseline pipeline — confirm.
elasticnet = ElasticNet(alpha=0.01, l1_ratio=0.05).fit(X_train, y_train)

# Positional indices of the surviving features, their names and their weights.
elasticnet_selected_features = np.flatnonzero(elasticnet.coef_)
selected_features_names_elasticnet = X_train.columns[elasticnet_selected_features]
elasticnet_coefficients = elasticnet.coef_[elasticnet_selected_features]

print("Selected feature names (ElasticNet):", selected_features_names_elasticnet.tolist())
print("Corresponding coefficients:", elasticnet_coefficients.tolist())

Selected feature names (ElasticNet): ['passenger_count', 'year', 'month', 'hour', 'distance']
Corresponding coefficients: [0.08099535488033396, 0.44649503491311165, 0.07733328141857529, -0.020467903703535937, 7.219047230642627e-05]


In [None]:
# Restrict both splits to the columns (by position) that ElasticNet kept.
X_train_elasticnet, X_test_elasticnet = (
    split.iloc[:, elasticnet_selected_features] for split in (X_train, X_test)
)

In [None]:
# Refit the shared scaler + linear-regression pipeline on the ElasticNet-selected columns
# and score it on the held-out split.
y_pred_test_elasticnet = pipe.fit(X_train_elasticnet, y_train).predict(X_test_elasticnet)
rmse_test_elasticnet = np.sqrt(
    mean_squared_error(y_test, y_pred_test_elasticnet)
)
print("\nValidation RMSE (ElasticNet):", rmse_test_elasticnet)


Validation RMSE (ElasticNet): 6.356712257429106
