In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
import os
plt.style.use("fivethirtyeight")
import warnings
warnings.filterwarnings("ignore")

# Preparing Data

## Create SparkSession

In [None]:
# Create (or reuse) the Spark session named "NYC_ETA"; every Spark DataFrame below is built from it.
spark = SparkSession.builder.appName("NYC_ETA").getOrCreate()

## Read Parquet Files

In [None]:
# Merge the three monthly yellow-taxi parquet files (Jul-Sep 2024) into one DataFrame.

jul_data_path = "../data/yellow_tripdata_2024-07.parquet"
aug_data_path = "../data/yellow_tripdata_2024-08.parquet"
sep_data_path = "../data/yellow_tripdata_2024-09.parquet"
df_jul = spark.read.parquet(jul_data_path)
df_aug = spark.read.parquet(aug_data_path)
df_sep = spark.read.parquet(sep_data_path)

# union() pairs columns by position, so a monthly file with a different column
# order would be merged silently into the wrong columns. unionByName() pairs
# columns by name and fails loudly if a column is missing.
df = df_jul.unionByName(df_aug).unionByName(df_sep)

output_path = "../data/merged_yellow_tripdata_2024_Q3.parquet"
# Uncomment to persist the merged data set:
# df.write.parquet(output_path)

df.show(5)


In [None]:
# Print the column names and types of the merged trip data.
df.printSchema()

![data_schema.png](./data_schema.png)

### Merge Taxi Zone Data

In [None]:
# Load the taxi-zone lookup CSV: the first row is the header and column types are inferred.
zones_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/taxi_zone_lookup.csv")
)
zones_df.show(5)

#### Merge **zones_df** with **df** by **Pick Up Location**

In [None]:
# Attach zone attributes for the pickup location (PULocationID -> LocationID).
pickup_match = df.PULocationID == zones_df.LocationID
df = df.join(zones_df, on=pickup_match, how="inner")

# Prefix the lookup columns with PU_ so they are identifiable as pickup attributes.
for lookup_name, pickup_name in [
    ("Borough", "PU_Borough"),
    ("Zone", "PU_Zone"),
    ("service_zone", "PU_service_zone"),
]:
    df = df.withColumnRenamed(lookup_name, pickup_name)

# The lookup's join key duplicates PULocationID, so it is not kept.
df = df.drop("LocationID")

df.show(5)

#### Merge **zones_df** with **df** by **Drop Off Location**

In [None]:
# Attach zone attributes for the drop-off location (DOLocationID -> LocationID).
dropoff_match = df.DOLocationID == zones_df.LocationID
df = df.join(zones_df, on=dropoff_match, how="inner")

# Prefix the lookup columns with DO_ so they are identifiable as drop-off attributes.
dropoff_renames = {
    "Borough": "DO_Borough",
    "Zone": "DO_Zone",
    "service_zone": "DO_service_zone",
}
for lookup_name, dropoff_name in dropoff_renames.items():
    df = df.withColumnRenamed(lookup_name, dropoff_name)

# The lookup's join key duplicates DOLocationID, so it is not kept.
df = df.drop("LocationID")

df.show(5)

#### Drop columns that have little or no impact on the estimated time of arrival

In [None]:
# Monetary fields of the trip record.
fare_columns = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
]

# Columns considered irrelevant for estimating the arrival time.
columns_to_drop = [
    "store_and_fwd_flag",
    "payment_type",
    *fare_columns,
    "PU_service_zone",
    "DO_service_zone",
]

df = df.drop(*columns_to_drop)

df.show(5)


# Preprocessing Data

## Data Cleaning

#### trip_distance


In [None]:
from pyspark.sql.functions import col, mean as spark_mean, stddev

# Mean and standard deviation of trip_distance, computed in one aggregation;
# first() returns the single result Row.
distance = col("trip_distance")
trip_dis_stats = df.agg(
    spark_mean(distance).alias("mean"),
    stddev(distance).alias("stddev"),
).first()
trip_dis_stats

In [None]:
trip_dis_mean = trip_dis_stats["mean"]
trip_dis_stddev = trip_dis_stats["stddev"]

# The statistics were computed on trip_distance, so the labels name that column.
print(f"Mean for Trip Distance is: {trip_dis_mean}")
print(f"Standard Deviation for Trip Distance is: {trip_dis_stddev}")

# Keep trips whose distance is within 2 standard deviations of the mean
# and is also at least 1.
df = df.filter(
    (col("trip_distance") >= trip_dis_mean - 2 * trip_dis_stddev) &
    (col("trip_distance") <= trip_dis_mean + 2 * trip_dis_stddev) &
    (col("trip_distance") >= 1)
)

df.describe().show()

In [None]:
# Collect trip_distance to pandas; only strictly positive values can be log-transformed.
trip_distance_data = df.select("trip_distance").toPandas()
trip_distance_data = trip_distance_data[trip_distance_data['trip_distance'] > 0]
log_distance = np.log(trip_distance_data['trip_distance'])

fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: histogram of the log-distance.
hist_ax.hist(log_distance, bins=50, color='blue', alpha=0.7)
hist_ax.set_title('Distribution of Log-transformed Trip Distance')
hist_ax.set_xlabel('Log of Trip Distance (miles)')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True)

# Right panel: horizontal boxplot of the same values.
box_ax.boxplot(log_distance, vert=False)
box_ax.set_title('Boxplot of Log-transformed Trip Distance')
box_ax.set_xlabel('Log of Trip Distance (miles)')
box_ax.grid(True)

fig.tight_layout()
plt.show()


#### passenger_count

Remove rows where `passenger_count` is 0, 7, 8 or 9, because these values are uncommon in NYC taxi data
> *(most taxis accommodate up to 6 passengers)*

In [None]:
from pyspark.sql.functions import col

# Passenger counts to discard. Rows with a NULL passenger_count are dropped as
# well, because the negated predicate evaluates to NULL for them.
excluded_passenger_counts = [7, 8, 9, 0]
has_excluded_count = col("passenger_count").isin(excluded_passenger_counts)
df = df.filter(~has_excluded_count)

##### airport_fee to enter_airport

In [None]:
from pyspark.sql.functions import when

# Binary flag: 1 when a positive airport fee was charged, 0 otherwise
# (a NULL fee also maps to 0 because the comparison is not true).
paid_airport_fee = col("airport_fee") > 0
df = df.withColumn("enter_airport", when(paid_airport_fee, 1).otherwise(0))
df.select("enter_airport").show(5)


In [None]:
# The raw fee is no longer needed once the enter_airport flag has been derived from it.
df = df.drop("airport_fee")

##### pick_up and drop_off datetime

In [None]:
from pyspark.sql.functions import to_timestamp

# Normalize both trip timestamps using the same explicit pattern.
timestamp_pattern = "yyyy-MM-dd HH:mm:ss"
for timestamp_column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df = df.withColumn(timestamp_column, to_timestamp(timestamp_column, timestamp_pattern))

df.select("tpep_pickup_datetime", "tpep_dropoff_datetime").show(truncate=False)


`tpep_pickup_datetime`

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Calendar and clock features derived from the pickup timestamp.
pickup_ts = F.col("tpep_pickup_datetime")

df = df.withColumn("pickup_date", F.to_date(pickup_ts))
df = df.withColumn("pickup_hour", F.hour(pickup_ts))
df = df.withColumn("pickup_minute", F.minute(pickup_ts))
df = df.withColumn("pickup_second", F.second(pickup_ts))
df = df.withColumn("pickup_weekday", F.dayofweek(pickup_ts) - 1)  # Adjust to 0-based (Sunday: 0, Monday: 1,...)
df = df.withColumn("pickup_month", F.month(pickup_ts))
# F.day only exists in PySpark >= 3.5; F.dayofmonth returns the same value on every version.
df = df.withColumn("pickup_day", F.dayofmonth(pickup_ts))


# Weekly hour feature (captures patterns based on the time of day and the day of the week)
df = df.withColumn(
    "pickup_week_hour", F.col("pickup_weekday") * 24 + F.col("pickup_hour")  # 0 (Sunday midnight) to 167 (Saturday 11:00 PM)
)

# Zero-padded "HH:mm:ss" string built from the hour/minute/second parts.
df = df.withColumn(
    "pickup_time",
    F.concat(
        F.lpad(F.col("pickup_hour"), 2, "0"), F.lit(":"),
        F.lpad(F.col("pickup_minute"), 2, "0"), F.lit(":"),
        F.lpad(F.col("pickup_second"), 2, "0")
    )
)

`tpep_dropoff_datetime`

In [None]:
# Calendar and clock features derived from the drop-off timestamp.
dropoff_ts = F.col("tpep_dropoff_datetime")

df = df.withColumn("dropoff_date", F.to_date(dropoff_ts))
df = df.withColumn("dropoff_hour", F.hour(dropoff_ts))
df = df.withColumn("dropoff_minute", F.minute(dropoff_ts))
df = df.withColumn("dropoff_second", F.second(dropoff_ts))
df = df.withColumn("dropoff_weekday", F.dayofweek(dropoff_ts) - 1)  # Adjust to 0-based (Sunday: 0, Monday: 1,...)
df = df.withColumn("dropoff_month", F.month(dropoff_ts))
# F.day only exists in PySpark >= 3.5; F.dayofmonth returns the same value on every version.
df = df.withColumn("dropoff_day", F.dayofmonth(dropoff_ts))

# Zero-padded "HH:mm:ss" string built from the hour/minute/second parts.
df = df.withColumn(
    "dropoff_time",
    F.concat(
        F.lpad(F.col("dropoff_hour"), 2, "0"), F.lit(":"),
        F.lpad(F.col("dropoff_minute"), 2, "0"), F.lit(":"),
        F.lpad(F.col("dropoff_second"), 2, "0")
    )
)

df.select(
    "pickup_date", "pickup_time" , "pickup_month", "pickup_day", "pickup_weekday", "pickup_hour", "pickup_minute", "pickup_second",
    "pickup_week_hour", "dropoff_date", "dropoff_time" , "dropoff_month", "dropoff_day", "dropoff_hour", "dropoff_minute", "dropoff_second"
).show(truncate=False)


In [None]:
# Trip duration in seconds: drop-off epoch seconds minus pickup epoch seconds.
df = df.withColumn(
    "trip_duration_seconds",
    (F.unix_timestamp("tpep_dropoff_datetime") - F.unix_timestamp("tpep_pickup_datetime"))
)

# Keep only usable durations:
#   * > 0      - a zero duration makes the later speed computation divide by zero
#                (NULL by default, an error when ANSI mode is on) and a negative
#                duration yields a negative speed; the 0-100 mph speed filter
#                discards both anyway, so the final data is unchanged.
#   * <= 100000 - upper cap on the duration in seconds.
df = df.filter(
    (col("trip_duration_seconds") > 0) &
    (col("trip_duration_seconds") <= 100000)
)
df.select("trip_duration_seconds").show(5)


#### speed in miles per hour

In [None]:
# Average speed = distance / hours, where hours = trip_duration_seconds / 3600.
duration_hours = F.col("trip_duration_seconds") / 3600
df = df.withColumn("speed_mph", F.col("trip_distance") / duration_hours)
min_speed = 0
max_speed = 100

# Keep trips whose average speed lies in the closed interval [min_speed, max_speed].
df = df.filter(F.col("speed_mph").between(min_speed, max_speed))
df.select("speed_mph").describe().show(5)

#### Remove speed outliers using the IQR rule

In [None]:
# Bring a 10% sample (fixed seed) of speed_mph into pandas for plotting.
speed_data = (
    df.select("speed_mph")
    .sample(fraction=0.1, seed=42)
    .toPandas()
)
speed_values = speed_data['speed_mph']

fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: histogram of the sampled speeds.
hist_ax.hist(speed_values, bins=50, color='blue', alpha=0.7)
hist_ax.set_title('Distribution of Speed (mph)')
hist_ax.set_xlabel('Speed (mph)')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True)

# Right panel: horizontal boxplot of the same sample.
box_ax.boxplot(speed_values, vert=False)
box_ax.set_title('Boxplot of Speed (mph)')
box_ax.set_xlabel('Speed (mph)')
box_ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Calculate the IQR for speed_mph.
# Both quartiles are requested in a single approxQuantile call (one pass over
# the data instead of two); 0.05 is the allowed relative error.
Q1_speed, Q3_speed = df.approxQuantile("speed_mph", [0.25, 0.75], 0.05)
IQR_speed = Q3_speed - Q1_speed

# Define the lower and upper bounds for outliers (1.5 * IQR rule)
lower_bound_speed = Q1_speed - 1.5 * IQR_speed
upper_bound_speed = Q3_speed + 1.5 * IQR_speed

# Filter the dataframe to remove outliers
df = df.filter((col("speed_mph") >= lower_bound_speed) & (col("speed_mph") <= upper_bound_speed))

df.show(5)

#### Visualize the total trip duration for each day of July

In [None]:
# Per-day totals for July: summed trip duration and number of trips, ordered by day.
july_data = (
    df.where(F.col("pickup_month") == 7)
    .groupBy("pickup_day")
    .agg(
        F.sum("trip_duration_seconds").alias("total_trip_duration_seconds"),
        F.count("pickup_day").alias("trip_count"),
    )
    .orderBy("pickup_day")
)

pandas_july_data = july_data.toPandas()

july_fig, july_ax = plt.subplots(figsize=(10, 5))

july_ax.plot(
    pandas_july_data['pickup_day'],
    pandas_july_data['total_trip_duration_seconds'],
    linestyle='-', marker='o', color='b', alpha=0.6,
)

july_ax.set_title('Total Trip Duration in July by Day of the Month', fontsize=12)
july_ax.set_xlabel('Day of the Month', fontsize=8)
july_ax.set_ylabel('Total Trip Duration (Seconds)', fontsize=8)

july_ax.grid(True)
july_fig.tight_layout()
plt.show()


##### Total number of trips every weekday

In [None]:
# Number of trips per pickup weekday, ordered from weekday 0 upwards.
weekday_trip_count = (
    df.groupBy("pickup_weekday")
    .agg(F.count("pickup_weekday").alias("trip_count"))
    .orderBy("pickup_weekday")
)

# Small aggregated result, so it is collected to pandas for plotting.
pandas_weekday_trip_count = weekday_trip_count.toPandas()

weekday_fig, weekday_ax = plt.subplots(figsize=(10, 5))

weekday_ax.bar(
    pandas_weekday_trip_count['pickup_weekday'],
    pandas_weekday_trip_count['trip_count'],
    color='b', alpha=0.7,
)

weekday_ax.set_title('Number of Trips by Weekday', fontsize=12)
weekday_ax.set_xlabel('Weekday (0=Sunday, 6=Saturday)', fontsize=9)
weekday_ax.set_ylabel('Number of Trips', fontsize=9)

weekday_ax.grid(True)
weekday_fig.tight_layout()
plt.show()

#### Visualize the number of trips that occur in each hour of the day

In [None]:
# Number of trips per pickup hour, ordered by hour.
trips_per_hour = df.groupBy("pickup_hour").agg(F.count("pickup_hour").alias("trip_count"))
hourly_trip_count = trips_per_hour.orderBy("pickup_hour")

pandas_hourly_trip_count = hourly_trip_count.toPandas()

hourly_fig, hourly_ax = plt.subplots(figsize=(10, 5))

hourly_ax.bar(
    pandas_hourly_trip_count['pickup_hour'],
    pandas_hourly_trip_count['trip_count'],
    color='b', alpha=0.7,
)

hourly_ax.set_title('Number of Trips by Hour of the Day', fontsize=12)
hourly_ax.set_xlabel('Hour of the Day', fontsize=9)
hourly_ax.set_ylabel('Number of Trips', fontsize=9)

hourly_ax.grid(True)
hourly_fig.tight_layout()
plt.show()


In [None]:
# Preview the data after the cleaning and time-feature steps above.
df.show(5)

### Location

#### Boroughs with the largest number of trips

In [None]:
# Trips per borough, counted separately for pickups and drop-offs.
# The explicit count aggregate produces the same "count" column as groupBy(...).count().
trip_count = F.count(F.lit(1)).alias("count")
pickup_borough_counts = df.groupBy("PU_Borough").agg(trip_count)
dropoff_borough_counts = df.groupBy("DO_Borough").agg(trip_count)

In [None]:
# Collect the per-borough counts to pandas for plotting.
pickup_borough_counts_pd = pickup_borough_counts.toPandas()
dropoff_borough_counts_pd = dropoff_borough_counts.toPandas()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# One panel per trip end: (axes, counts frame, borough column, label word, bar color).
panels = [
    (ax[0], pickup_borough_counts_pd, 'PU_Borough', 'Pickup', 'green'),
    (ax[1], dropoff_borough_counts_pd, 'DO_Borough', 'Dropoff', 'blue'),
]

for panel_ax, counts, borough_column, trip_end, bar_color in panels:
    panel_ax.bar(counts[borough_column], counts['count'], color=bar_color, alpha=0.7, label=f'{trip_end} Boroughs')
    panel_ax.set_xlabel(f'{trip_end} Borough')
    panel_ax.set_ylabel('Number of Trips')
    panel_ax.set_title(f'Popularity of {trip_end} Boroughs')
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.legend(loc='upper right')

plt.tight_layout()
plt.show()

### Checking the Covariance between Features

In [None]:
# Numeric columns whose pairwise covariance is inspected.
numeric_columns = [
    "passenger_count", "trip_distance", "congestion_surcharge",
    "pickup_hour", "pickup_minute", "trip_duration_seconds", "speed_mph"
]

# The covariance is computed in pandas on a 10% sample (fixed seed).
numeric_df = (
    df.select(*numeric_columns)
    .sample(fraction=0.1, seed=42)
    .toPandas()
)
covariance_matrix = numeric_df.cov()

heatmap_options = dict(
    annot=True,
    cmap="coolwarm",
    fmt=".0f",
    cbar=True,
    square=True,
    linewidths=0.5,
)

plt.figure(figsize=(10, 8))
sns.heatmap(covariance_matrix, **heatmap_options)
plt.title("Covariance Heatmap of Numeric Columns")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


### Adding Weather dataset


In [None]:
# Load the merged weather CSV with pandas and preview the first rows.
weather_data = pd.read_csv("../data/weather_data/merged_weather.csv")
weather_data.head(5)

In [None]:
weather_df = spark.createDataFrame(weather_data)

# Match each trip to the weather row for its pickup date and hour.
# A left join keeps trips that have no matching weather row.
same_date = df['pickup_date'] == weather_df['date']
same_hour = df['pickup_hour'] == weather_df['hour']
df = df.join(weather_df, on=same_date & same_hour, how="left")

# pickup_hour already carries the hour, so the weather table's copy is dropped.
df = df.drop(weather_df['hour'])

df.show(5)


In [None]:
# Columns no longer needed after the weather join. Each name is listed once:
# drop() ignores names that are not present, so repeating 'date' and 'time'
# added nothing.
columns_to_drop = [
    'name', 'date', 'time', 'RatecodeID', 'vendorid', 'store_and_fwd_flag', 'month', 'datetime']

df = df.drop(*columns_to_drop)

df.show(5)

In [None]:
# 10% sample (fixed seed) of the weather columns, collected to pandas for plotting.
weather_plot_columns = ['average_temperature', 'precipitation', 'snow_fall', 'snow_depth']
scatter = df.select(*weather_plot_columns).sample(fraction=0.1, seed=42).toPandas()

In [None]:
# Sample the weather columns and the target TOGETHER so that every plotted
# (x, y) pair comes from the same trip. Two independent .sample() calls are
# not guaranteed to return the same rows in the same order, which would pair
# one trip's weather with another trip's duration.
weather_columns = list(scatter.columns)
paired_sample = (
    df.select(*weather_columns, "trip_duration_seconds")
    .sample(fraction=0.1, seed=42)
    .toPandas()
)
trip_duration_seconds_pd = paired_sample[["trip_duration_seconds"]]

for i in weather_columns:
    sns.scatterplot(x=paired_sample[i], y=paired_sample['trip_duration_seconds'])
    plt.ylabel('Travel Duration (Seconds)')
    plt.show()

Based on the visualization, we can drop the `snow_fall` and `snow_depth` columns because they show no noticeable trend or information.

In [None]:
# Drop the two snow columns, which the scatter plots above suggest carry no useful signal.
df = df.drop('snow_fall', 'snow_depth')

#### Relationship between Day Temperature and Number of Trips

In [None]:
import plotly.graph_objects as go

# Daily average temperature and trip count, collected to pandas in calendar order.
daily_stats = df.groupBy("pickup_month", "pickup_day").agg(
    F.avg("average_temperature").alias("avg_temperature"),
    F.count("pickup_day").alias("trip_count"),
)
grouped_data = daily_stats.orderBy("pickup_month", "pickup_day").toPandas()

# One dual-axis figure per month: temperature on the left axis, trip count on the right.
month_names = {7: "July", 8: "August", 9: "September"}
for month, month_label in month_names.items():
    month_data = grouped_data[grouped_data["pickup_month"] == month]
    days = month_data['pickup_day']

    temperature_trace = go.Scatter(
        x=days,
        y=month_data['avg_temperature'],
        mode='lines',
        line=dict(color='orange', width=2),
        name='Average Temperature',
    )
    trip_count_trace = go.Scatter(
        x=days,
        y=month_data['trip_count'],
        mode='lines',
        line=dict(color='blue', width=2),
        name='Trip Count',
        yaxis='y2',  # plotted against the secondary y-axis
    )

    fig = go.Figure()
    fig.add_trace(temperature_trace)
    fig.add_trace(trip_count_trace)

    fig.update_layout(
        title=f"Trip Count and Average Temperature in {month_label}",
        xaxis=dict(title="Day of Month"),
        yaxis=dict(title="Average Temperature (°C)", side="left"),
        yaxis2=dict(title="Trip Count", overlaying='y', side="right"),
        legend=dict(title="Legend"),
        template="plotly_white",
    )

    fig.show()


#### Relationship between `precipitation` and `trip_count` vs `trip_duration`

In [None]:
import plotly.graph_objects as go

# Trip count and mean trip duration for every distinct precipitation value.
per_precipitation = df.groupBy("precipitation").agg(
    F.count("pickup_day").alias("trip_count"),
    F.avg("trip_duration_seconds").alias("avg_trip_duration"),
)
precipitation_data = per_precipitation.orderBy("precipitation").toPandas()

precipitation_values = precipitation_data['precipitation']

trip_count_trace = go.Scatter(
    x=precipitation_values,
    y=precipitation_data['trip_count'],
    mode='lines+markers',
    line=dict(color='blue', width=2),
    name='Trip Count',
)
duration_trace = go.Scatter(
    x=precipitation_values,
    y=precipitation_data['avg_trip_duration'],
    mode='lines+markers',
    line=dict(color='orange', width=2, dash='dot'),
    name='Avg Trip Duration',
    yaxis='y2',  # plotted against the secondary y-axis
)

fig = go.Figure()
fig.add_trace(trip_count_trace)
fig.add_trace(duration_trace)

fig.update_layout(
    title="Relationship Between Precipitation, Trip Count, and Avg Trip Duration",
    xaxis=dict(title="Precipitation (mm)"),
    yaxis=dict(title="Trip Count", side="left"),
    yaxis2=dict(title="Avg Trip Duration (seconds)", overlaying='y', side="right"),
    legend=dict(title="Legend"),
    template="plotly_white",
)

fig.show()


* People avoid trips during rainy weather unless necessary, as reflected in the trip count. The sharp drop in trip count supports this behavioral trend.
* Longer average trip durations in rainy conditions indicate potential traffic congestion, slower driving speeds, or more cautious travel during rain.
* The slight increase in trip count for higher precipitation levels might indicate that some trips are unavoidable, regardless of weather conditions.

### Understanding the congestion trends in each Zone in NYC

In [None]:
# List the distinct surcharge values present in the data.
df.select('congestion_surcharge').distinct().show()

In [None]:
# Total congestion surcharge per pickup borough, largest first.
borough_surcharge_spark = (
    df.groupBy("PU_Borough")
    .agg(F.sum("congestion_surcharge").alias("Total_Surcharge"))
    .orderBy(F.col("Total_Surcharge").desc())
)

borough_surcharge_pandas = borough_surcharge_spark.toPandas()

boroughs = borough_surcharge_pandas['PU_Borough']
totals = borough_surcharge_pandas['Total_Surcharge']

plt.figure(figsize=(10, 6))
plt.bar(boroughs, totals, color='skyblue')
plt.title('Total Congestion Surcharge by Borough')
plt.xlabel('Borough')
plt.ylabel('Total Congestion Surcharge')
plt.xticks(rotation=45)
plt.show()


##### Convert `congestion_surcharge` to `congestion_level` 

In [None]:
low_threshold = -2.5
high_threshold = 2.5

# Bucket the surcharge into three levels:
#   surcharge <= low_threshold                  -> "Low"
#   low_threshold < surcharge < high_threshold  -> "Medium"
#   surcharge >= high_threshold                 -> "High"
# The "High" bucket has its own explicit condition. With a bare otherwise("High"),
# a NULL surcharge (for which every comparison is NULL) would be labelled "High";
# now it stays NULL, i.e. unknown.
surcharge = F.col("congestion_surcharge")
df = df.withColumn(
    "congestion_level",
    F.when(surcharge <= low_threshold, "Low")
     .when((surcharge > low_threshold) & (surcharge < high_threshold), "Medium")
     .when(surcharge >= high_threshold, "High")
)

df.select("congestion_surcharge", "congestion_level").show()


##### `PU_Zone` that has High `congestion_level`

In [None]:
# Pickup zones ranked by their number of "High" congestion trips.
is_high_congestion = F.col("congestion_level") == "High"
high_congestion_zones = (
    df.where(is_high_congestion)
    .groupBy("PU_Zone", "PU_Borough")
    .count()
    .orderBy(F.col("count").desc())
)

high_congestion_zones.show(10, truncate=False)

### Final Touches on the Dataset

In [None]:
# Convert 'precipitation' to 'rain' (0 or 1)
df = df.withColumn("rain", F.when(df["precipitation"] > 0, 1).otherwise(0))

# Convert 'congestion_level' to 0, 1, or 2 (low, medium, high -> 0, 1, 2)
df = df.withColumn("congestion_level", 
                   F.when(df["congestion_level"] == "Low", 0)
                   .when(df["congestion_level"] == "Medium", 1)
                   .when(df["congestion_level"] == "High", 2)
                   .otherwise(-1))  # or set it to None if needed

# Show the updated dataframe with the transformed columns
df.select("precipitation", "rain", "enter_airport", "congestion_level").show(truncate=False)

# Drop unnecessary columns
df = df.drop('tpep_pickup_datetime', 'tpep_dropoff_datetime', 'congestion_surcharge', 'maximum_temperature', 'minimum_temperature')


In [None]:
def missing_ratio_spark(df):
    """Return the percentage of missing values for every column of ``df``.

    A value is missing when it is NULL; for ``float``/``double`` columns NaN
    counts as missing too. The row total and all per-column missing counts are
    computed in ONE aggregation job instead of one ``count()`` job per column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        Spark DataFrame with columns ``column_name`` (string) and
        ``missing_ratio`` (double, percent in [0, 100]), sorted by
        ``missing_ratio`` in descending order. For a DataFrame with no rows
        every ratio is 0.0.
    """
    column_types = dict(df.dtypes)  # built once, not once per column

    def missing_flag(column):
        """Return a 0/1 expression that is 1 where ``column`` is missing."""
        is_missing = F.col(column).isNull()
        if column_types[column] in ['float', 'double']:
            is_missing = is_missing | F.isnan(column)
        return is_missing.cast("int")

    # Positional aliases keep the aggregate names valid whatever the column is called.
    aggregates = [F.count(F.lit(1)).alias("total_rows")]
    aggregates += [
        F.sum(missing_flag(column)).alias(f"missing_{position}")
        for position, column in enumerate(df.columns)
    ]
    totals = df.agg(*aggregates).first()
    total_rows = totals["total_rows"]

    missing_counts = []
    for position, column in enumerate(df.columns):
        # SUM over zero rows is NULL, hence the "or 0".
        missing_count = totals[f"missing_{position}"] or 0
        missing_ratio = (missing_count / total_rows) * 100 if total_rows else 0.0
        missing_counts.append((column, float(missing_ratio)))

    # An explicit schema avoids type inference (which fails on an empty list).
    missing_data_df = spark.createDataFrame(
        missing_counts, "column_name string, missing_ratio double"
    )

    missing_data_df = missing_data_df.orderBy(F.col("missing_ratio"), ascending=False)

    # Uncomment to list only the columns that actually have missing values:
    #missing_data_df = missing_data_df.filter(F.col("missing_ratio") > 0)

    return missing_data_df

missing_data = missing_ratio_spark(df)
missing_data.show()


# Train - Test Split and Feature selection

In [None]:
# Random 80/20 split with a fixed seed.
# NOTE(review): train_spark/test_spark are not referenced by the cells visible
# below - the indexers and the random forest are fitted on the full `df`.
# Confirm whether feature selection was meant to use train_spark only.
train_spark, test_spark = df.randomSplit([0.8, 0.2], seed=42)

###  Select the features and target columns
 `trip_duration_seconds` is the target


### Feature Selection by using Spark MLlib with RandomForestRegressor

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

#### Combine features into a single vector column using VectorAssembler

In [None]:
from pyspark.ml.feature import StringIndexer

# Columns encoded as numeric category indices.
# pickup_time / dropoff_time ("HH:mm:ss" strings) are deliberately NOT indexed:
# they can take up to 86,400 distinct values, far more categories than a tree
# learner can bin, and pickup_hour / pickup_minute / pickup_second already
# carry the same information numerically.
string_columns = [
    "PU_Borough", "PU_Zone", "DO_Borough", "DO_Zone",
    "enter_airport", "precipitation", "congestion_level"
]

# Columns that must not be model inputs.
non_feature_columns = [
    "trip_duration_seconds",          # the prediction target
    "pickup_date", "dropoff_date",    # date columns, excluded from the feature vector
    "pickup_time", "dropoff_time",    # raw "HH:mm:ss" strings (see above)
    # Target leakage: speed_mph is computed from the trip duration, and the
    # drop-off time parts are only known once the trip has ended.
    "speed_mph",
    "dropoff_hour", "dropoff_minute", "dropoff_second",
    "dropoff_weekday", "dropoff_month", "dropoff_day",
]

# One StringIndexer per categorical column, writing to "<column>_index".
indexers = [
    StringIndexer(inputCol=column_name, outputCol=column_name + "_index")
    for column_name in string_columns
]

# Fit and apply all indexers as a single Pipeline.
df = Pipeline(stages=indexers).fit(df).transform(df)

# Features = every remaining column (including the new *_index columns)
# except the original categorical columns and the non-feature columns.
excluded_columns = set(string_columns) | set(non_feature_columns)
feature_columns = [column_name for column_name in df.columns if column_name not in excluded_columns]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_assembled = assembler.transform(df)


In [None]:
# Preview the rows together with the assembled 'features' vector column.
df_assembled.show(5)

#### Train a Random Forest model to get feature importances

In [None]:
# Spark tree learners require maxBins >= the number of categories of every
# categorical feature. StringIndexer outputs are marked as nominal in the
# metadata of the 'features' column, and e.g. PU_Zone_index has far more than
# the default 32 categories, so fit() would fail with the default maxBins.
# Read the largest category count from that metadata and size maxBins from it.
feature_attrs = df_assembled.schema["features"].metadata.get("ml_attr", {}).get("attrs", {})
largest_category_count = max(
    (
        len(attr["vals"]) if "vals" in attr else attr.get("num_vals", 0)
        for attr in feature_attrs.get("nominal", [])
    ),
    default=0,
)

rf = RandomForestRegressor(
    featuresCol='features',
    labelCol='trip_duration_seconds',
    maxBins=max(32, largest_category_count),  # never below Spark's default of 32
    seed=42,  # same seed as the sampling/splitting cells, for reproducible importances
)
model = rf.fit(df_assembled)

#### Get feature importances

In [None]:
# Importance vector of the fitted forest; entry i belongs to feature_columns[i].
importances = model.featureImportances
feature_importance_dict = {
    feature_name: importances[position]
    for position, feature_name in enumerate(feature_columns)
}

#### Sort features by importance and select top features

In [None]:
# (feature, importance) pairs ordered from most to least important.
importance_pairs = feature_importance_dict.items()
sorted_feature_importances = sorted(importance_pairs, key=lambda pair: pair[1], reverse=True)

print("Feature importance sorted by importance:")
for name, score in sorted_feature_importances:
    print(f'{name}: {score}')

#### Select top k important features

In [None]:
# Names of the 10 highest-ranked features (fewer if fewer exist).
top_ranked_pairs = sorted_feature_importances[:10]
top_k_features = [pair[0] for pair in top_ranked_pairs]

#### Filter the DataFrame to use only the selected features

In [None]:
# Assemble only the selected top-k features into a 'selected_features' vector column.
assembler_selected = VectorAssembler(inputCols=top_k_features, outputCol='selected_features')
df_selected = assembler_selected.transform(df)

# The target column is named 'trip_duration_seconds'; no 'trip_duration'
# column exists, so selecting it would raise an AnalysisException.
df_selected.select('selected_features', 'trip_duration_seconds').show(5)

In [None]:
# Print the final schema, including the *_index columns added by the StringIndexers.
df.printSchema()