In [0]:
# Load the raw trips table into a DataFrame, preview its rows, then print
# the column names and types so later cells can rely on a known schema.
# `spark` and `display` are expected to be supplied by the notebook runtime.
TRIPS_TABLE = "nyc_taxi.idk.yellow_trips_csv_v"
df = spark.read.table(TRIPS_TABLE)
display(df)
df.printSchema()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import radians, sin, cos, atan2, sqrt
R=3959 #radius of earth in miles

# Tunable thresholds, gathered here so a reader can adjust them in one place.
MAX_FARE = 500             # exclusive upper bound on fare_amount kept by the filter
HIGH_FARE_THRESHOLD = 20   # fares strictly above this get high_fare = 1
SPLIT_WEIGHTS = [0.7, 0.3] # train / test weights passed to randomSplit
SPLIT_SEED = 42            # fixed seed for the train/test split


def haversine_miles(lat1, lon1, lat2, lon2, radius=R):
    """Build a Column holding the great-circle (haversine) distance.

    Args:
        lat1, lon1: Columns with the first point's latitude/longitude in
            degrees (they are converted with ``radians`` below).
        lat2, lon2: Columns with the second point's latitude/longitude in
            degrees.
        radius: Sphere radius; the result is in the same unit. Defaults to
            ``R`` (miles).

    Returns:
        A Column expression; no temporary columns are added to the frame, so
        nothing in the input schema can be overwritten by scratch names.
    """
    phi1 = radians(lat1)
    phi2 = radians(lat2)
    half_dphi = (phi1 - phi2) / 2
    half_dlambda = (radians(lon1) - radians(lon2)) / 2

    a = sin(half_dphi) ** 2 + cos(phi1) * cos(phi2) * sin(half_dlambda) ** 2

    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make sqrt(1 - a) NaN. Floor the term
    # at 0. `greatest` skips nulls, but a null `a` still yields a null result
    # because sqrt(a) (the other atan2 argument) stays null.
    one_minus_a = F.greatest(F.lit(0.0), 1 - a)

    return 2 * atan2(sqrt(a), sqrt(one_minus_a)) * radius


# NOTE(review): rows with missing or placeholder coordinates are not filtered
# here; confirm how the source encodes a missing GPS fix before trusting
# straight_line_distance for those rows.
df_clean = (
    df
    # Keep trips with a positive fare below the cap and a positive distance.
    .filter(
        (F.col("fare_amount") > 0)
        & (F.col("fare_amount") < MAX_FARE)
        & (F.col("trip_distance") > 0)
    )
    .withColumn("pickup_hour", F.hour(F.col("tpep_pickup_datetime")))
    # dayofweek is 1-based starting on Sunday (1 = Sunday ... 7 = Saturday).
    .withColumn("pickup_day", F.dayofweek(F.col("tpep_pickup_datetime")))
    .withColumn(
        "straight_line_distance",
        haversine_miles(
            F.col("pickup_latitude"),
            F.col("pickup_longitude"),
            F.col("dropoff_latitude"),
            F.col("dropoff_longitude"),
        ),
    )
    # Binary label: 1 when the fare exceeds the threshold, otherwise 0.
    .withColumn(
        "high_fare",
        F.when(F.col("fare_amount") > HIGH_FARE_THRESHOLD, 1).otherwise(0),
    )
)

# NOTE(review): the split is taken from a lazily evaluated plan; if the
# underlying table can change between actions, consider materialising
# df_clean first so train and test stay disjoint — confirm for this workspace.
train_df, test_df = df_clean.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)
display(df_clean)