In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt

# Exercise Requirements

- Update the dependencies

```
pip install -r requirements.txt

```

- Download the files for January and February 2022 by running these commands:

```
python3 discovery.py --url https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet --prefix yellow
python3 discovery.py --url https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet --prefix yellow
```



##  Downloading the data

Read the data for January 2022. 
How many columns are there?

In [None]:
def read_parquet(file_path : str) -> pd.DataFrame:
    """Load a parquet file into a pandas DataFrame.

    Args:
        file_path: Location of the parquet file to read.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` for ``file_path``.
    """
    # The return annotation must be `pd.DataFrame` (capital F): annotations are
    # evaluated when the function is defined, so a misspelt attribute would
    # raise AttributeError before the function could ever be called.
    return pd.read_parquet(file_path)

# Load the January 2022 yellow-taxi trips and report how many columns they have.
jan_file = "../data/yellow_tripdata_2022-01.parquet"
df_jan = read_parquet(jan_file)
# `df_jan.columns.count` is not the number of columns; len() of the column
# index is.
print(F" Jan 2022 Columns {len(df_jan.columns)}")

##  Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.
What's the standard deviation of the trip durations in January?

In [None]:
def get_duration_std_dev(df : pd.DataFrame) -> float:
    """Compute each trip's duration and return the standard deviation in minutes.

    Side effect: adds (or overwrites) a ``duration`` column on ``df`` holding
    ``tpep_dropoff_datetime - tpep_pickup_datetime`` as timedeltas.

    Args:
        df: Trips with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
            datetime columns.

    Returns:
        Standard deviation of the trip durations, expressed in minutes as a
        plain float (NaN when it cannot be computed, e.g. fewer than two rows).
    """
    # Calculate the duration of each trip; stored on the caller's frame as a
    # timedelta so it can still be compared against pd.Timedelta bounds.
    df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']

    # Convert to minutes before aggregating so the result is a float that
    # matches the declared return type, instead of a Timedelta.
    duration_minutes = df['duration'].dt.total_seconds() / 60
    return float(duration_minutes.std())


# Compute the spread of the January trip durations and report it.
std_dev_jan = get_duration_std_dev(df=df_jan)
print("Standard Deviation of Jan 2022 Trip Durations:", std_dev_jan)

##  Dropping outliers

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

In [None]:
def drop_outliers(df : pd.DataFrame, min = 1, max = 60) -> pd.DataFrame:
    """Keep only the rows whose ``duration`` lies within ``[min, max]`` minutes.

    Both bounds are inclusive. The ``duration`` column is compared against
    ``pd.Timedelta`` bounds built from ``min`` and ``max``.

    Args:
        df: Frame containing a ``duration`` column.
        min: Lower bound, in minutes.
        max: Upper bound, in minutes.

    Returns:
        The rows of ``df`` that fall inside the bounds; the caller's frame is
        left untouched.
    """
    lower_bound = pd.Timedelta(minutes=min)
    upper_bound = pd.Timedelta(minutes=max)
    durations = df['duration']
    within_bounds = (durations >= lower_bound) & (durations <= upper_bound)
    return df[within_bounds]

def get_percentage_change(df_original : pd.DataFrame, df_filtered : pd.DataFrame) -> float:
    """Return the row count of ``df_filtered`` as a percentage of ``df_original``.

    Args:
        df_original: Frame before filtering.
        df_filtered: Frame after filtering.

    Returns:
        ``len(df_filtered) / len(df_original)`` scaled to a percentage.
    """
    rows_kept = len(df_filtered)
    rows_total = len(df_original)
    share_kept = rows_kept / rows_total
    return share_kept * 100

# Remove trips outside the default duration bounds, then report what share
# of the January rows survived the filter.
df_jan_filtered = drop_outliers(df=df_jan)
percentage_remaining = get_percentage_change(
    df_original=df_jan,
    df_filtered=df_jan_filtered,
)
print("percentage of Rows Remaining: {:.2f}%".format(percentage_remaining))


## One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it
- What's the dimensionality of this matrix (number of columns)?

In [None]:
def one_hot_encode(df : pd.DataFrame) 
    # apply one-hot encoding to pickup and dropoff location IDs
    one_hot_encoded = pd.get_dummies(df[['PULocationID', 'DOLocationID']])
