# EDA and data cleaning

### Here I will test different machine learning models and choose the one that best fits the data I have

In [392]:
import pandas as pd
from taxipred.utils.constants import TAXI_CSV_PATH, DATA_PATH

# Read the raw taxi trip data from the CSV path configured in the project constants.
df = pd.read_csv(filepath_or_buffer=TAXI_CSV_PATH)

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [393]:
# Print each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [394]:
# Summary statistics for the numeric columns, one row per column.
# The "count" statistic is left out because df.info() already reports it.
numeric_summary = df.describe().drop(index="count")
numeric_summary.T

Unnamed: 0,mean,std,min,25%,50%,75%,max
Trip_Distance_km,27.070547,19.9053,1.23,12.6325,25.83,38.405,146.067047
Passenger_Count,2.476842,1.102249,1.0,1.25,2.0,3.0,4.0
Base_Fare,3.502989,0.870162,2.01,2.73,3.52,4.26,5.0
Per_Km_Rate,1.233316,0.429816,0.5,0.86,1.22,1.61,2.0
Per_Minute_Rate,0.292916,0.115592,0.1,0.19,0.29,0.39,0.5
Trip_Duration_Minutes,62.118116,32.154406,5.01,35.8825,61.86,89.055,119.84
Trip_Price,56.874773,40.469791,6.1269,33.74265,50.0745,69.09935,332.043689


Now I want to see whether Time_of_Day, Day_of_Week, Traffic_Conditions and Weather have a big enough impact on price to be worth keeping in my data. I want to take a look at all the object columns to see if I can remove some of them or change them somehow.

Here we see that Time_of_Day has no big impact on the price (the mean Trip_Price of every group below lies between roughly 56 and 58), so we can remove it from the data.

In [395]:
# Mean trip price for each time-of-day category.
price_by_time_of_day = df.groupby("Time_of_Day")["Trip_Price"]
price_by_time_of_day.mean()


Time_of_Day
Afternoon    57.958716
Evening      56.404504
Morning      55.949429
Night        56.304401
Name: Trip_Price, dtype: float64

In [396]:
# Mean trip price for weekdays versus weekends.
price_by_day_of_week = df.groupby("Day_of_Week")["Trip_Price"]
price_by_day_of_week.mean()

Day_of_Week
Weekday    58.012418
Weekend    55.001961
Name: Trip_Price, dtype: float64

In [397]:
# Mean trip price for each traffic-condition level.
price_by_traffic = df.groupby("Traffic_Conditions")["Trip_Price"]
price_by_traffic.mean()


Traffic_Conditions
High      65.059510
Low       55.597238
Medium    54.547422
Name: Trip_Price, dtype: float64

In [398]:
# Mean trip price for each weather category.
price_by_weather = df.groupby("Weather")["Trip_Price"]
price_by_weather.mean()

Weather
Clear    55.809128
Rain     59.769260
Snow     57.815842
Name: Trip_Price, dtype: float64

In [399]:
# Drop Time_of_Day: its group means of Trip_Price are nearly identical.
# errors="ignore" keeps this cell safe to re-run. Without it, a second run
# raises KeyError because `df` is rebound and the column is already gone.
# Trade-off: a misspelled column name is silently ignored too.
df = df.drop(columns=["Time_of_Day"], errors="ignore").copy()
df

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,3.0,Low,Clear,3.56,0.80,0.32,53.82,36.2624
1,47.59,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Weekend,1.0,High,Clear,2.70,1.21,0.15,37.27,52.9032
3,30.33,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.6180
...,...,...,...,...,...,...,...,...,...,...
995,5.49,Weekend,4.0,Medium,Clear,2.39,0.62,0.49,58.39,34.4049
996,45.95,Weekday,4.0,Medium,Clear,3.12,0.61,,61.96,62.1295
997,7.70,Weekday,3.0,Low,Rain,2.08,1.78,,54.18,33.1236
998,47.56,Weekday,1.0,Low,Clear,2.67,0.82,0.17,114.94,61.2090


I want to see if the Per_Km_Rate and Per_Minute_Rate are consistent with the Trip_Price.

In [400]:
# Distance component of the fare: kilometres travelled times the per-km rate.
# Rows where either operand is NaN produce NaN.
distance_km = df["Trip_Distance_km"]
rate_per_km = df["Per_Km_Rate"]
df["calculated_distance_price"] = distance_km * rate_per_km


In [401]:
# Time component of the fare: trip duration in minutes times the per-minute rate.
# Rows where either operand is NaN produce NaN.
duration_minutes = df["Trip_Duration_Minutes"]
rate_per_minute = df["Per_Minute_Rate"]
df["calculated_time_price"] = duration_minutes * rate_per_minute

In [402]:
# Compare the distribution of the actual price with the distance-only component.
distance_comparison = df[["Trip_Price", "calculated_distance_price"]].describe()
distance_comparison.drop(index="count").T

Unnamed: 0,mean,std,min,25%,50%,75%,max
Trip_Price,56.874773,40.469791,6.1269,33.74265,50.0745,69.09935,332.043689
calculated_distance_price,33.384276,28.790107,0.7874,13.5419,27.44,45.2556,253.093259


In [403]:
# Compare the distribution of the actual price with the time-only component.
time_comparison = df[["Trip_Price", "calculated_time_price"]].describe()
time_comparison.drop(index="count").T

Unnamed: 0,mean,std,min,25%,50%,75%,max
Trip_Price,56.874773,40.469791,6.1269,33.74265,50.0745,69.09935,332.043689
calculated_time_price,18.094635,12.39313,0.791,8.682975,14.8875,25.36935,55.855


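Neither calculated_distance_price nor calculated_time_price matches Trip_Price on its own, so I will remove Per_Km_Rate and Per_Minute_Rate from the data. I will also remove the two new columns I just made. Caveat: each of these is only one component of the fare, so it is expected to be smaller than Trip_Price. For row 0, Base_Fare + Trip_Distance_km × Per_Km_Rate + Trip_Duration_Minutes × Per_Minute_Rate = 3.56 + 15.48 + 17.2224 = 36.2624, which equals Trip_Price exactly, so the combined formula should be checked before treating the rates as inaccurate.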

In [404]:
# Remove the two rate columns and the two helper columns created for the
# comparison with Trip_Price.
# errors="ignore" keeps this cell safe to re-run. Without it, a second run
# raises KeyError because `df` is rebound and the columns are already gone.
# Trade-off: a misspelled column name is silently ignored too.
columns_to_remove = [
    "Per_Km_Rate",
    "Per_Minute_Rate",
    "calculated_distance_price",
    "calculated_time_price",
]
df = df.drop(columns=columns_to_remove, errors="ignore").copy()
df

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,3.0,Low,Clear,3.56,53.82,36.2624
1,47.59,Weekday,1.0,High,Clear,,40.57,
2,36.87,Weekend,1.0,High,Clear,2.70,37.27,52.9032
3,30.33,Weekday,4.0,Low,,3.48,116.81,36.4698
4,,Weekday,3.0,High,Clear,2.93,22.64,15.6180
...,...,...,...,...,...,...,...,...
995,5.49,Weekend,4.0,Medium,Clear,2.39,58.39,34.4049
996,45.95,Weekday,4.0,Medium,Clear,3.12,61.96,62.1295
997,7.70,Weekday,3.0,Low,Rain,2.08,54.18,33.1236
998,47.56,Weekday,1.0,Low,Clear,2.67,114.94,61.2090


I want to see how many NaN values I have in each column.

In [405]:
# Count the missing values in every column.
missing_mask = df.isna()
missing_mask.sum()

Trip_Distance_km         50
Day_of_Week              50
Passenger_Count          50
Traffic_Conditions       50
Weather                  50
Base_Fare                50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64

### Now I remove every row that has a NaN in a feature column, keeping rows where only Trip_Price is missing

- features/independent variables: all columns except Trip_Price

- label/target/dependent variable: Trip_Price

In [408]:
# Every column except the target counts as a feature.
feature_columns = [column for column in df.columns if column != "Trip_Price"]

# Drop rows with a NaN in any feature column.
# Rows where only Trip_Price is missing are kept.
df = df.dropna(subset=feature_columns).copy()
df

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,3.0,Low,Clear,3.56,53.82,36.2624
2,36.87,Weekend,1.0,High,Clear,2.70,37.27,52.9032
5,8.64,Weekend,2.0,Medium,Clear,2.55,89.33,60.2028
6,3.85,Weekday,4.0,High,Rain,3.51,5.05,11.2645
8,30.45,Weekday,3.0,High,Clear,2.77,110.33,
...,...,...,...,...,...,...,...,...
995,5.49,Weekend,4.0,Medium,Clear,2.39,58.39,34.4049
996,45.95,Weekday,4.0,Medium,Clear,3.12,61.96,62.1295
997,7.70,Weekday,3.0,Low,Rain,2.08,54.18,33.1236
998,47.56,Weekday,1.0,Low,Clear,2.67,114.94,61.2090


Now I just want to double-check that every NaN value has been removed, except those in Trip_Price.

In [407]:
# Re-count missing values; only Trip_Price should still have any.
remaining_missing = df.isna()
remaining_missing.sum()

Trip_Distance_km          0
Day_of_Week               0
Passenger_Count           0
Traffic_Conditions        0
Weather                   0
Base_Fare                 0
Trip_Duration_Minutes     0
Trip_Price               31
dtype: int64