# EDA and data cleaning

### Here I will test different machine learning models and choose the one that best fits the data I have

In [153]:
import pandas as pd
from taxipred.utils.constants import TAXI_CSV_PATH

# Read the taxi CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=TAXI_CSV_PATH)

df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [154]:
# Print each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [155]:
# Summary statistics with one row per numeric column; the "count" statistic
# is removed because df.info() already reports non-null counts.
df.describe().drop(index="count").T

Unnamed: 0,mean,std,min,25%,50%,75%,max
Trip_Distance_km,27.070547,19.9053,1.23,12.6325,25.83,38.405,146.067047
Passenger_Count,2.476842,1.102249,1.0,1.25,2.0,3.0,4.0
Base_Fare,3.502989,0.870162,2.01,2.73,3.52,4.26,5.0
Per_Km_Rate,1.233316,0.429816,0.5,0.86,1.22,1.61,2.0
Per_Minute_Rate,0.292916,0.115592,0.1,0.19,0.29,0.39,0.5
Trip_Duration_Minutes,62.118116,32.154406,5.01,35.8825,61.86,89.055,119.84
Trip_Price,56.874773,40.469791,6.1269,33.74265,50.0745,69.09935,332.043689


### I want to see how many NaN values there are in the CSV

In [156]:
# Number of missing values in each column.
df.isna().sum(axis="index")

Trip_Distance_km         50
Time_of_Day              50
Day_of_Week              50
Passenger_Count          50
Traffic_Conditions       50
Weather                  50
Base_Fare                50
Per_Km_Rate              50
Per_Minute_Rate          50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64

### Now I remove all rows with NaN values in the feature columns, keeping rows where only Trip_Price is missing

- features/independent variables: all columns except Trip_Price

- label/target/dependent variable: Trip_Price

In [157]:
# Drop every row that is missing at least one feature value. Trip_Price (the
# target) is excluded from the check, so rows missing only the price are kept.
df = df.dropna(axis="index", how="any", subset=df.columns.drop("Trip_Price"))
df

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.80,0.32,53.82,36.2624
2,36.87,Evening,Weekend,1.0,High,Clear,2.70,1.21,0.15,37.27,52.9032
5,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
8,30.45,Morning,Weekday,3.0,High,Clear,2.77,1.78,0.34,110.33,
12,41.79,Night,Weekend,3.0,High,Clear,4.60,1.77,0.11,86.95,88.1328
...,...,...,...,...,...,...,...,...,...,...,...
991,35.04,Morning,Weekend,4.0,Medium,Rain,2.90,1.10,0.15,9.99,
992,14.34,Afternoon,Weekday,1.0,Medium,Clear,3.23,1.01,0.29,45.07,30.7837
994,18.69,Evening,Weekday,3.0,Medium,Clear,4.90,1.79,0.17,79.41,51.8548
995,5.49,Afternoon,Weekend,4.0,Medium,Clear,2.39,0.62,0.49,58.39,34.4049


### I want to check that I dropped all NaN values in the feature columns

In [158]:
# Re-count missing values per column; only Trip_Price may still contain NaN.
df.isna().sum(axis="index")

Trip_Distance_km          0
Time_of_Day               0
Day_of_Week               0
Passenger_Count           0
Traffic_Conditions        0
Weather                   0
Base_Fare                 0
Per_Km_Rate               0
Per_Minute_Rate           0
Trip_Duration_Minutes     0
Trip_Price               26
dtype: int64

Now I want to see whether Time_of_Day, Day_of_Week, Traffic_Conditions, Weather and Passenger_Count have a big enough impact on price to be worth keeping in my data.

In [159]:
# Print the remaining row count plus per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 588 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       588 non-null    float64
 1   Time_of_Day            588 non-null    object 
 2   Day_of_Week            588 non-null    object 
 3   Passenger_Count        588 non-null    float64
 4   Traffic_Conditions     588 non-null    object 
 5   Weather                588 non-null    object 
 6   Base_Fare              588 non-null    float64
 7   Per_Km_Rate            588 non-null    float64
 8   Per_Minute_Rate        588 non-null    float64
 9   Trip_Duration_Minutes  588 non-null    float64
 10  Trip_Price             562 non-null    float64
dtypes: float64(7), object(4)
memory usage: 55.1+ KB


Here we see that Time_of_Day has no big impact on the price, so we can remove it from the data.

In [160]:
# Mean Trip_Price for each Time_of_Day category.
df.groupby("Time_of_Day")["Trip_Price"].agg("mean")


Time_of_Day
Afternoon    57.831848
Evening      58.131210
Morning      56.671981
Night        58.657759
Name: Trip_Price, dtype: float64

In [161]:
# Mean Trip_Price for each Day_of_Week category.
df.groupby("Day_of_Week")["Trip_Price"].agg("mean")

Day_of_Week
Weekday    59.685419
Weekend    53.407494
Name: Trip_Price, dtype: float64

In [162]:
# Mean Trip_Price for each Traffic_Conditions category.
df.groupby("Traffic_Conditions")["Trip_Price"].agg("mean")


Traffic_Conditions
High      69.169797
Low       55.542874
Medium    54.356849
Name: Trip_Price, dtype: float64

Weather also has only a small impact on price, so I will remove it as well.

In [163]:
# Mean Trip_Price for each Weather category.
df.groupby("Weather")["Trip_Price"].agg("mean")

Weather
Clear    57.342003
Rain     58.844480
Snow     56.850657
Name: Trip_Price, dtype: float64

In [164]:
# Mean Trip_Price for each Passenger_Count value.
df.groupby("Passenger_Count")["Trip_Price"].agg("mean")

Passenger_Count
1.0    56.939087
2.0    58.562152
3.0    54.589746
4.0    60.264663
Name: Trip_Price, dtype: float64

In [165]:
# Remove the two categorical columns whose per-category mean Trip_Price barely
# differed in the groupby checks above.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns are already gone from df.
df = df.drop(columns=["Weather", "Time_of_Day"], errors="ignore")
df

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,3.0,Low,3.56,0.80,0.32,53.82,36.2624
2,36.87,Weekend,1.0,High,2.70,1.21,0.15,37.27,52.9032
5,8.64,Weekend,2.0,Medium,2.55,1.71,0.48,89.33,60.2028
8,30.45,Weekday,3.0,High,2.77,1.78,0.34,110.33,
12,41.79,Weekend,3.0,High,4.60,1.77,0.11,86.95,88.1328
...,...,...,...,...,...,...,...,...,...
991,35.04,Weekend,4.0,Medium,2.90,1.10,0.15,9.99,
992,14.34,Weekday,1.0,Medium,3.23,1.01,0.29,45.07,30.7837
994,18.69,Weekday,3.0,Medium,4.90,1.79,0.17,79.41,51.8548
995,5.49,Weekend,4.0,Medium,2.39,0.62,0.49,58.39,34.4049
