# Feature preparation
In this notebook the features for the ML models are created and filtered. A CSV containing only the relevant features is then saved.

In [1]:
import os
import pandas as pd
import pickle

from sklearn.preprocessing import LabelEncoder  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Location of the trips table, built relative to the current working directory.
path_trips = os.path.join(
    os.getcwd(),
    "..", "data", "output",
    "Trips.csv",
)

### Read parameters from CSV

In [3]:
# Load the run parameters; the column labelled "0" is discarded.
params_file = os.path.join(os.getcwd(), "..", "data", "input", "params.csv")
params = pd.read_csv(params_file).drop(columns="0")

# Each setting is looked up by its name in the "param" column.
# NOTE(review): unlike the two settings below, test_size is used as read,
# without a cast -- confirm its dtype is what train_test_split expects.
_test_size = params.loc[params["param"] == "test_size", "value"].values[0]
_random_state = int(params.loc[params["param"] == "random_state", "value"].values[0])
_pca_components = int(params.loc[params["param"] == "pca_components", "value"].values[0])

In [4]:
# Read the trips table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path_trips)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,p_bike_racks_start,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number_start,p_uid_start,p_bikes_start,p_lat_start,...,24_demand_end,24_demand_hex_big_end,24_demand_hex_small_end,24_agg_time_end,6_demand_end,6_demand_hex_big_end,6_demand_hex_small_end,6_agg_time_end,24_idle_hex_big,24_available_hex_big
0,0,0,False,0,12,2019-01-20 00:07:00,93322,12098234,1,51.041798,...,265,24,4,2019-01-20 00:00:00,45,6.0,1.0,2019-01-20 00:00:00,94.0,447.0
1,1,0,True,0,0,2019-01-20 00:05:00,93576,10299640,5,51.03821,...,265,151,7,2019-01-20 00:00:00,45,30.0,1.0,2019-01-20 00:00:00,121.0,39.0
2,2,0,False,0,12,2019-01-20 00:00:00,93771,12095573,1,51.071262,...,265,24,1,2019-01-20 00:00:00,45,6.0,1.0,2019-01-20 00:00:00,94.0,447.0
3,3,0,False,0,12,2019-01-20 00:26:00,93478,12098942,1,51.028163,...,265,151,2,2019-01-20 00:00:00,45,30.0,1.0,2019-01-20 00:00:00,121.0,39.0
4,4,0,True,0,0,2019-01-20 00:29:00,93577,264599,1,51.04474,...,265,151,6,2019-01-20 00:00:00,45,30.0,2.0,2019-01-20 00:00:00,121.0,39.0


In [5]:
# Print the column dtypes and non-null counts of the trips table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567439 entries, 0 to 567438
Data columns (total 83 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               567439 non-null  int64  
 1   p_bike_racks_start       567439 non-null  int64  
 2   p_spot_start             567439 non-null  bool   
 3   p_booked_bikes_start     567439 non-null  int64  
 4   p_place_type_start       567439 non-null  int64  
 5   datetime_start           567439 non-null  object 
 6   b_number_start           567439 non-null  int64  
 7   p_uid_start              567439 non-null  int64  
 8   p_bikes_start            567439 non-null  int64  
 9   p_lat_start              567439 non-null  float64
 10  b_electric_lock_start    567439 non-null  bool   
 11  b_bike_type_start        567439 non-null  int64  
 12  p_name_start             567439 non-null  object 
 13  p_free_racks_start       567439 non-null  int64  
 14  b_lo

In [6]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0                0
p_bike_racks_start        0
p_spot_start              0
p_booked_bikes_start      0
p_place_type_start        0
                         ..
6_demand_hex_big_end      1
6_demand_hex_small_end    9
6_agg_time_end            9
24_idle_hex_big           0
24_available_hex_big      0
Length: 83, dtype: int64

## Drop not needed columns
Drop the trip-end information and the columns that are not useful.
The features are cleaned in three steps. <br>
- features_1 => drop columns that are not useful
- features_2 => drop end information
- features_3 => drop object types

In [7]:
# --- Step 1: columns that are not useful ------------------------------------
# "Unnamed: 0" is the index column.
# Original author's note: p_terminal_type_end => NaN values (it is not dropped here).
# Disabled in the original list and kept disabled:
#   p_terminal_type_start, b_pedelec_battery_start
print("Drop not int columns and not usefull")
not_useful_columns = ["Unnamed: 0", "p_number_start"]
features_1 = df.drop(columns=not_useful_columns)

# --- Step 2: everything describing the end of the trip ----------------------
# Disabled in the original list and kept disabled:
#   p_address_end, p_terminal_type_end, p_bike_types_end
print("Drop end information")
end_columns = [
    "p_bike_racks_end",
    "p_spot_end",
    "p_booked_bikes_end",
    "p_place_type_end",
    "datetime_end",
    "p_uid_end",
    "p_bikes_end",
    "p_lat_end",
    "p_name_end",
    "p_free_racks_end",
    "p_number_end",
    "p_lng_end",
    "p_maintenance_end",
    "h3_hex_small_id_end",
    "h3_hex_big_id_end",
    "trip_duration",
    "24_demand_end",
    "24_demand_hex_small_end",
    "24_demand_hex_big_end",
    "24_agg_time_end",
    "6_demand_end",
    "6_demand_hex_big_end",
    "6_demand_hex_small_end",
    "6_agg_time_end",
]
features_2 = features_1.drop(columns=end_columns)

# --- Step 3: object-typed columns -------------------------------------------
# Disabled in the original list and kept disabled:
#   p_address_start, p_bike_types_start, MESS_DATUM
print("Drop object types...")
object_columns = [
    "datetime_start",
    "p_name_start",
    "b_lock_types_start",
    "booking_date_start",
]
features_3 = features_2.drop(columns=object_columns)

Drop not int columns and not usefull
Drop end information
Drop object types...


### Dummy
Convert the boolean flag columns to integers and label-encode the hexagon IDs.

In [8]:
# Cast the flag columns to integers so the models receive numeric input.
p_spot_start = features_3["p_spot_start"].astype(int)
b_electric_lock_start = features_3["b_electric_lock_start"].astype(int)
p_maintenance_start = features_3["p_maintenance_start"].astype(int)
weekend = features_3["weekend"].astype(int)

# Label-encode the hexagon ids.
# Each column gets its own encoder so that both fitted mappings stay available.
# The encoded Series carry the index of features_3 and an explicit name, so the
# concat below aligns row by row even if features_3 does not have a default
# RangeIndex, and no positional rename of auto-generated column labels is needed.
le_small = LabelEncoder()
le_big = LabelEncoder()
hexa_small = pd.Series(
    le_small.fit_transform(features_3["h3_hex_small_id_start"]),
    index=features_3.index,
    name="hexa_small",
)
hexa_big = pd.Series(
    le_big.fit_transform(features_3["h3_hex_big_id_start"]),
    index=features_3.index,
    name="hexa_big",
)
# Backward compatibility: this cell previously left `le` fitted on the big hexagon ids.
le = le_big

# Replace the raw columns with their encoded versions, appended at the end in a
# fixed order: flags first, then hexa_small, then hexa_big.
encoded_source_columns = [
    "p_spot_start",
    "b_electric_lock_start",
    "p_maintenance_start",
    "weekend",
    "h3_hex_small_id_start",
    "h3_hex_big_id_start",
]
features = features_3.drop(columns=encoded_source_columns)
features = pd.concat(
    [features, p_spot_start, b_electric_lock_start, p_maintenance_start, weekend, hexa_small, hexa_big],
    axis=1,
)

### Feature Engineering

In [9]:
# Placeholder for engineered features (nothing is added yet). Example:
# features["XYZ"] = np.square(features["XXX"])

In [10]:
# Write the prepared feature table to disk (to_csv also writes the index by default).
path_features = os.path.join(os.getcwd(), "..", "data", "output", "Features.csv")
features.to_csv(path_features)

### Scaling and PCA

In [11]:
def _dump_pickle(obj, filename):
    """Pickle ``obj`` to ``../data/output/models/<filename>`` (relative to the cwd)."""
    models_dir = os.path.join(os.getcwd(), "..", "data", "output", "models")
    # Create the target directory on first use instead of failing in open().
    os.makedirs(models_dir, exist_ok=True)
    # The context manager closes the file handle even if pickling raises.
    with open(os.path.join(models_dir, filename), "wb") as fh:
        pickle.dump(obj, fh)


def train_scaler_pca(hex_size="hexa_small"):
    """
    Fit a StandardScaler and a PCA on the training split and pickle both.

    Reads the notebook-level ``features`` frame and the settings
    ``_random_state``, ``_test_size`` and ``_pca_components``.

    Args:
        hex_size (str): which hexagon column to keep as a feature.
            "hexa_small" keeps ``hexa_small`` and drops ``hexa_big``; any other
            value (intended: "hexa_big") keeps ``hexa_big`` and drops
            ``hexa_small``. The value is also used in the output file names.

    Returns:
        None. Writes ``Standard_Scaler_<hex_size>.pkl`` and
        ``PCA_<hex_size>.pkl`` to ``../data/output/models`` and prints the
        explained variance ratios.
    """

    # FILTER: remove all demand / aggregation-time columns and the availability
    # column from the model inputs.
    features_X = features.drop(["24_demand", "24_demand_hex_big", "24_demand_hex_small", "24_agg_time",
                                "6_demand", "6_demand_hex_big", "6_demand_hex_small", "6_agg_time",
                                "2_demand", "2_demand_hex_big", "2_demand_hex_small", "2_agg_time",
                                "1_demand", "1_demand_hex_big", "1_demand_hex_small", "1_agg_time",
                                "24_available_hex_big"], axis=1)
    # Keep only the hexagon column that matches the requested size.
    if hex_size == "hexa_small":
        features_X = features_X.drop("hexa_big", axis=1)
    else:
        features_X = features_X.drop("hexa_small", axis=1)

    features_y = features["24_demand"]

    # SPLIT: only the training inputs are needed here; the remaining outputs of
    # the split are discarded.
    X_train, _, _, _ = train_test_split(
        features_X, features_y, random_state=_random_state, test_size=_test_size
    )

    # STANDARD SCALER: fit on the training set only, never on the test set.
    st_scaler = StandardScaler()
    st_scaler.fit(X_train)
    _dump_pickle(st_scaler, "Standard_Scaler_" + hex_size + ".pkl")

    # Scale the training data before fitting the PCA.
    X_train_scaled = st_scaler.transform(X_train)

    # PCA
    pca = PCA(n_components=_pca_components)
    pca.fit(X_train_scaled)
    pca_explained_variance = pca.explained_variance_ratio_
    print("Var explained:", pca_explained_variance)
    print("Sum var explained", sum(pca_explained_variance))
    _dump_pickle(pca, "PCA_" + hex_size + ".pkl")

    print("Done")

In [12]:
# Fit and store one scaler/PCA pair for each hexagon size.
hex_sizes = ["hexa_small", "hexa_big"]
for size in hex_sizes:
    train_scaler_pca(hex_size=size)

Var explained: [0.15111753 0.08875992 0.07731926 0.07128437 0.06059471 0.05778791
 0.0559941  0.05110823 0.04907027 0.03668807 0.03225188 0.03007776
 0.02970358 0.02857031 0.02754478 0.02717029 0.02603132 0.01755765
 0.01634274 0.01440483]
Sum var explained 0.9493795180723564
Done
Var explained: [0.15112974 0.08895539 0.07727211 0.07127171 0.06057989 0.05776519
 0.05618349 0.05110355 0.04899236 0.03669037 0.03171153 0.03004762
 0.02986584 0.02852085 0.02765812 0.02693942 0.02625373 0.01767922
 0.01637636 0.01440161]
Sum var explained 0.9493981110512227
Done
