In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.compose import make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config; set_config(display='diagram')

from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [22]:
region_type = {"Vaud": "urban", "Zürich": "urban", "St. Gallen": "urban", "Valais": "rural", "Bern": "urban", "Ticino": "urban", "Genève": "urban", "Aargau": "rural", 
           "Basel-Stadt": "urban", "Thurgau": "rural", "Luzern": "urban", "Obwalden": "rural", "Solothurn": "rural", "Graubünden": "rural", "Basel-Landschaft": "rural",
           "Freiburg": "urban", "Neuchâtel": "urban", "Zug": "urban", "Schwyz": "rural", "Schaffhausen": "rural", "Appenzell Ausserrhoden": "rural", 
            "Appenzell Innerrhoden": "rural", "Jura": "rural", "Uri": "rural", "Glarus": "rural", "Nidwalden": "rural"}

region_language = {"Vaud": "french", "Zürich": "german", "St. Gallen": "german", "Valais": "french", "Bern": "german", "Ticino": "italien", "Genève": "french", 
           "Aargau": "german", "Basel-Stadt": "german", "Thurgau": "german", "Luzern": "german", "Obwalden": "german", "Solothurn": "german", "Graubünden": "german",
           "Basel-Landschaft": "german", "Freiburg": "french", "Neuchâtel": "french", "Zug": "german", "Schwyz": "german", "Schaffhausen": "german",
           "Appenzell Ausserrhoden": "german", "Appenzell Innerrhoden": "german", "Jura": "french", "Uri": "german", "Glarus": "german", "Nidwalden": "german"}

clean = {'GenÃ¨ve': 'Genève', 'ZÃ¼rich': 'Zürich', 'GraubÃ¼nden': 'Graubünden', 'NeuchÃ¢tel': 'Neuchâtel'}

In [23]:
path_from = "raw_data/full_data_processed.csv"
df = pd.read_csv(path_from, delimiter=',', low_memory=False, encoding = 'utf-8')
df = df.drop(columns=['Unnamed: 0', 'id', 'Unnamed: 0.1'])
df = df.rename(columns={'date': 'Date'})

In [40]:
## Drop lines
df = df[df['Item First Supplying Date'].notnull()]
df.drop(df.loc[df['Store Segment']=="Florist"].index, inplace=True)
df.drop(df.loc[df['Total Supply'] < df['Meals Saved']].index, inplace=True)
df = df[df['Total Supply'] >= 0]
df = df.dropna(subset=['Declared Supply'])

In [82]:
# df[df['Meals Saved']==0].count()
df.drop(df.loc[df['Total Supply']==0][df['Declared Supply']==0].sort_values('Total Supply').index, inplace=True)

  df.drop(df.loc[df['Total Supply']==0][df['Declared Supply']==0].sort_values('Total Supply').index, inplace=True)


In [25]:
## Convert Data Types
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df['Item Price'] = df['Item Price'].str.replace(",",".").astype(float)
df['Item First Supplying Date'] = pd.to_datetime(df['Item First Supplying Date'], format='%m/%d/%Y')
df['Item Last Supplying Date'] = pd.to_datetime(df['Item Last Supplying Date'], format='%m/%d/%Y')
df['Pickup Time Start Time of Day'] = pd.to_datetime(df['Pickup Time Start Time of Day'], format='%H:%M')
df['Pickup Time End Time of Day'] = pd.to_datetime(df['Pickup Time End Time of Day'], format='%H:%M')

In [26]:
## Creates Features
df['Pickup Length'] = df['Pickup Time End Time of Day']-df['Pickup Time Start Time of Day']
df['Supply Length'] = df['Item Last Supplying Date']-df['Item First Supplying Date']
df['Supply Length'] = (df['Supply Length'] / np.timedelta64(1,'D')).astype(int)
df['Store Region'] = df['Store Region'].apply(lambda x: clean[x] if x in clean.keys() else x)
df['Region Language'] = df['Store Region'].map(region_language)
df['Region Type'] = df['Store Region'].map(region_type)

In [27]:
## Rename NaN
df['Store Segment'] = df['Store Segment'].replace(np.nan, 'No_Segment')
df['Avg Rating Overall'] = df['Avg Rating Overall'].replace(np.nan, 0.0)

In [31]:
## Drop all columns where date is before first supply date
# df['test'] = df['Item First Supplying Date']-df['Date']
# df['test'] = (df['test'] / np.timedelta64(1,'D')).astype(int)
# df['Pickup Length'] = (df['Pickup Length'] / np.timedelta64(1,'D')).astype(int)

In [29]:
# IMPROVE QUALITY
## Delete all all rows, where the Date is lower then the Start of Supply Date

# EXTEND VALUES
## Build Dimension "Morning, Evening, Afternoon, FullDay"
## Convert Pickup Length to int(min)
## Get ZIP Codes for Swiss City
## Clean City & Store Address without utf-8 errors
## Get Long + Lat for Locations (depending on ZIP-Code task)

In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2567958 entries, 0 to 2663733
Data columns (total 22 columns):
 #   Column                         Dtype          
---  ------                         -----          
 0   Store ID                       int64          
 1   Item ID                        int64          
 2   Store Category                 object         
 3   Store Segment                  object         
 4   Store Region                   object         
 5   City                           object         
 6   Store Address                  object         
 7   Item First Supplying Date      datetime64[ns] 
 8   Item Last Supplying Date       datetime64[ns] 
 9   Item Price                     float64        
 10  Pickup Time Start Time of Day  datetime64[ns] 
 11  Pickup Time End Time of Day    datetime64[ns] 
 12  Date                           datetime64[ns] 
 13  Count Ratings                  int64          
 14  Declared Supply                int64          
 15

In [67]:
df[]

Unnamed: 0,Store ID,Item ID,Store Category,Store Segment,Store Region,City,Store Address,Item First Supplying Date,Item Last Supplying Date,Item Price,...,Date,Count Ratings,Declared Supply,Meals Saved,Total Supply,Avg Rating Overall,Pickup Length,Supply Length,Region Language,Region Type
0,11344,11438,Key Account,Bakery,Genève,GenÃ¨ve,Place de Cornavin 7,2017-12-07,2022-03-01,4.5,...,2019-01-01,0,20,2,20,0.0,0 days 00:50:00,1545,french,urban
1,11729,11822,Indie - Outbound,Buffet,Bern,Bern,LÃ¤nggassstrasse 53,2018-11-02,2019-06-27,6.9,...,2019-01-01,1,0,9,16,4.0,0 days 04:00:00,237,german,urban
2,11729,11822,Indie - Outbound,Buffet,Bern,Bern,LÃ¤nggassstrasse 53,2018-11-02,2019-06-27,6.9,...,2019-01-02,8,0,28,33,4.25,0 days 04:00:00,237,german,urban


In [13]:
df.to_csv('raw_data/full_data_clean.csv')

## Cleaning Functions

In [None]:
## Get long + lat based on City
def geo:
    