In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.compose import make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config; set_config(display='diagram')

from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [14]:
# One row per canton: (name, settlement type, main language).
# Both lookup dictionaries below are derived from this table, so they always
# share the same keys in the same order.
_CANTONS = (
    ("Vaud", "urban", "french"),
    ("Zürich", "urban", "german"),
    ("St. Gallen", "urban", "german"),
    ("Valais", "rural", "french"),
    ("Bern", "urban", "german"),
    ("Ticino", "urban", "italien"),
    ("Genève", "urban", "french"),
    ("Aargau", "rural", "german"),
    ("Basel-Stadt", "urban", "german"),
    ("Thurgau", "rural", "german"),
    ("Luzern", "urban", "german"),
    ("Obwalden", "rural", "german"),
    ("Solothurn", "rural", "german"),
    ("Graubünden", "rural", "german"),
    ("Basel-Landschaft", "rural", "german"),
    ("Freiburg", "urban", "french"),
    ("Neuchâtel", "urban", "french"),
    ("Zug", "urban", "german"),
    ("Schwyz", "rural", "german"),
    ("Schaffhausen", "rural", "german"),
    ("Appenzell Ausserrhoden", "rural", "german"),
    ("Appenzell Innerrhoden", "rural", "german"),
    ("Jura", "rural", "french"),
    ("Uri", "rural", "german"),
    ("Glarus", "rural", "german"),
    ("Nidwalden", "rural", "german"),
)

# Canton name -> "urban" / "rural".
region_type = {name: kind for name, kind, _ in _CANTONS}

# Canton name -> language label.
region_language = {name: language for name, _, language in _CANTONS}

# Mis-decoded region spellings (UTF-8 bytes read as Latin-1) -> correct name.
clean = {
    'GenÃ¨ve': 'Genève',
    'ZÃ¼rich': 'Zürich',
    'GraubÃ¼nden': 'Graubünden',
    'NeuchÃ¢tel': 'Neuchâtel',
}

In [15]:
# Load the processed export, rename `date` -> `Date` and discard the columns
# listed below in a single chained expression.
path_from = "raw_data/full_data_processed.csv"

# NOTE(review): the two 'Unnamed' columns look like index columns left over
# from earlier CSV round-trips -- confirm against the producer of this file.
unused_columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Store Address', 'id']

df = (
    pd.read_csv(path_from, delimiter=',', low_memory=False, encoding='utf-8')
    .rename(columns={'date': 'Date'})
    .drop(columns=unused_columns)
)

In [16]:
## Drop lines I
# Row filters, applied one after another in the original order. Each step
# builds a named boolean mask and rebinds `df` to the surviving rows.

# Keep rows that have a first supplying date.
has_first_supply_date = df['Item First Supplying Date'].notnull()
df = df[has_first_supply_date]

# Remove the "Florist" store segment.
is_florist = df['Store Segment'] == "Florist"
df = df[~is_florist]

# Remove rows reporting more meals saved than were supplied in total.
saved_exceeds_supply = df['Total Supply'] < df['Meals Saved']
df = df[~saved_exceeds_supply]

# Keep rows whose total supply is zero or positive (missing values fail the
# comparison and are removed as well).
non_negative_supply = df['Total Supply'] >= 0
df = df[non_negative_supply]

# Finally require a declared supply value.
df = df.dropna(subset=['Declared Supply'])

In [17]:
# Remove rows where nothing was supplied at all, i.e. both the total and the
# declared supply are zero.
#
# The two conditions are combined into one mask evaluated on the same frame.
# Chaining them as `df.loc[a][b]` applies a mask built on the full frame to an
# already-filtered frame, which makes pandas reindex the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# Sorting the matched rows before dropping them had no effect on the result
# and is omitted.
no_supply = (df['Total Supply'] == 0) & (df['Declared Supply'] == 0)
df = df[~no_supply]


In [18]:
## Convert Data Types
# Date columns are parsed with the month/day/year format.
for date_column in ('Date', 'Item First Supplying Date', 'Item Last Supplying Date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%m/%d/%Y')

# Pickup times are parsed as hour:minute; pandas fills in its default date
# because the format carries no date part.
for time_column in ('Pickup Time Start Time of Day', 'Pickup Time End Time of Day'):
    df[time_column] = pd.to_datetime(df[time_column], format='%H:%M')

# Swap commas for dots in the price strings, then cast to float.
price_text = df['Item Price'].str.replace(",", ".")
df['Item Price'] = price_text.astype(float)

In [19]:
## Creates Features
# Length of the pickup window as a timedelta (end minus start).
pickup_start = df['Pickup Time Start Time of Day']
pickup_end = df['Pickup Time End Time of Day']
df['Pickup Length'] = pickup_end - pickup_start

# Time between first and last supplying date, stored as an integer number of
# days.
supply_span = df['Item Last Supplying Date'] - df['Item First Supplying Date']
df['Supply Length'] = (supply_span / np.timedelta64(1, 'D')).astype(int)

# Replace the region spellings listed in `clean`; every other value is kept
# unchanged. This must run before the two lookups below, which key on the
# corrected names.
df['Store Region'] = df['Store Region'].apply(lambda region: clean.get(region, region))

# Derive language and urban/rural type from the region name.
df['Region Language'] = df['Store Region'].map(region_language)
df['Region Type'] = df['Store Region'].map(region_type)

In [20]:
## Drop lines II
# Keep only rows whose supply length is non-zero, then discard the two
# supplying-date columns.
supply_date_columns = ['Item First Supplying Date', 'Item Last Supplying Date']
df = df[df['Supply Length'] != 0].drop(columns=supply_date_columns)

In [21]:
## Rename NaN
def _repair_mojibake(value):
    """Undo a UTF-8-read-as-Latin-1 mis-decoding of a single cell value.

    Parameters
    ----------
    value : object
        One cell of the column being repaired.

    Returns
    -------
    object
        The re-decoded string when `value` is a string that round-trips
        through Latin-1 -> UTF-8; otherwise `value` itself, unchanged.

    Non-string values (missing values in particular) are passed through
    instead of being turned into text such as 'nan'. Strings that are already
    correct, or that contain characters outside Latin-1, cannot be re-decoded;
    they are kept as they are rather than aborting the whole column with a
    UnicodeDecodeError / UnicodeEncodeError.
    """
    if not isinstance(value, str):
        return value
    try:
        return value.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return value


# Missing store segments get an explicit placeholder label.
df['Store Segment'] = df['Store Segment'].replace(np.nan, 'No_Segment')
# Missing ratings are stored as 0.0.
df['Avg Rating Overall'] = df['Avg Rating Overall'].replace(np.nan, 0.0)
# Repair mis-decoded city names value by value.
df['City'] = df['City'].map(_repair_mojibake)

In [23]:
# Persist the cleaned frame.
# NOTE(review): `to_csv` writes the index as an unnamed first column by
# default, which comes back as 'Unnamed: 0' when the file is read again --
# confirm whether downstream readers rely on that column before changing it.
path_to = 'raw_data/full_data_clean.csv'
df.to_csv(path_to)

In [12]:
###