In [None]:
# %% [markdown]
# ### Title: Data preparation
# ### Author: Agnes Piecyk
# ### Content:
# #### (1) imports the files umsatzdaten_gekuerzt.csv, kiwo.csv and wetter.csv from URLs and stores the merged data in a pandas DataFrame
# #### (2) includes a brief data exploration 
# #### (3) removes rows with NaN values (n=2309)
# #### (4) converts "Datum" into datetime and adds new columns for "Wochentag", "Monat" and "Jahr"

# %%
# import library
import pandas as pd # library for data manipulation

# Locations of the three raw CSV files
url1 = "https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/kiwo.csv"
url2 = "https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/umsatzdaten_gekuerzt.csv"
url3 = "https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/wetter.csv"

# Load each CSV into its own dataframe (read in the order url1, url2, url3)
kiwo, umsatz, wetter = (pd.read_csv(source) for source in (url1, url2, url3))

# Combine the three tables on the shared "Datum" column:
#   1. inner join of umsatz and wetter -> only dates present in both remain
#   2. left join with kiwo             -> no row from step 1 is dropped
df = umsatz.merge(wetter, on="Datum", how="inner").merge(kiwo, on=["Datum"], how="left")

# Missing "KielerWoche" values (presumably the rows without a match in kiwo)
# are replaced by 0, then the column is stored as integer
df["KielerWoche"] = df["KielerWoche"].fillna(0).astype(int)

# Write the merged dataframe to a csv file, omitting the row index
df.to_csv("merged_data.csv", index=False)

# %%
# Brief data exploration
print(df.head())  # print the first 5 rows of the dataframe
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
df.info()  # print information about the dataframe (9318 entries)
print(df.isnull().sum())  # number of missing values per column
print(df.shape)  # (number of rows, number of columns)

# %%
# check whether the dataframe contains duplicated rows
duplicate_mask = df.duplicated()  # boolean Series flagging duplicated rows
print(duplicate_mask.sum())  # number of flagged rows

# %%
# Compute descriptive statistics of the dataframe, e.g. to spot outliers
summary_statistics = df.describe()
print(summary_statistics)

# %%
# simplify the dataframe by removing rows with NaN values
## drop every row that contains a NaN in at least one column
df = df.dropna()

## check the df without NaN values
## (info() prints its own report and returns None, so it is not wrapped in
## print(), which would otherwise emit an extra "None" line)
df.info() ### 7009 entries

## export the merged dataframe without NaN values to a csv file
df.to_csv("merged_data_withoutNaN.csv", index=False)


