# Adding external features to our dataset:

In [1]:
import pandas as pd

In [2]:
base_path = '../../data/01_additional_data/'
matches_file_name = 'MATCHES_2013-2019.xlsx'
festivities_file_name = 'festivities_calendar_Madrid_2013-2020.csv'

## Matches

In [3]:
# Read matches
matches = pd.read_excel(base_path+matches_file_name,parse_dates = ['Date'] )
matches.head()
matches.columns =  [col.lower() for col in matches.columns]

In [4]:
df_matches = matches.groupby('date')['matches_madrid'].sum().reset_index()

In [5]:
# Filter
df_matches = matches[matches['matches_madrid']>0][['date','matches_madrid']]

In [6]:
df_matches[df_matches['date']<= '2019-09-30'].shape

(1086, 2)

## Festivities

In [7]:
df_public_holidays = pd.read_csv(base_path+festivities_file_name, encoding = "ISO-8859-1", sep = ';', parse_dates = ['Dia'])

In [8]:
df_public_holidays = df_public_holidays.loc[df_public_holidays['laborable / festivo / domingo festivo']=='festivo','Dia'].to_frame()

In [9]:
df_public_holidays['festivo'] = 1

In [10]:
df_public_holidays.head()

Unnamed: 0,Dia,festivo
0,2013-01-01,1
6,2013-07-01,1
76,2013-03-18,1
86,2013-03-28,1
87,2013-03-29,1


In [11]:
df_public_holidays[df_public_holidays['Dia']<= '2019-09-30'].shape

(102, 2)

## Adding it to the transactions dataframe

In [12]:
# Reading the file
input_path = "../../data/03_processed/" + "filtered_transactions_clean"
filtered_file_name="c1-filtered_transactions.csv"
sep=";"
df = pd.read_csv(input_path, sep=sep, parse_dates = ['order_date'])

In [13]:
# Grouping by product and date:
df = df.groupby(['order_date','product'])['units_ordered'].sum().reset_index()
df.shape

(42910, 3)

### Adding Public holidays

In [14]:
df_holidays = pd.merge(df, df_public_holidays, how = 'left', left_on='order_date', right_on='Dia')

In [15]:
df_holidays[df_holidays['festivo']==1]['order_date'].unique().shape

(102,)

In [16]:
df_holidays.head()

Unnamed: 0,order_date,product,units_ordered,Dia,festivo
0,2008-01-01,baguette,86.0,NaT,
1,2008-01-01,croissant petit,178.0,NaT,
2,2008-01-01,croissant simple,137.0,NaT,
3,2008-01-01,milhojas frambuesa,12.0,NaT,
4,2008-01-01,mousse tres chocolates,2.0,NaT,


In [17]:
df_holidays.shape

(42910, 5)

### Adding matches

In [18]:
df_holidays_matches = pd.merge(df_holidays, df_matches, how = 'left', left_on='order_date', right_on='date')

In [19]:
df_holidays_matches.head()

Unnamed: 0,order_date,product,units_ordered,Dia,festivo,date,matches_madrid
0,2008-01-01,baguette,86.0,NaT,,NaT,
1,2008-01-01,croissant petit,178.0,NaT,,NaT,
2,2008-01-01,croissant simple,137.0,NaT,,NaT,
3,2008-01-01,milhojas frambuesa,12.0,NaT,,NaT,
4,2008-01-01,mousse tres chocolates,2.0,NaT,,NaT,


In [20]:
df_holidays_matches[df_holidays_matches['matches_madrid']==1]['order_date'].unique().shape

(612,)

In [21]:
df_holidays_matches = df_holidays_matches[['order_date', 'product', 'units_ordered', 'festivo','matches_madrid']]

In [26]:
from datetime import datetime as dtt

df_holidays_matches['weekday_let']=df_holidays_matches['order_date'].apply(lambda x: dtt.weekday(x))
df_holidays_matches['weekday_num']=df_holidays_matches['order_date'].apply(lambda x: x.strftime('%a'))

In [27]:
df_holidays_matches.fillna(0, inplace = True)

In [28]:
df_holidays_matches.head()

Unnamed: 0,order_date,product,units_ordered,festivo,matches_madrid,weekday_let,weekday_num
0,2008-01-01,baguette,86.0,0.0,0.0,1,Tue
1,2008-01-01,croissant petit,178.0,0.0,0.0,1,Tue
2,2008-01-01,croissant simple,137.0,0.0,0.0,1,Tue
3,2008-01-01,milhojas frambuesa,12.0,0.0,0.0,1,Tue
4,2008-01-01,mousse tres chocolates,2.0,0.0,0.0,1,Tue


In [30]:
print(df_holidays_matches[df_holidays_matches['festivo']==1]['festivo'].count()/10)
print(df_holidays_matches[df_holidays_matches['matches_madrid']==1]['matches_madrid'].count()/10)

107.0
1086.0


In [24]:
exit_path = "../../data/03_processed/" + "time_series.csv"
df_holidays_matches.to_csv(exit_path)

## NOTAS:

FALTAN POR INTRODUCIR LOS DATOS METEOROLÓGICOS

PODRÍAN INTRODUCIRSE TAMBIÉN LOS PARTIDOS DE CHAMPIONS (PROBABLEMENTE EXPLICARÁN MUCHAS ALTERACIONES ENTRE SEMANA)

TANTO EL CALENDARIO DE FESTIVOS COMO EL DE PARTIDOS DEL MADRID COMIENZAN EN 2013. SIN EMBARGO, NUESTRAS SERIES COMIENZAN EN 2008. TENEMOS POR LO TANTO, DOS OPCIONES:

* O BIEN TOMAMOS LAS SERIES DESDE 2008 PARA HACER MODELOS DE SERIES TEMPORALES, Y MODELOS A PARTIR DE 2013 PARA MODELOS DE MACHINE LEARNING

* O BIEN TENEMOS QUE OBTENER LOS DATOS QUE FALTAN

EL CALENDARIO QUE METÍ INCLUÍA COMO 1s LOS PARTIDOS DE EQUIPOS COMO RAYO, GETAFE Y LEGANÉS: A COMPROBAR SI TIENEN INFLUENCIA (PROBABLEMENTE SEA ESCASA). PUEDE AÑADIRSE UNA COLUMNA ALTERNATIVA CON SOLAMENTE "1" EN LOS PARTIDOS DEL MADRID Y EL ATLÉTICO DE MADRID. ASIMISMO, HAY QUE ANALIZAR SI CUENTAN TODOS LOS PARTIDOS, O SÓLO AQUÉLLOS EN LOS QUE JUEGAN EN CASA
    
PROBLEMA CON EL NÚMERO DE PARTIDOS: 1086 EN 6 AÑOS, SON DEMASIADOS
    
EXISTÍA UN PROBLEMA SIMILAR CON EL NÚMERO DE FESTIVOS, PERO YA ESTÁ SOLUCIONADO. AÚN ASÍ, NO COINCIDEN CON LOS QUED HAY EN EL PERIODO 2013-2019 DEL ARCHIVO DE ENTRADA. TENEMOS TAMBIÉN QUE REVISARLOS

