In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup



In [None]:
# Load the data.
# The eighth column of train.csv is forced to string dtype. Its name is taken
# from the file's header row (nrows=0 reads the header only), because
# train_df does not exist yet at this point and cannot be used to look it up.
train_header = pd.read_csv('train.csv', nrows=0).columns
dtype_spec = {
    train_header[7]: 'str'
}

train_df = pd.read_csv('train.csv', dtype=dtype_spec)
test_df = pd.read_csv('test.csv')
store_df = pd.read_csv('store.csv')

In [None]:
# Preview the first five rows of the training data.
print(train_df.head(n=5))


In [None]:
# Preview the first five rows of the test data.
print(test_df.head(n=5))

In [None]:
# Preview the first five rows of the store data.
print(store_df.head(n=5))

In [None]:
# Attach the store attributes to the training and test rows.
# A left join on 'Store' keeps every row of the left-hand frame.
train_df = pd.merge(train_df, store_df, how='left', on='Store')
test_df = pd.merge(test_df, store_df, how='left', on='Store')


In [None]:
# Preview the first five rows of the training data after the merge.
print(train_df.head(n=5))


In [None]:
# Preview the first five rows of the test data after the merge.
print(test_df.head(n=5))


In [None]:
# Normalise column names: lower-case them and turn spaces into underscores.
for frame in (train_df, test_df):
    frame.columns = [label.replace(' ', '_').lower() for label in frame.columns]


In [None]:
# Count the missing values in each column of the training data.
print(train_df.isna().sum())


In [None]:
# Count the missing values in each column of the test data.
print(test_df.isna().sum())

In [None]:
# Replace missing values with 0 in the competition and promotion columns.
#
# The fill is done through DataFrame.fillna with a column -> value mapping and
# the result is assigned back. Calling `df[col].fillna(0, inplace=True)` acts
# on a temporary Series: it triggers a chained-assignment warning in recent
# pandas and leaves the frame unchanged when Copy-on-Write is enabled.
zero_fill_values = {
    # Competition columns
    'competitiondistance': 0,
    'competitionopensincemonth': 0,
    'competitionopensinceyear': 0,
    # Promotion columns
    'promo2sinceweek': 0,
    'promo2sinceyear': 0,
    'promointerval': 0,
}

train_df = train_df.fillna(value=zero_fill_values)
test_df = test_df.fillna(value=zero_fill_values)

In [None]:
# Show the data type of every column in the training data.
dtype_overview = train_df.dtypes
print(dtype_overview)

In [None]:
# Draw one boxplot per column to visualise outliers in 'sales' and
# 'competitiondistance'. Each plot gets its own 12x6 figure.
boxplot_specs = [
    ('sales', 'Boxplot de Sales'),
    ('competitiondistance', 'Boxplot de Competition Distance'),
]

for column_name, plot_title in boxplot_specs:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=train_df[column_name])
    plt.title(plot_title)
    plt.show()

In [None]:
# Convert the flag columns to booleans.


def _holiday_flag(values):
    """Return a boolean Series that is False where the value is '0'.

    A plain ``astype(bool)`` is not usable when the column holds strings:
    every non-empty string, including '0', converts to True. The values are
    therefore compared against the text '0' instead.

    A Series that is already boolean is returned unchanged so that re-running
    the cell does not alter the result.
    """
    if values.dtype == bool:
        return values
    return values.astype(str).str.strip() != '0'


for frame in (train_df, test_df):
    for column_name in ('promo', 'promo2', 'schoolholiday'):
        frame[column_name] = frame[column_name].astype(bool)
    # NOTE(review): assumes 'stateholiday' encodes "no holiday" as 0 / '0' and
    # any other code as a holiday -- confirm against the dataset description.
    frame['stateholiday'] = _holiday_flag(frame['stateholiday'])

In [None]:
# Parse the 'date' column of both frames into datetime values.
for frame in (train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Add year, month and day columns derived from the training 'date' column.
for date_part in ('year', 'month', 'day'):
    train_df[date_part] = getattr(train_df['date'].dt, date_part)


In [None]:
# Add year, month and day columns derived from the test 'date' column.
for date_part in ('year', 'month', 'day'):
    test_df[date_part] = getattr(test_df['date'].dt, date_part)


In [None]:
# URL of the weather data; the requested date is appended to the query string.
url_weather = 'http://www.estesparkweather.net/archive_reports.php?date='


def scrape_weather_data(date, timeout=30):
    """Download the weather report page for ``date`` and parse its table rows.

    Parameters
    ----------
    date : str
        Text appended to ``url_weather`` to build the request URL.
        NOTE(review): confirm which date format the site expects.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least three cells, with the string
        columns 'time', 'temperature' and 'humidity'. The columns are present
        even when no row was found.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        not parsed as if it were weather data.
    """
    response = requests.get(url_weather + date, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract weather data. This assumes the first three cells of a row are
    # time, temperature and humidity -- adjust to the website's structure.
    weather_data = []
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Header, spacer or malformed rows do not carry the three values.
        if len(cells) < 3:
            continue
        weather_data.append({
            'time': cells[0].text.strip(),
            'temperature': cells[1].text.strip(),
            'humidity': cells[2].text.strip(),
        })

    return pd.DataFrame(weather_data, columns=['time', 'temperature', 'humidity'])


In [None]:
# Scrape the weather page once for every distinct date in the training data.
# The result is a list holding one DataFrame per date, in order of first
# appearance of the date.
weather_data = []
unique_dates = train_df['date'].dt.strftime('%Y%m%d').unique()
for date_text in unique_dates:
    daily_frame = scrape_weather_data(date_text)
    weather_data.append(daily_frame)
    