# Master BigData UCM 2023

## 1. Datos vehículos a la venta 2016 en UK

Propósito:

- Facilitar la lectura del .csv que contiene los datos de este ejemplo



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import plotnine
from plotnine import *

### Carga de datos

Este dataset tiene ya 7 años pero sigue siendo válido para nuestro propósito. 

La principal ventaja es que tiene una variable de grupo (o "factor") creada en español (la variable "Tipo") para facilitar los gráficos por tipo de vehículo.

In [None]:
# Load the vehicle dataset from the pickled DataFrame stored next to the notebook.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle("datos_vehiculos_2016.pkl")

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for every column, not only the numeric ones.
df.describe(include="all")

In [None]:
# Frequency of each vehicle category, most frequent first.
# Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated since pandas 2.1 and emits a FutureWarning.
count_classes = df['Tipo'].value_counts(sort=True)

# Bar chart of the counts; rot=0 keeps the category labels horizontal.
count_classes.plot(kind='bar', rot=0)
plt.title("Tipos de vehículos UK 2016")
plt.xlabel("Tipo")
plt.ylabel("Frecuencia");  # trailing ';' hides the text repr of the last call

In [None]:
# Plain matplotlib scatter: EngineCapacity on x, MetricCombined on y.
plt.scatter(x=df['EngineCapacity'], y=df['MetricCombined'])

# Axis labels, then render the figure.
plt.xlabel('Tamaño motor (cm3)')
plt.ylabel('Consumo combinado')
plt.show()

# 2. Visualizaciones con ggplot

Además del contenido recogido en la documentación teórica, es aconsejable seguir este excelente (y práctico) tutorial:

https://realpython.com/ggplot-python/


## 2.1 Scatterplot dataset vehículos



In [None]:
# Data + aesthetic mapping only: no geom has been added yet,
# so this shows the axes without any points.
(
    ggplot(df, mapping=aes(x="EngineCapacity", y="MetricCombined"))
)

In [None]:
# The three basic layers of a plot: data, aesthetic mapping, geometric object.
(
    ggplot(
        df,                                  # data to use
        mapping=aes(x="EngineCapacity",      # variables to map
                    y="MetricCombined"),
    )
    + geom_point()                           # geometric object used for drawing
)

In [None]:
# Keep the base plot (data + mapping) in a variable so layers can be added later.
mi_grafico = ggplot(df, mapping=aes(x="EngineCapacity", y="MetricCombined"))

In [None]:
# Add a point layer to the stored base plot; the result is displayed but not stored.
(
    mi_grafico
    + geom_point()
)

In [None]:
# Same scatterplot, with point colour mapped to the 'Tipo' column.
(
    ggplot(
        df,
        mapping=aes(
            x="EngineCapacity",
            y="MetricCombined",
            color="Tipo",
        ),
    )
    + geom_point()
)

In [None]:
# Same scatterplot, with point colour mapped to the 'CO2gkm' column.
(
    ggplot(
        df,
        mapping=aes(
            x="EngineCapacity",
            y="MetricCombined",
            color="CO2gkm",
        ),
    )
    + geom_point()
)

In [None]:
# Replace the points with 2D density contours; the mapping is unchanged.
(
    ggplot(
        df,
        mapping=aes(
            x="EngineCapacity",
            y="MetricCombined",
            color="CO2gkm",
        ),
    )
    + geom_density_2d()
)

In [None]:
import sklearn
import skmisc

In [None]:
# Points coloured by 'Tipo' with a LOWESS smoother on top.
# Colour is part of the plot-level mapping, so both layers inherit it.
(
    ggplot(
        df,
        mapping=aes(
            x="EngineCapacity",
            y="MetricCombined",
            color="Tipo",
        ),
    )
    + geom_point()
    + geom_smooth(method="lowess")
)

In [None]:
# Points plus one smoother over all the data, using geom_smooth's default settings.
(
    ggplot(df, mapping=aes(x="EngineCapacity", y="MetricCombined"))
    + geom_point()
    + geom_smooth()
)

In [None]:
# Points coloured by 'Tipo' with a linear-model smoother.
# To compare with the LOWESS fit, swap method="lm" for method="lowess".
(
    ggplot(
        df,
        mapping=aes(
            x="EngineCapacity",
            y="MetricCombined",
            color="Tipo",
        ),
    )
    + geom_point()
    + geom_smooth(method="lm")
)

In [None]:
!pip install scikit-learn
!pip install scikit-misc

In [None]:
# One LOWESS curve per 'Tipo', each in its own panel laid out as columns.
# The facet formula is passed positionally: newer plotnine releases define
# facet_grid(rows, cols, ...) and do not accept the old `facets=` keyword,
# while the positional formula is understood by both old and new versions.
(
    ggplot(df)  # What data to use
    + aes(x='EngineCapacity',
          y='MetricCombined',
          color='Tipo')  # What variables to use
    + geom_smooth(method='lowess')  # Geometric object to use for drawing
    + facet_grid(". ~ Tipo")
)

In [None]:
# LOWESS curve plus raw points per 'Tipo', each in its own panel laid out as rows.
# The facet formula is passed positionally: newer plotnine releases define
# facet_grid(rows, cols, ...) and do not accept the old `facets=` keyword,
# while the positional formula is understood by both old and new versions.
(
    ggplot(df)  # What data to use
    + aes(x='EngineCapacity',
          y='MetricCombined',
          color='Tipo')  # What variables to use
    + geom_smooth(method='lowess')  # Geometric object to use for drawing
    + geom_point()
    + facet_grid("Tipo ~ . ")
)

In [None]:
# Box plot of 'EmissionsNOxmgkm' for each 'Tipo'.
# coord_flip() swaps the axes so the boxes are drawn horizontally.
# Optional: append `+ theme_classic()` to change the theme.
(
    ggplot(df, mapping=aes(x="Tipo", y="EmissionsNOxmgkm"))
    + geom_boxplot()
    + coord_flip()
)

In [None]:
# Row-faceted plot (LOWESS curve + points per 'Tipo'), stored so it can be saved to disk.
# The facet formula is passed positionally: newer plotnine releases define
# facet_grid(rows, cols, ...) and do not accept the old `facets=` keyword,
# while the positional formula is understood by both old and new versions.
grafico = (
    ggplot(df)  # What data to use
    + aes(x='EngineCapacity',
          y='MetricCombined',
          color='Tipo')  # What variables to use
    + geom_smooth(method='lowess')  # Geometric object to use for drawing
    + geom_point()
    + facet_grid("Tipo ~ . ")
)

In [None]:
# Write the stored plot to a PNG file at 600 dpi.
# ggsave(plot, ...) simply forwards to plot.save(...), so the method form is equivalent.
grafico.save(filename='enero_024.png', dpi=600)

In [None]:
# Column-faceted plot (one LOWESS curve per 'Tipo'), stored so it can be saved to disk.
# The facet formula is passed positionally: newer plotnine releases define
# facet_grid(rows, cols, ...) and do not accept the old `facets=` keyword,
# while the positional formula is understood by both old and new versions.
grafico2 = (
    ggplot(df)  # What data to use
    + aes(x='EngineCapacity',
          y='MetricCombined',
          color='Tipo')  # What variables to use
    + geom_smooth(method='lowess')  # Geometric object to use for drawing
    + facet_grid(". ~ Tipo")
)

In [None]:
# Save the column-faceted plot as a wide PNG.
# An explicit filename is given: without one, plotnine falls back to an
# auto-generated "plotnine-save-<hash>" name that changes from run to run,
# leaving a new, hard-to-identify file in the working directory each time.
grafico2.save(filename='grafico2_lowess_por_tipo.png',
              width=24,
              height=14,
              dpi=300,
              format='png')

In [None]:
# Save the row-faceted plot as a tall figure.
# An explicit filename is given: without one, plotnine falls back to an
# auto-generated "plotnine-save-<hash>" name that changes from run to run.
# NOTE(review): the .pdf extension keeps the format plotnine uses by default
# when neither filename nor format is specified — confirm that PDF is intended.
grafico.save(filename='grafico_lowess_por_tipo.pdf',
             width=14,
             height=24,
             dpi=300)