In [16]:
import os
import numpy as np
import pandas as pd

## <p style='text-align: center; text-decoration: underline; color: #10A0B4;'> Datos oficiales de la **Union Europea** </p>
[Origen de los datos](https://www.eea.europa.eu/en/datahub/datahubitem-view/fa8b1229-3db6-495d-b18e-9c9b3267c02b)

---

## Autos registrados desde **2017** hasta **2023**

---

# <p style='text-align: center; color: orange;'>Normativa euro 6</p>

# <p style='text-align: center; text-decoration: underline; color: green;'>Limpiado de datos</p>
#### El dataset contiene mas de **14 Gigas** de informacion (69.643.588 filas).

El gran tamano del conjunto de datos trae problemas:
- No puedo subirlo a GitHub (+100MB). 
- Lento procesamiento.

Vamos a resolver ambas cuentiones achicando el dataset de manera tal que el impacto por usar menos informacion sea minimo

In [17]:
weight_file_GB = ((os.path.getsize('data.csv')/1024)/1024)/1024
print('El archivo pesa', round(weight_file_GB, 2), 'Gigas')

El archivo pesa 14.07 Gigas


In [18]:
# Obtenemos las columnas del Data Frame
df_columns = pd.read_csv('data.csv', nrows=0)
print('Columnas del data frame crudo: \n', df_columns.columns)

# Cargamos unicamente las que nos interesan
df_columns.drop(columns=['year', 'Country', 'Date of registration', 'Enedc (g/km)', 'r', 'VFN', 'Mp', 'Man', 'Tan', 'T', 'Va', 'Ve', 'MMS', 'Ct', 'Cr', 'Mt', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'Fm', 'IT', 'Ernedc (g/km)', 'Erwltp (g/km)', 'De','Vf', 'Status', 'ech', 'RLFI', 'Mh', 'z (Wh/km)', 'ep (KW)', 'Mk', 'Cn', 'Electric range (km)'], inplace=True)
columns_to_load = ['ID', 'm (kg)', 'Ewltp (g/km)', 'Ft', 'ec (cm3)', 'Fuel consumption ']

# Separamos la informacion en fragmentos de 1 millon para que la computadora no se sobresature
df_crud = pd.read_csv('data.csv', chunksize=1000000, usecols=columns_to_load)

print('\n\nColumnas que nos sirven para el proyecto: \n', df_columns.columns)

Columnas del data frame crudo: 
 Index(['ID', 'Country', 'VFN', 'Mp', 'Mh', 'Man', 'MMS', 'Tan', 'T', 'Va',
       'Ve', 'Mk', 'Cn', 'Ct', 'Cr', 'r', 'm (kg)', 'Mt', 'Enedc (g/km)',
       'Ewltp (g/km)', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'Ft', 'Fm',
       'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 'IT', 'Ernedc (g/km)',
       'Erwltp (g/km)', 'De', 'Vf', 'Status', 'year', 'Date of registration',
       'Fuel consumption ', 'ech', 'RLFI', 'Electric range (km)'],
      dtype='object')


Columnas que nos sirven para el proyecto: 
 Index(['ID', 'm (kg)', 'Ewltp (g/km)', 'Ft', 'ec (cm3)', 'Fuel consumption '], dtype='object')


In [19]:
# Creamos el dataframe que utilisaremos
df = pd.DataFrame(columns=df_columns.columns)

# Cambiamos de nombre algunas columnas
rename_columns = {'m (kg)': 'Mass_(kg)', 'Ewltp (g/km)': 'CO2_emission_(g/km)', 'Ft': 'Fuel_type', 'ec (cm3)': 'Engine_size_cm3', 'Fuel consumption ': 'Fuel_consumption_(l/100km)'}
df.rename(columns=rename_columns, inplace=True)

for chunk in df_crud:

    chunk.rename(columns=rename_columns, inplace=True)    
    chunk = chunk.dropna()
    
    df = pd.concat([df, chunk], ignore_index=True)
    
print(df.shape)

# Lo guardo en un archivo aparte para evitar volver a ejecutar esta celda
df.to_csv('df_data_clean.csv', index=False)

  df = pd.concat([df, chunk], ignore_index=True)


(15023885, 6)


---
---
---

### <p style='color: green; text-align: center;'>Vamos a utilizar 1 millon de filas para que el archivo pese menos de 100 megas y poder subirlo a GitHub</p>

##### Primero limpiamos los datos

In [20]:
df1 = pd.read_csv('df_data_clean.csv')
print('Tamano del archivo df1:', (round((os.path.getsize('df_data_clean.csv')/1024)/1024, 2)), 'MB')
df1.shape

Tamano del archivo df1: 591.11 MB


(15023885, 6)

In [21]:
#Cuantos tipos de combustible hay
print(df1['Fuel_type'].value_counts())

for fuel_type in df1['Fuel_type'].unique():
    print(fuel_type, len(fuel_type))

Fuel_type
petrol             5101007
PETROL             4334156
DIESEL             1891905
diesel             1815946
petrol/electric     828111
PETROL/ELECTRIC     451775
lpg                 237865
LPG                 164343
DIESEL/ELECTRIC      55721
diesel/electric      41836
e85                  38871
NG                   34385
ng                   17583
E85                   5969
NG-BIOMETHANE         4408
unknown                  2
UNKNOWN                  1
ELECTRIC                 1
Name: count, dtype: int64
PETROL 6
PETROL/ELECTRIC 15
DIESEL 6
LPG 3
NG-BIOMETHANE 13
DIESEL/ELECTRIC 15
NG 2
E85 3
UNKNOWN 7
ELECTRIC 8
petrol 6
diesel 6
lpg 3
petrol/electric 15
diesel/electric 15
ng 2
e85 3
unknown 7


In [22]:
#Todo en minuscula y sin espacios
df1['Fuel_type'] = df1['Fuel_type'].str.lower().replace(' ', '').apply(lambda x: x.strip())

#Eliminamos los que no se sabe su combustible
word_to_delete = ['unknown']
def follow_words(text, words):
    return any(word in str(text) for word in words)

df1 = df1[~df1.applymap(lambda x: follow_words(x, word_to_delete)).any(axis=1)]

df1['Fuel_type'].value_counts()

  df1 = df1[~df1.applymap(lambda x: follow_words(x, word_to_delete)).any(axis=1)]


Fuel_type
petrol             9435163
diesel             3707851
petrol/electric    1279886
lpg                 402208
diesel/electric      97557
ng                   51968
e85                  44840
ng-biomethane         4408
electric                 1
Name: count, dtype: int64

### +85% de los vehiculos usa nafta o diesel
Unicamente vamos a utilizar los de nafta y diesel

In [23]:
fuel_type_count = dict(df1[['Fuel_type']].value_counts())
print(fuel_type_count)

fuel_type_count = {str(key[0]): value for key, value in fuel_type_count.items()}
print(fuel_type_count)

fuel_type_count = {key: round((value/df1.shape[0])*100, 2) for key, value in fuel_type_count.items()}
print('\n', fuel_type_count)

df1 = df1[df1['Fuel_type'].isin(['petrol', 'diesel'])]
df1[['Fuel_type']].value_counts()

{('petrol',): 9435163, ('diesel',): 3707851, ('petrol/electric',): 1279886, ('lpg',): 402208, ('diesel/electric',): 97557, ('ng',): 51968, ('e85',): 44840, ('ng-biomethane',): 4408, ('electric',): 1}
{'petrol': 9435163, 'diesel': 3707851, 'petrol/electric': 1279886, 'lpg': 402208, 'diesel/electric': 97557, 'ng': 51968, 'e85': 44840, 'ng-biomethane': 4408, 'electric': 1}

 {'petrol': 62.8, 'diesel': 24.68, 'petrol/electric': 8.52, 'lpg': 2.68, 'diesel/electric': 0.65, 'ng': 0.35, 'e85': 0.3, 'ng-biomethane': 0.03, 'electric': 0.0}


Fuel_type
petrol       9435163
diesel       3707851
Name: count, dtype: int64

In [24]:
df1['Engine_size_cm3'] = df1['Engine_size_cm3']/1000
df1.rename(columns={'Engine_size_cm3': 'Engine_size'}, inplace=True)

df1.to_csv('df_data_clean.csv')

df1

Unnamed: 0,ID,Mass_(kg),CO2_emission_(g/km),Fuel_type,Engine_size,Fuel_consumption_(l/100km)
0,56003309,1710.0,128.0,petrol,2.487,5.7
1,56003313,1710.0,128.0,petrol,2.487,5.7
2,56003314,1710.0,128.0,petrol,2.487,5.7
3,56003351,1635.0,160.0,petrol,1.987,7.1
4,56003352,1635.0,160.0,petrol,1.987,7.1
...,...,...,...,...,...,...
15023880,86000221,985.0,125.0,petrol,1.193,5.5
15023881,86000222,985.0,125.0,petrol,1.193,5.5
15023882,86000223,985.0,125.0,petrol,1.193,5.5
15023883,86000224,985.0,125.0,petrol,1.193,5.5


In [32]:
df1 = df1[df['Fuel_consumption_(l/100km)'] > 5.4]
df1

  df1 = df1[df['Fuel_consumption_(l/100km)'] > 5.4]


Unnamed: 0,ID,Mass_(kg),CO2_emission_(g/km),Fuel_type,Engine_size,Fuel_consumption_(l/100km)
0,56003309,1710.0,128.0,petrol,2.487,5.7
1,56003313,1710.0,128.0,petrol,2.487,5.7
2,56003314,1710.0,128.0,petrol,2.487,5.7
3,56003351,1635.0,160.0,petrol,1.987,7.1
4,56003352,1635.0,160.0,petrol,1.987,7.1
...,...,...,...,...,...,...
15023880,86000221,985.0,125.0,petrol,1.193,5.5
15023881,86000222,985.0,125.0,petrol,1.193,5.5
15023882,86000223,985.0,125.0,petrol,1.193,5.5
15023883,86000224,985.0,125.0,petrol,1.193,5.5


---

In [33]:
# Vamos a utilizar menos volumen de datos para poder maniobrar mas facil
df_aux = df1.sample(1000000)
df_aux.to_csv('df_fit.csv', index=False)
print('Tamano del archivo df2:', round((os.path.getsize('df_fit.csv')/1024)/1024, 2), 'MB')

df2 = pd.read_csv('df_fit.csv')
print(df2.shape)
df2

Tamano del archivo df2: 38.02 MB
(1000000, 6)


Unnamed: 0,ID,Mass_(kg),CO2_emission_(g/km),Fuel_type,Engine_size,Fuel_consumption_(l/100km)
0,85705304,1128.0,125.0,petrol,1.199,5.5
1,52096030,1315.0,127.0,petrol,0.999,5.6
2,81371067,1334.0,133.0,petrol,0.999,5.9
3,56380957,2020.0,169.0,diesel,2.925,6.4
4,82639177,1190.0,133.0,petrol,1.490,5.9
...,...,...,...,...,...,...
999995,82160439,1395.0,150.0,petrol,1.332,6.6
999996,80175442,1377.0,132.0,petrol,1.482,5.8
999997,57385707,1263.0,125.0,petrol,1.199,5.5
999998,51177448,1746.0,227.0,petrol,1.984,10.0
