## ANÁLISIS DE TENDENCIAS DE CONSUMOS E INFRAESTRUCTURAS

_By JoseForguez & DiegoTondo, 2020-08-09_

In [1]:
import pandas as pd
import numpy as np
import warnings
import glob
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

import plotly.graph_objs as go
import plotly.offline as plotly
from plotly.subplots import make_subplots

import matplotlib.dates as md

In [2]:
#pd.set_option('display.max_columns', 50)

#### LOAD TRANSACTIONS

In [3]:
# Not loaded the conventional way (from CSV); the tables are loaded with their
# types already converted (Martin).
#df_transa = pd.read_csv('../dataset/data_csv/sis_transa_201801_202007.csv', parse_dates=['fecha'])

# Load the transactions from the .pkl file.
df_transa = pd.read_pickle('dataframe_transa.pkl.zip')

In [4]:
# List the column names of the loaded transactions table.
df_transa.columns

Index(['id_transaccion', 'id_vehiculo', 'id_usuario', 'id_equipo', 'id_bomba',
       'id_tanque', 'producto', 'fecha_hora', 'fecha_hora_stop',
       'departamento', 'codigo_error', 'cantidad', 'acum_vehiculo',
       'acum_usuario', 'odometro', 'km_transaccion', 'valor', 'volume_start',
       'volume_stop', 'temp_start', 'temp_stop', 'local_price', 'geo_latitud',
       'geo_longitud', 'veh_efficiency', 'baja'],
      dtype='object')

In [5]:
# Columns removed from the transactions table for this analysis.
c2drop = [
    'acum_usuario',
    'acum_vehiculo',
    'km_transaccion',
    'local_price',
    'odometro',
    'valor',
    'veh_efficiency',
]
# `columns=` already targets the column axis, so no separate `axis` is passed.
df_transa.drop(columns=c2drop, inplace=True)
# Preview the first five remaining rows.
df_transa.head(5)

Unnamed: 0,id_transaccion,id_vehiculo,id_usuario,id_equipo,id_bomba,id_tanque,producto,fecha_hora,fecha_hora_stop,departamento,codigo_error,cantidad,volume_start,volume_stop,temp_start,temp_stop,geo_latitud,geo_longitud,baja
0,6974,39920,1045,139,174,134,1.0,2018-01-01 08:27:00,NaT,0,BF,115.972,,,,,,,False
1,6975,39819,1041,139,174,134,1.0,2018-01-01 13:56:00,NaT,0,BF,127.906998,,,,,,,False
2,6976,39931,1055,139,174,134,1.0,2018-01-01 16:54:00,NaT,0,BF,65.570999,,,,,,,False
3,6977,39922,1047,139,174,134,1.0,2018-01-01 20:38:00,NaT,0,BF,71.700996,,,,,,,False
4,5250,41504,436,111,100,81,1.0,2018-01-01 00:23:00,NaT,1,C1,360.018005,,,,,,,False


In [6]:
# Show the transactions whose 'cantidad' value is missing.
df_transa.loc[df_transa['cantidad'].isna()]

Unnamed: 0,id_transaccion,id_vehiculo,id_usuario,id_equipo,id_bomba,id_tanque,producto,fecha_hora,fecha_hora_stop,departamento,codigo_error,cantidad,volume_start,volume_stop,temp_start,temp_stop,geo_latitud,geo_longitud,baja
247703,0,0,0,10,0,0,,NaT,NaT,,,,,,,,,,False


In [7]:
# Remove, in place, every row whose 'cantidad' value is missing.
df_transa.dropna(axis=0, subset=['cantidad'], inplace=True)

In [8]:
# Show the dtype of each remaining transactions column.
df_transa.dtypes

id_transaccion             uint16
id_vehiculo                uint32
id_usuario                 uint16
id_equipo                  uint32
id_bomba                   uint32
id_tanque                  uint32
producto                  float32
fecha_hora         datetime64[ns]
fecha_hora_stop    datetime64[ns]
departamento               object
codigo_error               object
cantidad                  float32
volume_start              float32
volume_stop               float32
temp_start                float32
temp_stop                 float32
geo_latitud               float32
geo_longitud              float32
baja                         bool
dtype: object

In [9]:
# (rows, columns) of the transactions table before the value filters below.
df_transa.shape

(2820448, 19)

In [10]:
# Keep only the transactions whose 'cantidad' is zero or positive.
df_transa = df_transa.loc[df_transa['cantidad'].ge(0)]
df_transa.shape

(2819561, 19)

In [11]:
# Discard the transactions dated before 2018-01-01. The test is a negated
# "earlier than" comparison, so rows whose 'fecha_hora' is NaT are kept.
df_transa = df_transa.loc[
    ~(df_transa['fecha_hora'] < '2018-01-01 00:00:00')
]
df_transa.shape

(2819201, 19)

In [12]:
# Check for transactions with a missing 'id_equipo' (the join key used later).
df_transa[df_transa['id_equipo'].isna()]

Unnamed: 0,id_transaccion,id_vehiculo,id_usuario,id_equipo,id_bomba,id_tanque,producto,fecha_hora,fecha_hora_stop,departamento,codigo_error,cantidad,volume_start,volume_stop,temp_start,temp_stop,geo_latitud,geo_longitud,baja


#### LOAD SITES

In [13]:
# Load the sites (equipos) table from its zipped pickle and display it.
df_equipo = pd.read_pickle("dataframe_equipos.pkl.zip")
df_equipo

Unnamed: 0,id_equipo,id_empresa,current_firmware,delay_ue,geo_latitude,geo_longitude
0,585,25,V60403bQ4830361,0,,
1,749,210,V80604bQ0428b17,0,,
2,584,1000,V60403bQ4830361,0,,
3,581,1025,V60403bQ4830361,0,-38.375065,-68.622688
4,599,1050,V60403bQ59fd4f2,180,-31.528835,-68.514709
...,...,...,...,...,...,...
682,813,25,,0,,
683,810,25,,0,,
684,816,25,,0,,
685,819,193,V70604bQ0428b17,0,,


In [14]:
# (rows, columns) of the sites table.
df_equipo.shape

(687, 6)

#### LOAD COMPANIES

In [15]:
# Load the companies (empresas) table from its zipped pickle and display it.
df_emp = pd.read_pickle("dataframe_empresas.pkl.zip")
df_emp

Unnamed: 0,id_empresa,hash_empresa,pais,provincia,ciudad,cp,baja,segmento
0,1,5db5b11547f4d1211cb214641003e8dc,Argentina,,,,False,Demo
1,2,1c30b9e1977a521dbea216c753f885d7,Argentina,,,,False,Demo
2,12,539d586bbb63b67b0fa061fdc6261663,Argentina,,,,False,Transporte
3,13,cdddc8771f6e4b67179ff57df346cd71,Argentina,,,,False,Mineria
4,14,7df8dafb9e88317f5c9c4563db20e6e9,Argentina,,,,False,Transporte
...,...,...,...,...,...,...,...,...
325,1078,bbbe0c06ab3498704aa92623c5998c3d,,,,,False,Constructoras
326,1079,99e0903277d59dad4c35eecb08044920,,Buenos Aires,General Villegas,,False,Agro
327,1081,433b2ede19346280c63fa74a5f3cf80c,,Buenos Aires,CABA,1043,False,Transporte
328,1082,ce1d80a2dbc563f156e2fb2c63114a9c,,Misiones,Aristóbulo del valle,,False,Transporte


##### Segmentos de empresa
 * d: Demo
 * t: Transporte
 * m: Mineria
 * o: Oil and Gas
 * i: Industria
 * a: Agro
 * e: Estaciones de Servicio
 * c: Constructoras
 * s: Telcos (telecomunicaciones)

In [16]:
# List the column names of the companies table.
df_emp.columns

Index(['id_empresa', 'hash_empresa', 'pais', 'provincia', 'ciudad', 'cp',
       'baja', 'segmento'],
      dtype='object')

In [17]:
# Drop the location columns from the companies table, in place.
df_emp.drop(columns=['pais', 'provincia', 'ciudad', 'cp'], inplace=True)

In [18]:
# Show the dtype of each remaining companies column.
df_emp.dtypes

id_empresa       int16
hash_empresa    object
baja              bool
segmento        object
dtype: object

In [19]:
# Display the companies table after the location columns were dropped.
df_emp

Unnamed: 0,id_empresa,hash_empresa,baja,segmento
0,1,5db5b11547f4d1211cb214641003e8dc,False,Demo
1,2,1c30b9e1977a521dbea216c753f885d7,False,Demo
2,12,539d586bbb63b67b0fa061fdc6261663,False,Transporte
3,13,cdddc8771f6e4b67179ff57df346cd71,False,Mineria
4,14,7df8dafb9e88317f5c9c4563db20e6e9,False,Transporte
...,...,...,...,...
325,1078,bbbe0c06ab3498704aa92623c5998c3d,False,Constructoras
326,1079,99e0903277d59dad4c35eecb08044920,False,Agro
327,1081,433b2ede19346280c63fa74a5f3cf80c,False,Transporte
328,1082,ce1d80a2dbc563f156e2fb2c63114a9c,False,Transporte


#### LOAD VEHICLES

In [20]:
# Load the vehicles table from its zipped pickle.
df_veh = pd.read_pickle('dataframe_vehiculos.pkl.zip')

In [21]:
# List the column names of the vehicles table.
df_veh.columns

Index(['id_vehiculo', 'id_equipo', 'baja', 'main_id'], dtype='object')

In [22]:
# Display the vehicles table.
df_veh

Unnamed: 0,id_vehiculo,id_equipo,baja,main_id
0,2,333333,False,000001
1,3,333333,False,000239
2,4,333333,False,000270
3,5,333333,False,000252
4,6,333333,False,000235
...,...,...,...,...
51565,102075,822,False,000849
51566,102076,832,False,000849
51567,102077,542,False,999999
51568,102078,661,False,999999


In [23]:
# Remove 'id_equipo' from the vehicles table, in place, then list the
# dtypes of the columns that remain.
df_veh.drop(columns=['id_equipo'], inplace=True)
df_veh.dtypes

id_vehiculo     int32
baja             bool
main_id        object
dtype: object

#### LOAD PRODUCTS

In [24]:
# Load the products table from its zipped pickle and preview the first rows.
df_prod = pd.read_pickle('dataframe_productos.pkl.zip')
df_prod.head(5)

Unnamed: 0,id_equipo,producto,nombre_producto
0,333333,0,DIESEL
1,333333,1,DESCONOCIDO
2,333333,2,DESCONOCIDO
3,333333,3,DESCONOCIDO
4,1,0,DIESEL


***
## **Merge Dataframes**

In [25]:
# Shapes of the sites and companies tables before they are merged.
df_equipo.shape, df_emp.shape

((687, 6), (330, 4))

In [26]:
# Attach the company columns to each site. This is an inner join (the merge
# default), so only 'id_empresa' values present in both tables are kept.
dfe = df_equipo.merge(df_emp, how='inner', on='id_empresa')
dfe.shape

(687, 9)

In [27]:
# (rows, columns) of the cleaned transactions table before merging.
df_transa.shape

(2819201, 19)

In [28]:
# Left-join the site/company columns onto every transaction. Both inputs
# carry a 'baja' column, which the merge suffixes as baja_x / baja_y; rename
# them so the origin of each flag is explicit.
df = (
    df_transa
    .merge(dfe, how='left', on='id_equipo')
    .rename(columns={'baja_x': 'baja_transaccion', 'baja_y': 'baja_empresa'})
)
df.shape

(2819201, 27)

In [29]:
# Show the dtype of each column of the merged transactions table.
df.dtypes

id_transaccion              uint16
id_vehiculo                 uint32
id_usuario                  uint16
id_equipo                   uint32
id_bomba                    uint32
id_tanque                   uint32
producto                   float32
fecha_hora          datetime64[ns]
fecha_hora_stop     datetime64[ns]
departamento                object
codigo_error                object
cantidad                   float32
volume_start               float32
volume_stop                float32
temp_start                 float32
temp_stop                  float32
geo_latitud                float32
geo_longitud               float32
baja_transaccion              bool
id_empresa                 float64
current_firmware            object
delay_ue                   float64
geo_latitude               float32
geo_longitude              float32
hash_empresa                object
baja_empresa                object
segmento                    object
dtype: object

In [30]:
# Keep only the rows whose 'baja_transaccion' flag is False.
df = df.loc[~df['baja_transaccion']]

In [31]:
# (rows, columns) after removing rows flagged in 'baja_transaccion'.
df.shape

(2819198, 27)

In [32]:
# Left-join the vehicle columns onto every transaction and rename the
# vehicle table's 'baja' column to 'baja_vehiculo'.
dfa = (
    df
    .merge(df_veh, how='left', on='id_vehiculo')
    .rename(columns={'baja': 'baja_vehiculo'})
)
dfa.shape

(2819198, 29)

In [33]:
# Preview a subset of columns from the merged table.
dfa.loc[:, [
    'fecha_hora',
    'id_vehiculo',
    'main_id',
    'id_empresa',
    'cantidad',
    'segmento',
    'id_tanque',
    'id_bomba',
    'hash_empresa',
]]

Unnamed: 0,fecha_hora,id_vehiculo,main_id,id_empresa,cantidad,segmento,id_tanque,id_bomba,hash_empresa
0,2018-01-01 08:27:00,39920,000005,43.0,115.972000,Constructoras,134,174,1899d5678099a1705184deaa57573548
1,2018-01-01 13:56:00,39819,000002,43.0,127.906998,Constructoras,134,174,1899d5678099a1705184deaa57573548
2,2018-01-01 16:54:00,39931,000016,43.0,65.570999,Constructoras,134,174,1899d5678099a1705184deaa57573548
3,2018-01-01 20:38:00,39922,000007,43.0,71.700996,Constructoras,134,174,1899d5678099a1705184deaa57573548
4,2018-01-01 00:23:00,41504,000355,27.0,360.018005,Transporte,81,100,48a207f3a91979ecf532170652d04745
...,...,...,...,...,...,...,...,...,...
2819193,2019-12-30 11:45:50,68866,040040,264.0,0.532000,Transporte,101165,10953,7c9981d376f5e9a249de2f71bb2bffe4
2819194,2019-12-30 11:47:09,68866,040040,264.0,0.238000,Transporte,101165,10953,7c9981d376f5e9a249de2f71bb2bffe4
2819195,2019-12-30 11:53:59,68866,040040,264.0,9.696000,Transporte,101165,10953,7c9981d376f5e9a249de2f71bb2bffe4
2819196,2019-12-30 11:56:10,68866,040040,264.0,0.000000,Transporte,101165,10953,7c9981d376f5e9a249de2f71bb2bffe4


In [34]:
# Distinct 'id_equipo' values of the rows whose 'id_empresa' is null.
list(dfa.loc[dfa['id_empresa'].isnull(), 'id_equipo'].unique())

[999999,
 100492,
 100220,
 100276,
 100430,
 100434,
 100488,
 100487,
 100494,
 100493]

In [35]:
# Discard the rows whose 'id_empresa' is missing.
dfa = dfa.dropna(subset=['id_empresa'])
dfa.shape

(2805130, 29)

In [37]:
# Preview the first five rows of the merged table.
dfa[:5]

Unnamed: 0,id_transaccion,id_vehiculo,id_usuario,id_equipo,id_bomba,id_tanque,producto,fecha_hora,fecha_hora_stop,departamento,...,id_empresa,current_firmware,delay_ue,geo_latitude,geo_longitude,hash_empresa,baja_empresa,segmento,baja_vehiculo,main_id
0,6974,39920,1045,139,174,134,1.0,2018-01-01 08:27:00,NaT,0,...,43.0,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,5
1,6975,39819,1041,139,174,134,1.0,2018-01-01 13:56:00,NaT,0,...,43.0,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,2
2,6976,39931,1055,139,174,134,1.0,2018-01-01 16:54:00,NaT,0,...,43.0,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,16
3,6977,39922,1047,139,174,134,1.0,2018-01-01 20:38:00,NaT,0,...,43.0,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,7
4,5250,41504,436,111,100,81,1.0,2018-01-01 00:23:00,NaT,1,...,27.0,,0.0,,,48a207f3a91979ecf532170652d04745,False,Transporte,False,355


In [38]:
# Left-join the product name onto every transaction, matching on the
# ('id_equipo', 'producto') pair.
dfb = dfa.merge(df_prod, how='left', on=['id_equipo', 'producto'])
dfb

Unnamed: 0,id_transaccion,id_vehiculo,id_usuario,id_equipo,id_bomba,id_tanque,producto,fecha_hora,fecha_hora_stop,departamento,...,current_firmware,delay_ue,geo_latitude,geo_longitude,hash_empresa,baja_empresa,segmento,baja_vehiculo,main_id,nombre_producto
0,6974,39920,1045,139,174,134,1.0,2018-01-01 08:27:00,NaT,0000,...,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,000005,DESCONOCIDO
1,6975,39819,1041,139,174,134,1.0,2018-01-01 13:56:00,NaT,0000,...,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,000002,DESCONOCIDO
2,6976,39931,1055,139,174,134,1.0,2018-01-01 16:54:00,NaT,0000,...,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,000016,DESCONOCIDO
3,6977,39922,1047,139,174,134,1.0,2018-01-01 20:38:00,NaT,0000,...,,0.0,,,1899d5678099a1705184deaa57573548,False,Constructoras,False,000007,DESCONOCIDO
4,5250,41504,436,111,100,81,1.0,2018-01-01 00:23:00,NaT,0001,...,,0.0,,,48a207f3a91979ecf532170652d04745,False,Transporte,False,000355,DIESEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2805125,3213,68866,6468,449,10953,101165,0.0,2019-12-30 11:45:50,2019-12-30 11:47:01,0000,...,V60102b8,0.0,,,7c9981d376f5e9a249de2f71bb2bffe4,False,Transporte,False,040040,DIESEL
2805126,3214,68866,6468,449,10953,101165,0.0,2019-12-30 11:47:09,2019-12-30 11:47:22,0000,...,V60102b8,0.0,,,7c9981d376f5e9a249de2f71bb2bffe4,False,Transporte,False,040040,DIESEL
2805127,3215,68866,6468,449,10953,101165,0.0,2019-12-30 11:53:59,2019-12-30 11:55:01,0000,...,V60102b8,0.0,,,7c9981d376f5e9a249de2f71bb2bffe4,False,Transporte,False,040040,DIESEL
2805128,3216,68866,6468,449,10953,101165,0.0,2019-12-30 11:56:10,2019-12-30 11:56:36,0000,...,V60102b8,0.0,,,7c9981d376f5e9a249de2f71bb2bffe4,False,Transporte,False,040040,DIESEL


In [39]:
# Replace missing product names with the 'DESCONOCIDO' label.
dfb['nombre_producto'] = dfb['nombre_producto'].fillna('DESCONOCIDO')

In [40]:
# List the distinct product names present after the fill.
dfb.nombre_producto.unique()

array(['DESCONOCIDO', 'DIESEL', 'NAFTA', 'FORMULA', 'COMBUSTIBLE',
       'INFINIA T40', 'INFINIA T20', 'AVGAS 100', 'UREA', 'COMUN', 'BIO',
       'INFINIA', 'GENÉRICO', 'ACEITE', 'REFRIGERANTE', 'GRASA'],
      dtype=object)

In [41]:
# Persist the merged table; compression is left at its default ('infer').
dfb.to_pickle('sis_transa_201801_202007_merged.pkl.zip')