## ANALISIS DE TENDENCIAS DE CONSUMOS E INFRAESTRUCTURAS

_By JoseForguez & DiegoTondo, 2020-08-09_

In [1]:
import pandas as pd
import numpy as np
import warnings
import glob
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

import plotly.graph_objs as go
import plotly.offline as plotly
from plotly.subplots import make_subplots

import matplotlib.dates as md

In [None]:
#pd.set_option('display.max_columns', 50)

#### LOAD TRANSACTIONS

In [129]:
df_transa = pd.read_csv('../dataset/data_csv/sis_transa_201801_202007.csv', parse_dates=['fecha'])


Columns (0,1,3,5,9,10,11,12,21,23,29,32,35,36) have mixed types.Specify dtype option on import or set low_memory=False.



In [130]:
df_transa.columns

Index(['acum_usuario', 'acum_vehiculo', 'baja', 'campovar1', 'campovar2',
       'cantidad', 'codigo_error', 'departamento', 'fecha', 'fecha_stop',
       'geo_latitud', 'geo_longitud', 'geo_status', 'hora', 'hora_stop',
       'id_bomba', 'id_equipo', 'id_tanque', 'id_transaccion', 'id_usuario',
       'id_vehiculo', 'km_transaccion', 'local_price', 'odometro', 'producto',
       'pump_site_id', 'sync', 'tag_description_1', 'tag_description_2',
       'tank_site_id', 'temp_start', 'temp_stop', 'user_site_id',
       'user_tag_id', 'valor', 'veh_efficiency', 'veh_site_id', 'veh_tag_id',
       'volume_start', 'volume_stop'],
      dtype='object')

In [131]:
c2drop = ['acum_usuario', 
          'acum_vehiculo',
          'campovar1',
          'campovar2',
          'km_transaccion',
          'local_price',
          'odometro',
          'tag_description_1',
          'tag_description_2',
          'user_tag_id',
          'valor',
          'veh_efficiency'
         ]

In [132]:
df_transa.drop(columns=c2drop, axis=1, inplace=True)

In [133]:
df_transa[:5]

Unnamed: 0,baja,cantidad,codigo_error,departamento,fecha,fecha_stop,geo_latitud,geo_longitud,geo_status,hora,...,pump_site_id,sync,tank_site_id,temp_start,temp_stop,user_site_id,veh_site_id,veh_tag_id,volume_start,volume_stop
0,0,115.972,BF,0,2018-01-01,,,,,08:27:00,...,,0,,,,,,7500620F7A,,
1,0,127.907,BF,0,2018-01-01,,,,,13:56:00,...,,0,,,,,,750061AE13,,
2,0,65.571,BF,0,2018-01-01,,,,,16:54:00,...,,0,,,,,,750061FF96,,
3,0,71.701,BF,0,2018-01-01,,,,,20:38:00,...,,0,,,,,,750063E823,,
4,0,360.018,C1,1,2018-01-01,,,,,00:23:00,...,,0,,,,,,,,


In [134]:
df_transa[pd.isna(df_transa['cantidad'])]

Unnamed: 0,baja,cantidad,codigo_error,departamento,fecha,fecha_stop,geo_latitud,geo_longitud,geo_status,hora,...,pump_site_id,sync,tank_site_id,temp_start,temp_stop,user_site_id,veh_site_id,veh_tag_id,volume_start,volume_stop
1326272,0,,,,NaT,,,,,00:00:00,...,,0,,,,,,,,


In [135]:
df_transa.dropna(subset=['cantidad'], inplace=True)

In [136]:
len(df_transa[df_transa['cantidad'].astype('string').str.contains(',')])

28

In [137]:
df_transa['cantidad'] = df_transa['cantidad'].astype('string').str.replace(',', '.').astype('float64')

In [138]:
df_transa.dtypes

baja                       int64
cantidad                 float64
codigo_error              object
departamento              object
fecha             datetime64[ns]
fecha_stop                object
geo_latitud               object
geo_longitud              object
geo_status                object
hora                      object
hora_stop                 object
id_bomba                   int64
id_equipo                  int64
id_tanque                  int64
id_transaccion             int64
id_usuario                 int64
id_vehiculo                int64
producto                 float64
pump_site_id             float64
sync                       int64
tank_site_id              object
temp_start               float64
temp_stop                float64
user_site_id              object
veh_site_id               object
veh_tag_id                object
volume_start             float64
volume_stop              float64
dtype: object

In [139]:
df_transa.shape

(3003975, 28)

In [140]:
df_transa = df_transa[df_transa['cantidad']>=0]
df_transa.shape

(3003030, 28)

In [145]:
df_transa = df_transa[~(df_transa['fecha']<'2018-01-01')]
df_transa.shape

(3002670, 28)

In [232]:
df_transa[df_transa['id_equipo'].isna()]

Unnamed: 0,baja,cantidad,codigo_error,departamento,fecha,fecha_stop,geo_latitud,geo_longitud,geo_status,hora,...,pump_site_id,sync,tank_site_id,temp_start,temp_stop,user_site_id,veh_site_id,veh_tag_id,volume_start,volume_stop


In [293]:
df_transa[['id_vehiculo', 'cantidad']]

Unnamed: 0,id_vehiculo,cantidad
0,39920,115.972
1,39819,127.907
2,39931,65.571
3,39922,71.701
4,41504,360.018
...,...,...
3003971,59961,92.407
3003972,68753,55.568
3003973,69306,83.843
3003974,68724,48.896


#### LOAD SITES

In [221]:
df_equipo = pd.read_csv('../dataset/data_csv/fs_equipo.csv')
df_equipo.columns

Index(['id_equipo', 'direccion_ip', 'id_empresa', 'descripcion', 'pass',
       'ultima_fecha', 'ultima_fecha_h', 'firmware_version', 'sync', 'online',
       'baja', 'ultima_fecha_sync_db', 'current_firmware',
       'ultima_sincronizacion', 'ultima_conexion', 'delay_ue', 'geo_latitude',
       'geo_longitude', 'id_canal'],
      dtype='object')

In [222]:
df_equipo.drop(axis=1, columns=['direccion_ip', 'pass', 'sync', 'online'], inplace=True)

In [223]:
df_equipo.shape

(698, 15)

#### LOAD COMPANIES

In [191]:
df_emp = pd.read_csv('../dataset/data_csv/fs_empresa_tagged.csv')

In [192]:
df_emp.columns

Index(['id_empresa', 'cuit', 'empresa', 'telefono', 'email', 'pais', 'id_pais',
       'provincia', 'ciudad', 'direccion', 'cp', 'baja',
       'ultima_fecha_sync_db', 'segmento'],
      dtype='object')

In [193]:
df_emp.drop(axis=1, columns=['cuit', 'telefono', 'email', 'pais', 'id_pais', 'provincia', 'ciudad', 'direccion', 'cp', 'ultima_fecha_sync_db'], inplace=True)

In [194]:
df_emp = df_emp.convert_dtypes()
df_emp.dtypes

id_empresa     Int64
empresa       string
baja           Int64
segmento      string
dtype: object

#### LOAD VEHICULES

In [249]:
df_veh = pd.read_csv('../dataset/data_csv/fs_vehiculos.csv')


Columns (7,8,9,10,13,14,16) have mixed types.Specify dtype option on import or set low_memory=False.



In [284]:
df_veh.columns

Index(['id_vehiculo', 'id_equipo', 'tag_id', 'vehiculo', 'descripcion',
       'departamento', 'limite', 'verificar', 'codigo', 'odometro_inicio',
       'odometro_fin', 'cargas_max_dia', 'llave_tipo', 'autorizacion',
       'condicion_desautorizacion', 'cantidad_total', 'cargas_hoy',
       'cargas_hasta_hoy', 'ultima_fecha', 'sync', 'baja', 'main_id'],
      dtype='object')

In [285]:
df_veh

Unnamed: 0,id_vehiculo,id_equipo,tag_id,vehiculo,descripcion,departamento,limite,verificar,codigo,odometro_inicio,...,llave_tipo,autorizacion,condicion_desautorizacion,cantidad_total,cargas_hoy,cargas_hasta_hoy,ultima_fecha,sync,baja,main_id
0,2,333333,04007C180B,000001,000001,0001,9,0,1234,193958,...,0,0,1,00000.000,01,0.0,2018-12-20,1,0,000001
1,3,333333,04007C10EB,000239,000239,0001,9,3,1234,079345,...,0,0,1,00000.000,02,0.0,2018-12-20,1,0,000239
2,4,333333,04007C6D8D,000270,000270,0001,9,3,1234,842409,...,0,0,1,00000.000,01,0.0,2018-12-20,1,0,000270
3,5,333333,04007BC3B5,000252,000252,0001,9,3,1234,287596,...,0,0,1,00000.000,02,0.0,2018-12-20,1,0,000252
4,6,333333,04007C3FA7,000235,000235,0001,9,3,1234,235878,...,0,0,1,00000.000,01,0.0,2018-12-20,1,0,000235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51565,102075,822,??????????,000849,PRUEBA UT,0000,9,0,1234,0,...,0,0,0,00000.000,0,0.0,2020-07-31,1,0,000849
51566,102076,832,??????????,000849,PRUEBA UT,0000,9,0,1234,0,...,0,0,0,00000.000,0,0.0,2020-07-31,1,0,000849
51567,102077,542,??????????,999999,PRUEBA UT,0000,9,0,1234,0,...,0,0,0,00000.000,0,0.0,2020-07-31,1,0,999999
51568,102078,661,??????????,999999,PRUEBA UT,0000,9,0,1234,0,...,0,0,0,00000.000,0,0.0,2020-07-31,1,0,999999


In [286]:
df_veh.drop(df_veh.loc[:,'id_equipo':'sync'], axis=1, inplace=True)

In [287]:
df_veh

Unnamed: 0,id_vehiculo,baja,main_id
0,2,0,000001
1,3,0,000239
2,4,0,000270
3,5,0,000252
4,6,0,000235
...,...,...,...
51565,102075,0,000849
51566,102076,0,000849
51567,102077,0,999999
51568,102078,0,999999


#### Merge Dataframes

In [242]:
df_equipo.shape, df_emp.shape

((698, 15), (330, 4))

In [243]:
dfe = pd.merge(df_equipo, df_emp, on='id_empresa')
dfe.shape

(698, 18)

In [244]:
df_transa.shape

(3002670, 28)

In [245]:
df = pd.merge(df_transa, dfe, on='id_equipo', how='left')
df.shape

(3002670, 45)

In [246]:
df = df[df['baja']==0]

In [247]:
df.shape

(3002667, 45)

In [295]:
dfa = pd.merge(df, df_veh, on='id_vehiculo', how='left')
dfa.shape

(3002667, 47)

In [300]:
dfa[['fecha', 'id_vehiculo', 'main_id', 'id_empresa', 'cantidad', 'segmento', 'id_tanque', 'id_bomba', 'empresa', 'descripcion']]

Unnamed: 0,fecha,id_vehiculo,main_id,id_empresa,cantidad,segmento,id_tanque,id_bomba,empresa,descripcion
0,2018-01-01,39920,000005,43.0,115.972,c,134,174,Coop. San Bernardo,Coop. San Bernardo
1,2018-01-01,39819,000002,43.0,127.907,c,134,174,Coop. San Bernardo,Coop. San Bernardo
2,2018-01-01,39931,000016,43.0,65.571,c,134,174,Coop. San Bernardo,Coop. San Bernardo
3,2018-01-01,39922,000007,43.0,71.701,c,134,174,Coop. San Bernardo,Coop. San Bernardo
4,2018-01-01,41504,000355,27.0,360.018,t,81,100,Sarmiento,Cruz del Eje
...,...,...,...,...,...,...,...,...,...,...
3002662,2020-07-31,59961,000574,269.0,92.407,a,1133,915,Miramar Estrella UT,Miramar
3002663,2020-07-31,68753,001127,149.0,55.568,i,101322,465,Urbacordoba,Cordoba
3002664,2020-07-31,69306,000601,269.0,83.843,a,101175,916,Miramar Estrella UT,Miramar
3002665,2020-07-31,68724,001208,149.0,48.896,i,101322,465,Urbacordoba,Cordoba


In [336]:
list(dfa[dfa['id_empresa'].isnull()]['id_equipo'].unique())

[100492, 100220, 100276, 100430, 100434, 100488, 100487, 100494, 100493]

In [341]:
dfa = dfa.dropna(axis=0, subset=['id_empresa'])
dfa.shape

(2990501, 47)

In [349]:
dfa[:5]

Unnamed: 0,baja_x,cantidad,codigo_error,departamento,fecha,fecha_stop,geo_latitud,geo_longitud,geo_status,hora,...,ultima_conexion,delay_ue,geo_latitude,geo_longitude,id_canal,empresa,baja_y,segmento,baja_y.1,main_id
0,0,115.972,BF,0,2018-01-01,,,,,08:27:00,...,2020-07-31 16:43:48,0.0,,,2.0,Coop. San Bernardo,0,c,0.0,5
1,0,127.907,BF,0,2018-01-01,,,,,13:56:00,...,2020-07-31 16:43:48,0.0,,,2.0,Coop. San Bernardo,0,c,0.0,2
2,0,65.571,BF,0,2018-01-01,,,,,16:54:00,...,2020-07-31 16:43:48,0.0,,,2.0,Coop. San Bernardo,0,c,0.0,16
3,0,71.701,BF,0,2018-01-01,,,,,20:38:00,...,2020-07-31 16:43:48,0.0,,,2.0,Coop. San Bernardo,0,c,0.0,7
4,0,360.018,C1,1,2018-01-01,,,,,00:23:00,...,2020-07-31 23:50:06,0.0,,,4.0,Sarmiento,0,t,0.0,355


In [342]:
#dfa.to_csv('../dataset/data_csv/sis_transa_201801_202007_merged.csv', index=False)