In [None]:
# Widen the notebook container to 80% of the browser window.
# `IPython.display` is the public import path; importing `display`/`HTML`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os

# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare (major, minor) numerically: a plain string comparison is
# lexicographic and would accept e.g. "0.9" because "0.9" > "0.20".
sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert sklearn_version >= (0, 20)

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re

from tqdm.notebook import tqdm

from tools import * 

# Notebook-wide settings, read by the cells below.
config = pd.Series({
    'devide_by':1,          # passed to timestamp_multipleof(devide_by=...)
    'year':2019,            # used in the input/output paths
    'datafrom': ['BICING_ESTACIONS', 'BicingNou_ESTACIONS', 'BicingNou_INFORMACIO'],  # source folders
    'dataset': 'BicingNou_MOD',  # output folder / file suffix
    'ttl': 30,              # value written to the temporary 'ttl' column before de-duplication
    'month': 3,
    'monthname': 'Marc'
})

# Create the output folder. os.makedirs works without a POSIX shell and raises
# on failure, whereas os.system("mkdir -p ...") only returns an exit status
# that was being ignored.
os.makedirs(f"../dades/{config.year}/{config.dataset}", exist_ok=True)


In [None]:
%%html
<style>
    p {
        float:left
    }  
    table {
        display: inline-block
    }
</style>

<div>
    <table>
        <tr>
            <th><p>Camp</p></th>
            <th><p>Descripció</p></th>
        </tr>
        <tr>
            <td><p>last_updated</p></td>
            <td><p>Timestamp de l'arxiu</p></td>
        </tr>
        <tr>
            <td><p>ttl</p></td>
            <td><p>TimeToLive de la resposta</p></td>
        </tr>
        <tr>
            <td><p>data</p></td>
            <td><p>Contenidor d'arrays d'informació d'estacions</p></td>
        </tr>
        <tr>
            <td><p>stations</p></td>
            <td><p>Array de dades de cada estació</p></td>
        </tr>
        <tr>
            <td><p>station_id</p></td>
            <td><p>Identificador de l'estació</p></td>
        </tr>
        <tr>
            <td><p>num_bikes_available</p></td>
            <td><p>Nombre de bicicletes disponibles</p></td>
        </tr>
        <tr>
            <td><p>num_bikes_available_types</p></td>
            <td><p>Array de tipus de bicicletes disponibles</p></td>
        </tr>
        <tr>
            <td><p>mechanical</p></td>
            <td><p>Nombre de bicicletes mecàniques disponibles</p></td>
        </tr>
        <tr>
            <td><p>ebike</p></td>
            <td><p>Nombre de bicicletes elèctriques disponibles</p></td>
        </tr>
        <tr>
            <td><p>num_docks_available</p></td>
            <td><p>Nombre d'ancoratges disponibles</p></td>
        </tr>
        <tr>
            <td><p>is_installed</p></td>
            <td><p>L'estació està correctament instal·lada (0-NO,1-SI)</p></td>
        </tr>
        <tr>
            <td><p>is_renting</p></td>
            <td><p>L'estació està proporcionant bicicletes correctament</p></td>
        </tr>
        <tr>
            <td><p>is_returning</p></td>
            <td><p>L'estació està ancorant bicicletes correctament</p></td>
        </tr>
        <tr>
            <td><p>last_reported</p></td>
            <td><p>Timestamp de la informació de l'estació</p></td>
        </tr>
        <tr>
            <td><p>is_charging_station</p></td>
            <td><p>L'estació té capacitat de càrrega de bicicletes elèctriques</p></td>
        </tr>
        <tr>
            <td><p>status</p></td>
            <td><p>Estat de l'estació (IN_SERVICE=En servei, CLOSED=Tancada)</p></td>
        </tr>
    </table>
</div>

# Old data Marc 2019

In [None]:
%%time

# Load the March 2019 file from the first source folder (config.datafrom[0]).
dades_2019_Marc_info_old = pd.read_csv(f'../dades/{config.year}/{config.datafrom[0]}/{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom[0]}.csv', low_memory=False)

intial_size = dades_2019_Marc_info_old.shape[0]
print(dades_2019_Marc_info_old.shape)

# Rename the columns of this file to the names used in the rest of the notebook.
dades_2019_Marc_info_old = dades_2019_Marc_info_old.rename(
    columns={
        'id': 'station_id',
        'latitude':'lat',
        'longitude':'lon',
        'type':'physical_configuration',
        'updateTime':'last_updated',
        'bikes':'num_bikes_available',
        'slots':'num_docks_available',
        'streetName':'street_name',
        'streetNumber':'street_number',
    }
)

# Encode status as an integer flag: 'CLS' -> 0, 'OPN' -> 1.
# The result is assigned back to the column: Series.replace(..., inplace=True)
# on a column selection is not guaranteed to update the DataFrame.
dades_2019_Marc_info_old['status'] = dades_2019_Marc_info_old['status'].replace(to_replace=['CLS', 'OPN'], value=[0, 1])

# Encode the station type: 'BIKE' -> 0, 'BIKE-ELECTRIC' -> 1.
dades_2019_Marc_info_old['physical_configuration'] = dades_2019_Marc_info_old['physical_configuration'].replace(to_replace=['BIKE', 'BIKE-ELECTRIC'], value=[0, 1])

# Add columns with constant defaults (presumably absent from this file and
# needed to match the new-format data it is concatenated with later).
dades_2019_Marc_info_old['is_installed'] = 1
dades_2019_Marc_info_old['is_renting'] = 1
dades_2019_Marc_info_old['is_returning'] = 1
dades_2019_Marc_info_old['is_charging_station'] = 1
# every available bike is counted as mechanical, none as ebike
dades_2019_Marc_info_old['num_bikes_available_types.mechanical'] = dades_2019_Marc_info_old['num_bikes_available']
dades_2019_Marc_info_old['num_bikes_available_types.ebike'] = 0
dades_2019_Marc_info_old['post_code'] = '0'
dades_2019_Marc_info_old['capacity'] = 0  # raised later from bikes + docks

# we don't have this column anywhere in the new dataset so it is removed
dades_2019_Marc_info_old = dades_2019_Marc_info_old.drop(columns='nearbyStations')

# Parse last_updated (day/month/2-digit-year format). The date-part columns
# used below are expected to be added by this helper (add=True) — see tools.
dades_2019_Marc_info_old = convert_timestamp(dades_2019_Marc_info_old.copy(), ['last_updated'], sort=True, add=True, pattern='%d/%m/%y %H:%M:%S')

# Rebuild last_updated from its date parts with timestamp_multipleof; the step
# is config.devide_by (currently 1), not a fixed 5.
dades_2019_Marc_info_old = timestamp_multipleof(
    devide_by=config.devide_by, 
    column='minutes_last_updated_date',
    df=dades_2019_Marc_info_old.copy(), 
    new_column='last_updated', 
    year_column='year_last_updated_date',
    month_column='month_last_updated_date',
    day_column='dayofmonth_last_updated_date',
    hour_column='hour_last_updated_date',
    minutes_column='minutes_last_updated_date'
)    

# drop the helper date-part columns, which are no longer needed
dades_2019_Marc_info_old = dades_2019_Marc_info_old.drop(
    columns=[
        'year_last_updated_date', 'month_last_updated_date',
        'week_last_updated_date', 'dayofweek_last_updated_date',
        'dayofmonth_last_updated_date', 'dayofyear_last_updated_date',
        'hour_last_updated_date', 'minutes_last_updated_date'
    ]
)

print(dades_2019_Marc_info_old.shape)
print('removed:', intial_size-dades_2019_Marc_info_old.shape[0])

In [None]:
%%time

# replace nans with string of 0
dades_2019_Marc_info_old.street_number = dades_2019_Marc_info_old.street_number.astype(str)
dades_2019_Marc_info_old.street_number.fillna('0', inplace=True)
dades_2019_Marc_info_old.loc[dades_2019_Marc_info_old.street_number.isin(['nan', '.']),['street_number']] = '0' 

# extract the correct column
correct_column = dades_2019_Marc_info_old[dades_2019_Marc_info_old.street_number != '0'].groupby(['station_id'])['street_number'].max()
correct_column = pd.DataFrame(correct_column.reset_index())
# correct the data according to the data in the correct column
dades_2019_Marc_info_old = correct_columns(dades_2019_Marc_info_old.copy(), 'station_id', 'street_number', correct_column=correct_column)

dades_2019_Marc_info_old.street_number.fillna('0', inplace=True)
dades_2019_Marc_info_old.post_code.fillna('0', inplace=True)

cond = (dades_2019_Marc_info_old.num_bikes_available + dades_2019_Marc_info_old.num_docks_available) > dades_2019_Marc_info_old.capacity
dades_2019_Marc_info_old.loc[cond, ['capacity']] = dades_2019_Marc_info_old[cond]['num_bikes_available'] + dades_2019_Marc_info_old[cond]['num_docks_available']

dades_2019_Marc_info_old = correct_columns(dades_2019_Marc_info_old.copy(), 'station_id', 'capacity')

dades_2019_Marc_info_old.loc[dades_2019_Marc_info_old.capacity.isna(),['capacity']] = 0

dades_2019_Marc_info_old = correct_columns(dades_2019_Marc_info_old.copy(), 'station_id', 'altitude', take='first')

In [None]:
print_duplicates(dades_2019_Marc_info_old.copy(), ['station_id', 'last_updated'])

In [None]:
get_features_nans(dades_2019_Marc_info_old)

In [None]:
get_features_zero(dades_2019_Marc_info_old)

In [None]:
%%time

# Row count before de-duplication, for the summary printed at the end.
intial_size = len(dades_2019_Marc_info_old)
print(dades_2019_Marc_info_old.shape)

# Add the 'ttl' column and de-duplicate on last_updated with remove_duplicates_all.
dades_2019_Marc_info_old = remove_duplicates_all(
    dades_2019_Marc_info_old.assign(ttl=config.ttl),
    'last_updated',
)
# (3729928, 19) — presumably the shape seen on an earlier run

print(dades_2019_Marc_info_old.shape)
print(f'removed: {intial_size - len(dades_2019_Marc_info_old)}')

In [None]:
%%time

# Renumber the rows and drop the temporary 'ttl' column.
dades_2019_Marc_info_old = (
    dades_2019_Marc_info_old
    .reset_index(drop=True)
    .drop(columns=['ttl'])
)

# save checkpoint
checkpoint_path = f'../dades/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv'
dades_2019_Marc_info_old.to_csv(checkpoint_path, index=False)

In [None]:
# re read file
dades_2019_Marc_info_old = pd.read_csv(f'../dades/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv', low_memory=False)

In [None]:
print_duplicates(dades_2019_Marc_info_old.copy(), ['station_id', 'last_updated'])

In [None]:
get_features_nans(dades_2019_Marc_info_old)

In [None]:
get_features_zero(dades_2019_Marc_info_old)

In [None]:
# Inspect the March frame; the former `dades_2019_Febrer_info_old` argument is
# not defined anywhere in this notebook and raised NameError.
get_columns_unique(dades_2019_Marc_info_old)

# New data 2019 Marc 

## dades_2019_Marc_info

### Preprocessing the data 

In [None]:
%%time

# Load the March 2019 file from the second source folder (config.datafrom[1]).
dades_2019_Marc_info = pd.read_csv(f'../dades/{config.year}/{config.datafrom[1]}/{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom[1]}.csv', low_memory=False)

print(dades_2019_Marc_info.shape)

# Cast the flag to integer. The builtin `int` replaces `np.int`, an alias
# that was removed in NumPy 1.24.
dades_2019_Marc_info['is_charging_station'] = dades_2019_Marc_info['is_charging_station'].astype(int)

# STATUS: IN_SERVICE = in service, CLOSED = closed
# encode as 'CLOSED' -> 0, 'IN_SERVICE' -> 1; assigned back instead of
# replace(inplace=True) on a column selection
dades_2019_Marc_info['status'] = dades_2019_Marc_info['status'].replace(to_replace=['CLOSED', 'IN_SERVICE'], value=[0, 1])

# remove the duplicates on last_reported for all stations in the dataset
dades_2019_Marc_info = remove_duplicates_all(dades_2019_Marc_info.copy(), 'last_reported')

# Parse last_updated. The date-part columns used below are expected to be
# added by this helper (add=True) — see tools.
dades_2019_Marc_info = convert_timestamp(dades_2019_Marc_info.copy(), ['last_updated'], sort=True, add=True)

# Rebuild last_updated from its date parts with timestamp_multipleof; the step
# is config.devide_by (currently 1), not a fixed 3.
dades_2019_Marc_info = timestamp_multipleof(
    devide_by=config.devide_by, 
    column='minutes_last_updated_date',
    df=dades_2019_Marc_info.copy(), 
    new_column='last_updated', 
    year_column='year_last_updated_date',
    month_column='month_last_updated_date',
    day_column='dayofmonth_last_updated_date',
    hour_column='hour_last_updated_date',
    minutes_column='minutes_last_updated_date'
)    

# drop the helper date-part columns, which are no longer needed
dades_2019_Marc_info = dades_2019_Marc_info.drop(
    columns=[
        'year_last_updated_date', 'month_last_updated_date',
        'week_last_updated_date', 'dayofweek_last_updated_date',
        'dayofmonth_last_updated_date', 'dayofyear_last_updated_date',
        'hour_last_updated_date', 'minutes_last_updated_date'
    ]
)

# last_reported is kept here; it is dropped from merged_data after the merge.

print(dades_2019_Marc_info.shape)

## dades_2019_Marc_us

### Preprocessing the data 

In [None]:
%%time

# Load the March 2019 file from the third source folder (config.datafrom[2]).
dades_2019_Marc_us = pd.read_csv(f'../dades/{config.year}/{config.datafrom[2]}/{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom[2]}.csv', low_memory=False)

print(dades_2019_Marc_us.shape)

# Rows whose altitude is not "0" while capacity is 1 have their values shifted
# one column to the left (884 such rows were counted on an earlier run).
cond = (dades_2019_Marc_us['altitude'] != "0") & (dades_2019_Marc_us['capacity'] == 1)
print(dades_2019_Marc_us[cond].shape) 

# Shift those rows back one column to the right. The order matters: each
# statement reads a column the next statement overwrites.
dades_2019_Marc_us.loc[cond, 'capacity'] = dades_2019_Marc_us.loc[cond, 'post_code']
dades_2019_Marc_us.loc[cond, 'post_code'] = dades_2019_Marc_us.loc[cond, 'address']
dades_2019_Marc_us.loc[cond, 'address'] = dades_2019_Marc_us.loc[cond, 'altitude']
dades_2019_Marc_us.loc[cond, 'altitude'] = '0'

# post_code is wrong and would need fixing from lon & lat;
# it can be filled from the old dataset's post codes after the merge
dades_2019_Marc_us['post_code'] = ''

# Parse last_updated. The date-part columns used below are expected to be
# added by this helper (add=True) — see tools.
dades_2019_Marc_us = convert_timestamp(dades_2019_Marc_us.copy(), ['last_updated'], sort=True, add=True)

# Rebuild last_updated from its date parts with timestamp_multipleof; the step
# is config.devide_by (currently 1), not a fixed 3.
dades_2019_Marc_us = timestamp_multipleof(
    devide_by=config.devide_by, 
    column='minutes_last_updated_date',
    df=dades_2019_Marc_us.copy(), 
    new_column='last_updated', 
    year_column='year_last_updated_date',
    month_column='month_last_updated_date',
    day_column='dayofmonth_last_updated_date',
    hour_column='hour_last_updated_date',
    minutes_column='minutes_last_updated_date'
)    

# drop the helper date-part columns, which are no longer needed
dades_2019_Marc_us = dades_2019_Marc_us.drop(
    columns=[
        'year_last_updated_date', 'month_last_updated_date',
        'week_last_updated_date', 'dayofweek_last_updated_date',
        'dayofmonth_last_updated_date', 'dayofyear_last_updated_date',
        'hour_last_updated_date', 'minutes_last_updated_date'
    ]
)

# Encode the station type: 'BIKESTATION' -> 0, 'ELECTRICBIKESTATION' -> 1
# (assigned back instead of replace(inplace=True) on a column selection).
dades_2019_Marc_us['physical_configuration'] = dades_2019_Marc_us['physical_configuration'].replace(to_replace=['BIKESTATION', 'ELECTRICBIKESTATION'], value=[0, 1])

# street_name = the alphabetic words of the station name joined by single
# spaces. The vectorised .str accessor replaces the row-wise apply and leaves
# missing names as NaN instead of raising.
# NOTE(review): [a-zA-Z] does not match accented letters (e.g. 'ó', 'ç'), so
# they are dropped from street_name — confirm this is intended.
dades_2019_Marc_us['street_name'] = dades_2019_Marc_us['name'].str.findall("[a-zA-Z]+").str.join(" ")

def lambda_fun(name):
    """Return the trailing run of digits of ``name`` as a string.

    Parameters
    ----------
    name : str
        Station name, e.g. ``"Gran Via 592"``.

    Returns
    -------
    str
        The digits at the very end of ``name``, or the literal string
        ``'nan'`` when ``name`` does not end in digits or is not a string
        (for example a missing value).
    """
    try:
        # raw string: "\d" in a normal string literal is an invalid escape sequence
        match = re.search(r"\d+$", name)
    except TypeError:
        # non-string input such as None or float NaN
        return 'nan'

    return match.group(0) if match else 'nan'

# street_number = trailing digits of the station name, or the string 'nan'
# when there are none (see lambda_fun). Applying over the single column avoids
# building a row object for every record.
dades_2019_Marc_us['street_number'] = dades_2019_Marc_us['name'].apply(lambda_fun)

# 'address' and 'name' are dropped once street_name / street_number are extracted
dades_2019_Marc_us = dades_2019_Marc_us.drop(columns=['address', 'name'])

print(dades_2019_Marc_us.shape)


## merge data us with data info

In [None]:
%%time

# Left-join the `us` frame onto the `info` frame. Both share the key columns,
# so a single `on=` list is used. Other columns present in both frames get
# the "_stat" (left) / "_us" (right) suffix.
join_keys = ['station_id', 'last_updated']
merged_data = pd.merge(
    dades_2019_Marc_info,
    dades_2019_Marc_us.copy(),
    on=join_keys,
    how='left',
    suffixes=("_stat", "_us"),
).copy()

merged_data.shape

In [None]:
# Sanity check: (rows with missing last_reported, total rows, percentage missing).
# NOTE(review): last_reported comes from the left frame of the merge, so this
# does not show whether rows of the right (`us`) frame matched — confirm this
# is the intended check.
rows_without_report = int(merged_data.last_reported.isna().sum())
rows_total = len(merged_data)
rows_without_report, rows_total, (rows_without_report / rows_total) * 100

In [None]:
%%time

# Drop the two ttl columns and last_reported.
merged_data = merged_data.drop(columns=['ttl_us', 'ttl_stat', 'last_reported'])

# Columns stored as plain integers. The builtin `int` replaces `np.int`, an
# alias that was removed in NumPy 1.24.
int_columns = [
    'num_bikes_available',
    'num_bikes_available_types.mechanical', 
    'num_bikes_available_types.ebike', 
    'num_docks_available', 
    'is_installed', 
    'is_renting', 
    'is_returning', 
    'is_charging_station', 
    'status',
    'last_updated'
]
merged_data[int_columns] = merged_data[int_columns].astype(int)

# missing altitude (rows with no match in the left join) counts as 0
merged_data['altitude'] = merged_data['altitude'].fillna('0').astype(int)

# Missing physical_configuration defaults to 1; assigned back instead of
# fillna(inplace=True) on a column selection.
merged_data['physical_configuration'] = merged_data['physical_configuration'].fillna(1.)

# Apply correct_columns (from tools) per station_id to each of these columns.
merged_data = correct_columns(merged_data, 'station_id', 'capacity')

merged_data = correct_columns(merged_data, 'station_id', 'lat', take='first')

merged_data = correct_columns(merged_data, 'station_id', 'lon', take='first')

merged_data = correct_columns(merged_data, 'station_id', 'street_name', take='first')

merged_data = correct_columns(merged_data, 'station_id', 'street_number', take='first')


In [None]:
print_duplicates(merged_data, ['station_id', 'last_updated'])

In [None]:
get_features_nans(merged_data)

In [None]:
get_features_zero(merged_data)

In [None]:
# Missing post codes become '0'. The result is assigned back: fillna(inplace=True)
# on a column selection is not guaranteed to update merged_data.
merged_data['post_code'] = merged_data['post_code'].fillna('0')

# Merge data Marc old with new

In [None]:
dades_2019_Marc_info_old.columns, dades_2019_Marc_info_old.shape

In [None]:
merged_data.columns,merged_data.shape

In [None]:
%%time

dades_2019_Marc_tot = pd.concat([merged_data,dades_2019_Marc_info_old])

In [None]:
dades_2019_Marc_tot.reset_index(drop=True, inplace=True)

In [None]:
dades_2019_Marc_tot.shape

In [None]:
print_duplicates(dades_2019_Marc_tot.copy(), ['station_id', 'last_updated'])

In [None]:
get_features_nans(dades_2019_Marc_tot)

In [None]:
get_features_zero(dades_2019_Marc_tot)

# Cleaning and storing the new file

In [None]:
%%time

# Record the size before de-duplication so the number of removed rows is
# computed from the data, as the de-duplication cell for the old data does.
intial_size = dades_2019_Marc_tot.shape[0]
print(dades_2019_Marc_tot.shape)

dades_2019_Marc_tot['ttl'] = config.ttl

# de-duplicate on last_updated with remove_duplicates_all
dades_2019_Marc_tot = remove_duplicates_all(dades_2019_Marc_tot.copy(), 'last_updated')
# (4305871, 19) — presumably the shape seen on an earlier run

print(dades_2019_Marc_tot.shape)
print('removed:', intial_size - dades_2019_Marc_tot.shape[0])

In [None]:
dades_2019_Marc_tot.shape

In [None]:
# Difference of two literal row counts (presumably before/after the de-duplication above).
# NOTE(review): the literals are typed in, not derived from dades_2019_Marc_tot,
# so this message can be stale after the data or the cleaning steps change.
f'{3979843-3962162} deleted'

In [None]:
%%time

# Renumber the rows and drop the temporary 'ttl' column.
dades_2019_Marc_tot = (
    dades_2019_Marc_tot
    .reset_index(drop=True)
    .drop(columns=['ttl'])
)

# save checkpoint
checkpoint_path = f'../dades/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv'
dades_2019_Marc_tot.to_csv(checkpoint_path, index=False)

# read the file back
dades_2019_Marc_tot = pd.read_csv(checkpoint_path, low_memory=False)

In [None]:
dades_2019_Marc_tot.street_name.unique()

In [None]:
print_duplicates(dades_2019_Marc_tot.copy(), ['station_id', 'last_updated'])

In [None]:
get_features_nans(dades_2019_Marc_tot)

In [None]:
get_features_zero(dades_2019_Marc_tot)

In [None]:
dades_2019_Marc_tot

# Visualize data

In [None]:

# re read file
dades_2019_Marc_tot = pd.read_csv(f'../dades/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv', low_memory=False)

## Study on TTL for station 1
Take a single station ID and look for the best TTL to use when several responses share the same timestamp.


In [None]:
dades_2019_Marc_tot_s1 = dades_2019_Marc_tot[dades_2019_Marc_tot.station_id == 1]

In [None]:
dades_2019_Marc_tot_s1.shape

In [None]:
get_features_nans(dades_2019_Marc_tot_s1)

In [None]:
get_features_zero(dades_2019_Marc_tot_s1)

In [None]:
dades_2019_Marc_tot_s1.street_name.unique()

In [None]:
dades_2019_Marc_tot_s1.post_code.unique()

In [None]:
dades_2019_Marc_tot_s1.head(5)

### last_updated 

|  | info |
| -- | -- |
|  | `last_updated` is the timestamp of the file |
|  | a station ID never has a repeated `last_updated` timestamp in the dataset |

In [None]:
(dades_2019_Marc_tot_s1.last_updated.value_counts() > 1).any()  # give insight if there any last_updated timestamps is repeated 

In [None]:
show_column_counts(dades_2019_Marc_tot_s1, 'last_updated')

## study finished

In [None]:
dades_2019_Marc_tot.columns

## station_id

In [None]:
print(stats.describe(dades_2019_Marc_tot.station_id))
dades_2019_Marc_tot.station_id.describe()

In [None]:
# station_id 
# Most values are repeated equally in the dataset
show_column_counts(dades_2019_Marc_tot, 'station_id')

## num_bikes_available

In [None]:
# num_bikes_available
# Judging by the difference between the 75% quantile and the maximum,
# outliers are possible: 54 is too large for a station size.
# TODO: check the maximum size of a station; replace outliers with that maximum?
print(stats.describe(dades_2019_Marc_tot.num_bikes_available))
dades_2019_Marc_tot.num_bikes_available.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'num_bikes_available')

## num_docks_available

In [None]:
print(stats.describe(dades_2019_Marc_tot.num_docks_available))
dades_2019_Marc_tot.num_docks_available.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'num_docks_available')

## capacity

In [None]:
print(stats.describe(dades_2019_Marc_tot.capacity))
dades_2019_Marc_tot.capacity.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'capacity')

## num_bikes_available_types.mechanical

In [None]:
print(stats.describe(dades_2019_Marc_tot['num_bikes_available_types.mechanical']))
dades_2019_Marc_tot['num_bikes_available_types.mechanical'].describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'num_bikes_available_types.mechanical')

## num_bikes_available_types.ebike

In [None]:
print(stats.describe(dades_2019_Marc_tot['num_bikes_available_types.ebike']))
dades_2019_Marc_tot['num_bikes_available_types.ebike'].describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'num_bikes_available_types.ebike')

## is_installed

In [None]:
print(stats.describe(dades_2019_Marc_tot.is_installed))
dades_2019_Marc_tot.is_installed.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'is_installed')

## is_renting

In [None]:
print(stats.describe(dades_2019_Marc_tot.is_renting))
dades_2019_Marc_tot.is_renting.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'is_renting')

## is_returning

In [None]:
print(stats.describe(dades_2019_Marc_tot.is_returning))
dades_2019_Marc_tot.is_returning.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'is_returning')

## is_charging_station

In [None]:
print(stats.describe(dades_2019_Marc_tot.is_charging_station))
dades_2019_Marc_tot.is_charging_station.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'is_charging_station')

## status

In [None]:
print(stats.describe(dades_2019_Marc_tot.status))
dades_2019_Marc_tot.status.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'status')

## last_updated

In [None]:
print(stats.describe(dades_2019_Marc_tot.last_updated))
dades_2019_Marc_tot.last_updated.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'last_updated')

## physical_configuration

In [None]:
print(stats.describe(dades_2019_Marc_tot.physical_configuration))
dades_2019_Marc_tot.physical_configuration.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'physical_configuration')

## altitude

In [None]:
print(stats.describe(dades_2019_Marc_tot.altitude))
dades_2019_Marc_tot.altitude.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'altitude')

## post_code

In [None]:
print(stats.describe(dades_2019_Marc_tot.post_code))
dades_2019_Marc_tot.post_code.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'post_code')

## lat

In [None]:
print(stats.describe(dades_2019_Marc_tot.lat))
dades_2019_Marc_tot.lat.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'lat')

## lon

In [None]:
print(stats.describe(dades_2019_Marc_tot.lon))
dades_2019_Marc_tot.lon.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'lon')

## street_name

In [None]:
# street_name holds text, so scipy's numeric stats.describe (mean, variance,
# skewness, ...) cannot be computed on it and raises. pandas' describe()
# reports count / unique / top / freq for a text column.
dades_2019_Marc_tot.street_name.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'street_name')

## street_number

In [None]:
print(stats.describe(dades_2019_Marc_tot.street_number))
dades_2019_Marc_tot.street_number.describe()

In [None]:
show_column_counts(dades_2019_Marc_tot, 'street_number')