In [1]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import csv
#from sklearn.svm


def merge_data(folder='./', preffix='mi_pollution', encoding='ISO-8859-1'):
    # Joins all the preffix*.csv files and the data from stations of the preffix_legend-mi.csv
    if not os.path.exists(folder):
        print(f"folder {folder} doesn't exist's, no data merged")
        return

    dataframes = []
    names = pd.DataFrame()
    for filename in os.listdir(folder):
        file = folder+filename
        if 'legend' in filename and filename.endswith(".csv"):
            names = pd.read_csv(file, header=None, encoding=encoding)
            names.rename(columns=pd.to_numeric)
        elif filename.startswith(preffix) and filename.endswith(".csv"):
            if csv.Sniffer().has_header(file):
                df = pd.read_csv(file, header=None, skiprows=[0], encoding=encoding)
            else: 
                df = pd.read_csv(file, encoding=encoding)
            
            dataframes.append(df)

    total = pd.DataFrame()
    for df in dataframes:
        total = total.append(df)

    total = pd.merge(total, names, left_on=0, right_on=0, how='inner')
    
    return total

In [2]:
#assumes no header in the names file
path = '../MI_Air_Quality/data/'
air = merge_data(path)
air.rename(columns={0:'station_id','1_x': 'date', '2_x': 'val', '1_y':'station_name', '2_y':'latitude'
                    , 3:'longitude', 4:'particle', 5:'unit',6:'date_format'}, inplace=True)
air.info()
air.to_csv('./data/air_complete.csv', encoding='utf-8')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41252 entries, 0 to 41251
Data columns (total 9 columns):
station_id      41252 non-null int64
date            41252 non-null object
val             41252 non-null float64
station_name    41252 non-null object
latitude        41252 non-null float64
longitude       41252 non-null float64
particle        41252 non-null object
unit            41252 non-null object
date_format     41252 non-null object
dtypes: float64(3), int64(1), object(5)
memory usage: 3.1+ MB


In [3]:
air.head()

Unnamed: 0,station_id,date,val,station_name,latitude,longitude,particle,unit,date_format
0,6372,2013/12/30 00:00,89.0,Milano - P.zza Zavattari,45.476089,9.143509,Total Nitrogen,ppb,YYYY/MM/DD HH24:MI
1,6372,2013/12/30 01:00,109.0,Milano - P.zza Zavattari,45.476089,9.143509,Total Nitrogen,ppb,YYYY/MM/DD HH24:MI
2,6372,2013/12/30 02:00,116.0,Milano - P.zza Zavattari,45.476089,9.143509,Total Nitrogen,ppb,YYYY/MM/DD HH24:MI
3,6372,2013/12/30 03:00,123.0,Milano - P.zza Zavattari,45.476089,9.143509,Total Nitrogen,ppb,YYYY/MM/DD HH24:MI
4,6372,2013/12/30 04:00,127.0,Milano - P.zza Zavattari,45.476089,9.143509,Total Nitrogen,ppb,YYYY/MM/DD HH24:MI


In [4]:
#assumes no header in the names file
path = '../MI_Weather_Station_Data/data/'
weather = merge_data(path, preffix='mi_meteo_')
weather.rename(columns={0:'station_id','1_x': 'date', '2_x': 'val', '1_y':'station_name', '2_y':'latitude'
                    , 3:'longitude', 4:'type', 5:'unit'}, inplace=True)
weather.info()
weather.to_csv('./data/weather_complete.csv', encoding='utf-8')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39134 entries, 0 to 39133
Data columns (total 8 columns):
station_id      39134 non-null int64
date            39134 non-null object
val             39134 non-null float64
station_name    39134 non-null object
latitude        39134 non-null float64
longitude       39134 non-null float64
type            39134 non-null object
unit            39134 non-null object
dtypes: float64(3), int64(1), object(4)
memory usage: 2.7+ MB


In [5]:
weather.head()

Unnamed: 0,station_id,date,val,station_name,latitude,longitude,type,unit
0,6030,2013/11/14 11:00,69.0,Milano - via Brera,45.471192,9.187616,Wind Direction,degree
1,6030,2013/11/14 12:00,59.0,Milano - via Brera,45.471192,9.187616,Wind Direction,degree
2,6030,2013/11/14 13:00,56.0,Milano - via Brera,45.471192,9.187616,Wind Direction,degree
3,6030,2013/11/14 14:00,54.0,Milano - via Brera,45.471192,9.187616,Wind Direction,degree
4,6030,2013/11/14 15:00,61.0,Milano - via Brera,45.471192,9.187616,Wind Direction,degree


In [6]:
gates = pd.read_csv('../MI_Area_C/gates.csv')
gates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
58                  41 non-null int64
via Legnano         41 non-null object
Pedaggio            41 non-null object
45.4769732230969    41 non-null float64
9.1811081696448     41 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ KB


In [7]:
gates.head()

Unnamed: 0,58,via Legnano,Pedaggio,45.4769732230969,9.1811081696448
0,57,via di Porta Tenaglia,Pedaggio,45.47764,9.18175
1,59,via della Moscova,Pedaggio,45.478308,9.181916
2,60,via Volta,Pedaggio,45.480548,9.182938
3,61,corso Garibaldi,Pedaggio,45.480173,9.186543
4,62,via Milazzo,Pedaggio,45.479946,9.187852


In [8]:
transit = pd.read_csv('../MI_Area_C/transit_hdr.csv')

In [9]:
transit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9183475 entries, 0 to 9183474
Data columns (total 3 columns):
Timestamp    object
 Plate       object
 Gate        int64
dtypes: int64(1), object(2)
memory usage: 210.2+ MB


In [10]:
transit.head()

Unnamed: 0,Timestamp,Plate,Gate
0,2013-11-01 21:02:17,4f5716dcd615f21f658229a8570483a8,65
1,2013-11-01 16:12:39,0abba297ac142f63c604b3989d0ce980,64
2,2013-11-01 11:06:10,faafae756ce1df66f34f80479d69411d,57
3,2013-11-01 09:52:14,71f3ccc86145428f46e5c1fe60fb65b8,57
4,2013-11-01 10:49:58,4a4febfd7e5464fb071ae0ae7143a155,64


In [11]:
vehicles = pd.read_csv('../MI_Area_C/vehicles_hdr.csv')

In [12]:
vehicles.info(())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994901 entries, 0 to 994900
Data columns (total 6 columns):
Plate      994901 non-null object
 EURO      994901 non-null int64
 VType     994901 non-null int64
 FType     994901 non-null int64
 DPF       994901 non-null int64
 Length    994901 non-null int64
dtypes: int64(5), object(1)
memory usage: 45.5+ MB


In [13]:
vehicles.head()

Unnamed: 0,Plate,EURO,VType,FType,DPF,Length
0,0abba297ac142f63c604b3989d0ce980,3,4,1,2,4011
1,faafae756ce1df66f34f80479d69411d,1,4,1,2,3526
2,71f3ccc86145428f46e5c1fe60fb65b8,0,0,0,0,0
3,4a4febfd7e5464fb071ae0ae7143a155,0,0,0,0,0
4,27af738684988fbfd2b66867eaf3204c,3,4,1,2,3408


# TODO: Exploratory analysis