In [109]:
import os
import pandas as pd
import numpy as np
import csv
#from sklearn.svm


def merge_data(folder='./', preffix='mi_pollution', encoding='ISO-8859-1'):
    # Joins all the preffix*.csv files and the data from stations of the preffix_legend-mi.csv
    if not os.path.exists(folder):
        print(f"folder {folder} doesn't exist's, no data merged")
        return

    dataframes = []
    names = pd.DataFrame()
    for filename in os.listdir(folder):
        file = folder+filename
        if filename.startswith(preffix) and filename.endswith(".csv"):
            if csv.Sniffer().has_header(file):
                df = pd.read_csv(file, header=None, skiprows=[0], encoding=encoding)
            else: 
                df = pd.read_csv(file, encoding=encoding)
            
            dataframes.append(df)
            
        if 'legend' in filename and filename.endswith(".csv"):
            names = pd.read_csv(file, header=None, encoding=encoding)
            names.rename(columns=pd.to_numeric)

    total = pd.DataFrame()
    for df in dataframes:
        total = total.append(df)

    total = pd.merge(total, names, left_on=0, right_on=0, how='inner')
    
    return total

In [120]:
#assumes no header in the names file
path = '/home/nickman/code/MADAS/II/Lab Rossi/MI_Air_Quality/data/'
air = merge_data(path)
air.rename(columns={0:'station_id','1_x': 'date', '2_x': 'val', '1_y':'station_name', '2_y':'latitude'
                    , 3:'longitude', 4:'particle', 5:'unit',6:'date_format'}, inplace=True)
print(air.head())

   station_id              date   val           station_name   latitude  \
0        5722  2013/12/31 02:00   9.0  Milano - Parco Lambro  45.491645   
1        5722  2013/12/31 03:00   9.0  Milano - Parco Lambro  45.491645   
2        5722  2013/12/31 04:00  10.0  Milano - Parco Lambro  45.491645   
3        5722  2013/12/31 05:00  10.0  Milano - Parco Lambro  45.491645   
4        5722  2013/12/31 06:00  12.0  Milano - Parco Lambro  45.491645   

   longitude particle   unit         date_format  
0    9.24671    Ozono  µg/m3  YYYY/MM/DD HH24:MI  
1    9.24671    Ozono  µg/m3  YYYY/MM/DD HH24:MI  
2    9.24671    Ozono  µg/m3  YYYY/MM/DD HH24:MI  
3    9.24671    Ozono  µg/m3  YYYY/MM/DD HH24:MI  
4    9.24671    Ozono  µg/m3  YYYY/MM/DD HH24:MI  


In [111]:
air.groupby('station_name').particle.count()

station_name
Milano - P.zza  Zavattari       5838
Milano - P.zza Abbiategrasso    2179
Milano - Parco Lambro           3473
Milano - via Senato             6789
Milano - via Verziere           3986
Milano - viale Liguria          4383
Milano - viale Marche           5862
Milano -via Carlo Pascal        8742
Name: particle, dtype: int64

In [112]:
air.groupby(['station_name','station_id']).particle.count()

station_name                  station_id
Milano - P.zza  Zavattari     5506          1459
                              5841          1460
                              6062          1460
                              6372          1459
Milano - P.zza Abbiategrasso  5552           719
                              6344          1460
Milano - Parco Lambro         5550          1061
                              5722          1352
                              6340          1060
Milano - via Senato           5551          2203
                              5834          1461
                              6057          1461
                              6354          1461
                              10320           58
                              17122           58
                              20005           87
Milano - via Verziere         5531          1311
                              5725          1311
                              6366          1311
                            

In [113]:
air['city'], air['station'] = air.station_name.str.split('-',1).str
air['date'] = pd.to_datetime(air['date'], format='%Y/%m/%d %H:%M')
air['day_of_week'] = air['date'].dt.weekday_name

In [77]:
air.head()

Unnamed: 0,station_id,date,val,station_name,latitude,longitude,particle,unit,date_format,city,station,day_of_week
0,5722,2013-12-31 01:00:00,10.0,Milano - Parco Lambro,45.491645,9.24671,Ozono,µg/m3,YYYY/MM/DD HH24:MI,Milano,Parco Lambro,Tuesday
1,5722,2013-12-31 02:00:00,9.0,Milano - Parco Lambro,45.491645,9.24671,Ozono,µg/m3,YYYY/MM/DD HH24:MI,Milano,Parco Lambro,Tuesday
2,5722,2013-12-31 03:00:00,9.0,Milano - Parco Lambro,45.491645,9.24671,Ozono,µg/m3,YYYY/MM/DD HH24:MI,Milano,Parco Lambro,Tuesday
3,5722,2013-12-31 04:00:00,10.0,Milano - Parco Lambro,45.491645,9.24671,Ozono,µg/m3,YYYY/MM/DD HH24:MI,Milano,Parco Lambro,Tuesday
4,5722,2013-12-31 05:00:00,10.0,Milano - Parco Lambro,45.491645,9.24671,Ozono,µg/m3,YYYY/MM/DD HH24:MI,Milano,Parco Lambro,Tuesday


In [114]:
air.dtypes

station_id               int64
date            datetime64[ns]
val                    float64
station_name            object
latitude               float64
longitude              float64
particle                object
unit                    object
date_format             object
city                    object
station                 object
day_of_week             object
dtype: object

In [115]:
air.to_csv(path+'total_airquality_milano.csv', sep=';', encoding='utf-8')

In [122]:
path = '/home/nickman/code/MADAS/II/Lab Rossi/MI_Weather_Station_Data/data/'
weather = merge_data(path, preffix='mi_meteo')
weather.rename(columns={0:'station_id','1_x': 'date', '2_x': 'val', '1_y':'station_name', '2_y':'latitude'
                    , '3_y':'longitude', '4_y':'measure', '5_y':'unit'}, inplace=True)
weather['city'], air['station'] = air.station_name.str.split('-',1).str
weather['date'] = pd.to_datetime(air['date'], format='%Y/%m/%d %H:%M')
weather['day_of_week'] = weather['date'].dt.weekday_name
weather.head()

Unnamed: 0,station_id,date,val,3_x,4_x,5_x,station_name,latitude,longitude,measure,unit,city,day_of_week
0,6120,2013-12-31 02:00:00,2.2,,,,Milano - via Brera,45.471192,9.187616,Wind Speed,m/s,Milano,Tuesday
1,6120,2013-12-31 03:00:00,1.8,,,,Milano - via Brera,45.471192,9.187616,Wind Speed,m/s,Milano,Tuesday
2,6120,2013-12-31 04:00:00,2.0,,,,Milano - via Brera,45.471192,9.187616,Wind Speed,m/s,Milano,Tuesday
3,6120,2013-12-31 05:00:00,2.1,,,,Milano - via Brera,45.471192,9.187616,Wind Speed,m/s,Milano,Tuesday
4,6120,2013-12-31 06:00:00,1.9,,,,Milano - via Brera,45.471192,9.187616,Wind Speed,m/s,Milano,Tuesday


# TODO: Exploratory analysis