# Notebook "5_Besoin_3_otimisation_cmb_10_profils"


## Description

Ce notebook se focalise sur l'analyse exploratoire concernant l'optmisation de la CMB 10 profils naive

## Structure du notebook

Ce notebook est construit de plusieurs parties :
- I. Import des bibliothèques
- II. Chargement des données et des profils
- III. Analyse exploratoire

## Prérequis

### Source de données

Pour que le notebook fonctionne, il est nécessaire que le fichier project_config.yml soit situé au même niveau que ce notebook : ce fichier permet la connexion à la source de données.

### Installation des prérequis techniques

Le notebook fonctionne sur Python 3.7 avec les bibliothèques suivantes :
- pandas
- psycopg2
- sqlachemy
- sklearn
- numpy
- yaml
- matplotlib

# I - Import des bibliothèques

In [7]:
# handling postgres database
import psycopg2
import pandas.io.sql as sqlio
import pandas as pd 
from sqlalchemy import create_engine
from io import StringIO
from sklearn.linear_model import Lasso
from sklearn import metrics
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import yaml
from IPython.display import clear_output
import itertools

Profiles = ['HOA',
    'BBOA',
    'OPOA/OBBOA',
    'BSOA (marine)',
    'BSOA (isoprene)',
    'ASOA (nitro-PAHs)',
    'ASOA (oxy-PAHs)',
    'ASOA (phenolic compounds oxidation)',
    'ASOA (toluene oxidation)',
    'SOA (unknown)']

# II - Chargement des données et des profils

## II.a Recherche des identifiants de connexion

In [8]:
with open(r'.\project_config.yml') as file:
    dbInfo = yaml.load(file, Loader=yaml.FullLoader)
    HOSTNAME = dbInfo["project-database"]["hostname"]
    DATABASE = dbInfo["project-database"]["name"]
    USER = dbInfo["project-database"]["user"]
    PASSWORD = dbInfo["project-database"]["password"]
    PORT = "5432"

## II.b Recherche des profils

In [9]:
df = pd.read_excel('./pmf_profiles.xlsx', engine='openpyxl')

df.drop(columns=['Unnamed: 0'], inplace=True)
df.rename(columns={'Unnamed: 1': 'amus'}, inplace=True)
df = df[["amus"] + Profiles]
pro = df.columns[1:]

## II.c Récupération des données

In [11]:
## SELECTION DE LA DATE
start_date = '2015-03-01'
end_date = '2015-03-30'
## FIN DE LA SELECTION DE LA DATE

sql = f"""SELECT to_char(date, 'YYYY-MM-DD HH24:00:00') FROM(SELECT date FROM public.data_receptor 
          WHERE date >= '{start_date}' AND date <= '{end_date}' GROUP BY 1 ORDER BY 1) AS foo;"""
connection = psycopg2.connect(user = USER, password = PASSWORD, host = HOSTNAME, port = PORT, database = DATABASE)
df_dates = sqlio.read_sql_query(sql, connection)
connection.close()
dates = df_dates.T.values[0]

OperationalError: could not translate host name "13.postgresql.dev.asterix.heka.ai" to address: Unknown host


# III. Analyse exploratoire

## III.a Création de la boucle principale

In [None]:
def global_loop(profiles):
    contribution = [[] for i in profiles]
    contribution_mae = []
    contribution_mse = []
    connection = psycopg2.connect(user = USER, password = PASSWORD, host = HOSTNAME, port = PORT, database = DATABASE)
    for date in dates:    
        sql = f"""SELECT * FROM public.data_receptor where date = '{date}' 
            order by mass;"""
        df_receptor_data = sqlio.read_sql_query(sql, connection)
        df_receptor_data = df_receptor_data[df_receptor_data.columns]
        cor = df.merge(df_receptor_data, left_on='amus', right_on='mass').drop(columns=['mass', 'amus'])

        X_train = cor[profiles].values
        y_train = cor['value'].values.reshape(-1,1)

        alpha = 0.0001
        lasso = Lasso(fit_intercept=False, alpha=alpha, max_iter=5000)     # We train without intercept and we shoose to have only positive values
        lasso.fit(X_train, y_train)                                        #training the algorithm
        clear_output(wait=True)
        print(profiles)      
        print(date)

        data_profile = df[profiles].values
        construct = np.dot(data_profile, lasso.coef_)

        # MAE error
        error_mae = np.sum(np.abs(construct - [a[0] for a in y_train])) / len(y_train)
        contribution_mae.append(error_mae)

        # MSE error
        error_mse = np.sum(np.abs(construct**2 - np.array([a[0] for a in y_train])**2)) / len(y_train)
        contribution_mse.append(error_mse)

        for n, prof in enumerate(profiles):
            contribution[n].append(lasso.coef_[n])
    connection.close()
    return contribution, contribution_mae, contribution_mse

## III.b CMB 10 profils "optimisée"

### III.b.1 Boucle de traitements

In [None]:
list_profils = []
seuil_last = np.inf

for i in range(10):
    for profil in pro:
        selected_profil_approx = None
        if profil not in list_profils:
            result, error_mae, error_mse = global_loop(list_profils + [profil])
            a = np.mean(error_mae)
            if a < seuil_last:
                seuil_last = a
                selected_profil_approx = profil
        if selected_profil_approx is not None:
            list_profils.append(selected_profil_approx)
    else:
        break

['HOA', 'OPOA/OBBOA', 'BSOA (marine)', 'BSOA (isoprene)', 'ASOA (nitro-PAHs)', 'ASOA (phenolic compounds oxidation)', 'ASOA (toluene oxidation)', 'SOA (unknown)']
2015-03-14 00:00:00


### III.b.2 Résultats

In [None]:
result, error_mae, error_mse = global_loop(list_profils)
print(np.mean(error_mae))

['HOA', 'OPOA/OBBOA', 'BSOA (marine)', 'BSOA (isoprene)', 'ASOA (nitro-PAHs)', 'ASOA (phenolic compounds oxidation)', 'ASOA (toluene oxidation)', 'SOA (unknown)']
2015-03-14 00:00:00
0.006545532813622266


## III.c CMB 10 profils "naïve"

### III.c.2 Boucle de traitements

In [None]:
seuil = np.inf
for L in range(0, len(pro)+1):
    for subset in itertools.combinations(pro, L):
        if subset != ():
            result, error_mae, error_mse = global_loop(list(subset))
            a = np.mean(error_mae)
            if a < seuil:
                seuil = a
                selected_profil = list(subset)

['HOA', 'BBOA', 'OPOA/OBBOA', 'BSOA (marine)', 'BSOA (isoprene)', 'ASOA (nitro-PAHs)', 'ASOA (oxy-PAHs)', 'ASOA (phenolic compounds oxidation)', 'ASOA (toluene oxidation)', 'SOA (unknown)']
2015-03-14 00:00:00


In [None]:
selected_profil

['HOA',
 'BBOA',
 'OPOA/OBBOA',
 'BSOA (marine)',
 'ASOA (nitro-PAHs)',
 'ASOA (phenolic compounds oxidation)',
 'ASOA (toluene oxidation)',
 'SOA (unknown)']

### III.c.2 Résultats

In [None]:
result, error_mae, error_mse = global_loop(selected_profil)
print(np.mean(error_mae))

['HOA', 'BBOA', 'OPOA/OBBOA', 'BSOA (marine)', 'ASOA (nitro-PAHs)', 'ASOA (phenolic compounds oxidation)', 'ASOA (toluene oxidation)', 'SOA (unknown)']
2015-03-14 00:00:00
0.006028899552365574
