# P4

## Récupération des données

In [3]:
import pandas as pd
import numpy as np
import time
import os

# Option pour ne pas tout charger
ALLFILES = True
CT_DIR_DATA = 'C:\Work\OpenClassrooms\data\P4/'

filename = '2016_'
# Récupération du 1er fichier
dataraw = pd.read_csv(CT_DIR_DATA + filename + '01_ready.csv', sep="\t", encoding='utf-8')
print(dataraw.shape)
print(dataraw.columns)

if ALLFILES:
    # Concaténation de toutes les données
    for i in range(2, 13):
        n = CT_DIR_DATA + filename + '%0*d' % (2, i) + '_ready.csv'
        #print('reading', n, '...')
        f = pd.read_csv(n, sep="\t", encoding='utf-8')
        print(n, f.shape)
        print(f.columns)
        dataraw = dataraw.append(f)
        print(dataraw.shape)


(433298, 9)
Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'UNIQUE_CARRIER',
       'ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME', 'ARR_DELAY', 'HDAYS',
       'DEP_HOUR'],
      dtype='object')
C:\Work\OpenClassrooms\data\P4/2016_02_ready.csv (416097, 12)
Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'UNIQUE_CARRIER',
       'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'ARR_DELAY', 'DISTANCE',
       'CRS_ELAPSED_TIME', 'HDAYS', 'ARR_HOUR', 'DEP_HOUR'],
      dtype='object')
(849395, 13)
C:\Work\OpenClassrooms\data\P4/2016_03_ready.csv (473286, 12)
Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'UNIQUE_CARRIER',
       'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'ARR_DELAY', 'DISTANCE',
       'CRS_ELAPSED_TIME', 'HDAYS', 'ARR_HOUR', 'DEP_HOUR'],
      dtype='object')
(1322681, 13)
C:\Work\OpenClassrooms\data\P4/2016_04_ready.csv (474753, 12)
Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'UNIQUE_CARRIER',
       'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'ARR_DELAY', 'DISTANCE',
       'CRS_ELAPSED_T

In [4]:
# Méthodes d'import export
import pickle
from sklearn.externals import joblib
CT_DIR = '../flightdelay/save/'

def save_obj(obj, name):
    fn = CT_DIR + name + '.pkl'
    try:
        os.remove(fn)
    except OSError:
        pass
    with open(fn, 'wb+') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    print(fn, 'saved')

def load_obj(name):
    with open(CT_DIR + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
def save_sklearn_obj(obj, name):
    fn = CT_DIR + name + '.pkl'
    try:
        os.remove(fn)
    except OSError:
        pass
    joblib.dump(obj, fn)
    print(fn, 'saved')

def load_sklearn_obj(name):
    return joblib.load(CT_DIR + name + '.pkl')

In [5]:
# Export de la liste des companies
if ALLFILES:
    save_obj(list(sorted(set(dataraw['UNIQUE_CARRIER']))), 'model_carrier')


web/flightdelay/save/model_carrier.pkl saved


## Préparation au calcul

In [4]:
# Conversion pour les noms des colonnes
dataraw[['ORIGIN_AIRPORT_ID']] = dataraw[['ORIGIN_AIRPORT_ID']].astype(np.int64)

In [5]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

def TestModel(p_df, c):

    # On fait un choix de colonnes restreint en pensant à ce que l'utilisateur pourra renseigner dans la page web
    # Les numériques
    scalingDF = p_df[['HDAYS']].astype('float')
    # Les catégories
    categDF = p_df[['MONTH', 'ORIGIN_AIRPORT_ID', 'DEP_HOUR', 'DAY_OF_WEEK']]
    
    y_final = p_df['ARR_DELAY'].values
    y_final.mean()        
    
    # Binarisation en dummies pour garder la maitrise des noms des colonnes
    categDF_encoded = pd.get_dummies(categDF.astype(np.str))

    # Concaténation
    x_final = pd.concat([scalingDF, categDF_encoded], axis=1)

    # Sauvegarde de la liste des colonnes pour le web
    x_save = x_final[x_final.HDAYS==-1]
    save_obj(x_save, 'model_columns_'+c)

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x_final,y_final,test_size = 0.2,random_state = 0)

    # Seules les xnum premières colonnes sont numériques
    xnum = scalingDF.shape[1]
    x_train_numerical = x_train.iloc[:, 0:xnum]
    x_test_numerical = x_test.iloc[:, 0:xnum]

    # Création d'un scaler pour les valeurs numériques 
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # Qu'on entraine avec le training set
    scaler.fit(x_train_numerical) 

    # Sauvegarde pour le web
    save_sklearn_obj(scaler, 'model_scaler_'+c)

    x_train_numerical = scaler.transform(x_train_numerical)
    x_test_numerical = scaler.transform(x_test_numerical)

    x_train = x_train.copy()
    x_test = x_test.copy()
    x_train.loc[:, 0:xnum] = x_train_numerical
    x_test.loc[:, 0:xnum] = x_test_numerical

    min_MAE = 100

    # SGDRegressor est meilleur que:
    # LinearRegresssion
    # ElasticNetCV
    # DummyRegressor (heureusement !)
    
    # Le meilleur résulat est toujours avec alpha=10e-6
    SGD_params = [
#        {'alpha': 10.0**-np.arange(4,7)},
        {'alpha': 10.0**-np.arange(6,7)},
    ]

    # huber bcp mieux que squared_loss
    # et epsilon_insensitive encore mieux
    #loss_values = ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
    loss_values = ['epsilon_insensitive']

    # Le meilleur ration est toujours 0 !
    #l1_ratios = [0, 0.1, 0.2]
    l1_ratios = [0]

    for k, loss in enumerate(loss_values):
        for j, ratio in enumerate(l1_ratios):
            print('Lancement du calcul avec loss =', loss, ', ratio =', ratio)
            try:
                model = SGDRegressor(loss=loss, l1_ratio=ratio, penalty='elasticnet', random_state = 0, max_iter=1000)
                SGD_model = GridSearchCV(model, SGD_params, scoring = 'neg_mean_absolute_error', cv = 5)
                SGD_model.fit(x_train, y_train)
                # Prédiction sur le test set
                y_true, y_pred = y_test, SGD_model.predict(x_test)
                mae = mean_absolute_error(y_true, y_pred)
                if mae < min_MAE:
                    min_MAE = mae
                    print ('SGD regression : MAE = %.3f' % mae)
                    print("best_params_=", SGD_model.best_params_)
                    # Sauvegarde pour le web
                    save_sklearn_obj(SGD_model, 'model_SGD_'+c)

            except Exception as e:
                print('Erreur : ', e)
            print(time.ctime())

    print('Done!')


## GO

In [6]:
lst_carriers = list(sorted(set(dataraw['UNIQUE_CARRIER'])))
for c in lst_carriers:
    print(c)
    df = dataraw[dataraw.UNIQUE_CARRIER == c]
    print (df.shape)
    df.to_csv(CT_DIR_DATA + '2016_' + c + '.csv', sep="\t", encoding='utf-8', index=False)
    
    TestModel(df, c)

AA
(873209, 13)
web/flightdelay/save/model_cols_AA.pkl saved
web/flightdelay/save/model_scaler_AA.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 19.989
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_AA.pkl saved
Thu Feb  8 01:32:02 2018
Done!
AS
(165641, 13)
web/flightdelay/save/model_cols_AS.pkl saved
web/flightdelay/save/model_scaler_AS.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 14.370
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_AS.pkl saved
Thu Feb  8 01:42:02 2018
Done!
B6
(280251, 13)
web/flightdelay/save/model_cols_B6.pkl saved
web/flightdelay/save/model_scaler_B6.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 23.572
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_B6.pkl saved
Thu Feb  8 02:01:10 2018
Done!
DL
(964797, 13)
web/flightdelay/save/model_cols_DL.pkl saved
web/flightdelay/save/model_scaler_DL.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 17.268
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_DL.pkl saved
Thu Feb  8 03:40:52 2018
Done!
EV
(489510, 13)
web/flightdelay/save/model_cols_EV.pkl saved
web/flightdelay/save/model_scaler_EV.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 21.298
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_EV.pkl saved
Thu Feb  8 04:33:33 2018
Done!
F9
(94506, 13)
web/flightdelay/save/model_cols_F9.pkl saved
web/flightdelay/save/model_scaler_F9.pkl saved
Lancement du calcul avec loss = epsilon_insensitive , ratio = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


SGD regression : MAE = 24.722
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_F9.pkl saved
Thu Feb  8 04:38:33 2018
Done!
HA
(76334, 13)
web/flightdelay/save/model_cols_HA.pkl saved
web/flightdelay/save/model_scaler_HA.pkl saved
Lancement du calcul avec loss = epsilon_insensitive , ratio = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


SGD regression : MAE = 9.291
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_HA.pkl saved
Thu Feb  8 04:41:25 2018
Done!
NK
(135371, 13)
web/flightdelay/save/model_cols_NK.pkl saved
web/flightdelay/save/model_scaler_NK.pkl saved
Lancement du calcul avec loss = epsilon_insensitive , ratio = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


SGD regression : MAE = 22.555
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_NK.pkl saved
Thu Feb  8 04:47:53 2018
Done!
OO
(593938, 13)
web/flightdelay/save/model_cols_OO.pkl saved
web/flightdelay/save/model_scaler_OO.pkl saved
Lancement du calcul avec loss = epsilon_insensitive , ratio = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


SGD regression : MAE = 19.226
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_OO.pkl saved
Thu Feb  8 06:00:23 2018
Done!
UA
(538190, 13)
web/flightdelay/save/model_cols_UA.pkl saved
web/flightdelay/save/model_scaler_UA.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 21.154
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_UA.pkl saved
Thu Feb  8 06:42:20 2018
Done!
VX
(66500, 13)
web/flightdelay/save/model_cols_VX.pkl saved
web/flightdelay/save/model_scaler_VX.pkl saved
Lancement du calcul avec loss = epsilon_insensitive , ratio = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


SGD regression : MAE = 20.650
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_VX.pkl saved
Thu Feb  8 06:44:55 2018
Done!
WN
(1277949, 13)
web/flightdelay/save/model_cols_WN.pkl saved
web/flightdelay/save/model_scaler_WN.pkl saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Lancement du calcul avec loss = epsilon_insensitive , ratio = 0
SGD regression : MAE = 15.497
best_params_= {'alpha': 9.9999999999999995e-07}
web/flightdelay/save/model_SGD_WN.pkl saved
Thu Feb  8 08:15:04 2018
Done!
