In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import os
import pickle
import numpy as np

In [2]:
def combineDF(dfList):
    tempDF = []
    for dF in dfList:
        df = pd.read_csv(dataDir + dF)
        tempDF.append(df)
    df = pd.concat(tempDF)
    df = df.set_index("Time_Start")
    df.sort_index(axis=1)
    return df

In [3]:
rootDir = 'D:/SARP/SARP-Aerosol-ML-BrC/Data/'
dataDir = rootDir + 'Cleaned/'
savePath = rootDir + 'Network/'
dataList = os.listdir(dataDir)

dataSet = combineDF(dataList)

In [4]:
def angstromExponentAbs(df):
    lowerLambda = "WSAbs320_Aero"
    upperLambda = "WSAbs420_Aero"
    cols = ["",""]
    for output in df:
        if lowerLambda in output:
            cols[0]=output
        elif upperLambda in output:
            cols[1]=output
    goalDF=dataSet[cols].copy()
    
    denom = -np.log(320/420)
    
    goalDF["AEA"] = goalDF.apply(lambda x: (np.log(x[0]/x[1])/denom), axis=1)
    
    goalDF['AEA'] = goalDF['AEA'].fillna(goalDF['AEA'].median())
    
    return goalDF['AEA']

In [6]:
inputHeaders = []
outputHeaders = []

for header in dataSet:
    if header not in inputHeaders and header not in outputHeaders:
        if "WEBER" in header:
            outputHeaders.append(header)
        else:
            if "YANG" not in header and "Unnamed" not in header:
                inputHeaders.append(header)
            elif "Relative_Humidity" in header or "Solar_Zenith_Angle" in header:
                inputHeaders.append(header)

naList = []
for inpuT in inputHeaders:
    if dataSet[inpuT].isna().all():
        naList.append(inpuT)
    elif dataSet[inpuT].isna().any():
        dataSet[inpuT] = dataSet[inpuT].fillna(dataSet[inpuT].median())

dataSet.drop(columns=naList)


for inpuT in naList:
    inputHeaders.remove(inpuT)


for output in outputHeaders:
    dataSet[output] = dataSet[output].fillna(dataSet[output].median())


    
outPutSet = angstromExponentAbs(dataSet)


['WSAbs320_Aero_WEBER', 'WSAbs420_Aero_WEBER']
Time_Start
66840.0    2.455868
67260.0    1.037779
67740.0    6.185801
68220.0    4.756671
68820.0    9.092865
             ...   
81938.0    4.898056
82089.0    4.953729
82240.0    4.953729
82541.0    4.953729
82842.0    3.624277
Name: AEA, Length: 4832, dtype: float64


  from ipykernel import kernelapp as app


In [7]:
dataSet.to_csv(dataDir+"input")
outPutSet.to_csv(dataDir+"output")

In [190]:
x = dataSet[inputHeaders].values
y = outPutSet.values

In [191]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=.75, random_state=4)

In [6]:
multiregr = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=30, random_state=0, verbose=1))

In [7]:
multiregr.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   18.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   16.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   16.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   17.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   17.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   17.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   20.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   22.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   20.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   20.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   20.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   20.6s finished
[Parallel(n_jobs=1)]: Us

MultiOutputRegressor(estimator=RandomForestRegressor(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     criterion='mse',
                                                     max_depth=30,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                

In [9]:
y_multi = multiregr.predict(x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [192]:
regr_rf = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=0, verbose=1)

In [193]:
regr_rf.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.5min finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=1, warm_start=False)

In [157]:
print(multiregr.score(x_test,y_test))

Index(['Unnamed: 0', 'Time_Stop', 'Day_Of_Year_YANG', 'Latitude_YANG',
       'Longitude_YANG', 'MSL_GPS_Altitude_YANG', 'HAE_GPS_Altitude_YANG',
       'Pressure_Altitude_YANG', 'Radar_Altitude_YANG', 'Ground_Speed_YANG',
       ...
       'jHOBr_OH_Br_CAFS_HALL', 'jBrNO_Br_NO_CAFS_HALL',
       'jBrONO_Br_NO2_CAFS_HALL', 'jBrONO_BrO_NO_CAFS_HALL',
       'jBrNO2_Br_NO2_CAFS_HALL', 'jBrONO2_BrO_NO2_CAFS_HALL',
       'jBrONO2_Br_NO3_CAFS_HALL', 'jBrCl_Br_Cl_CAFS_HALL',
       'jCHBr3_NoProductsSpecified_CAFS_HALL', 'Fractional_Day'],
      dtype='object', length=536)


In [14]:
with open(savePath + "multi_rf_v1", "wb") as file:
    pickle.dump(multiregr, file)

In [194]:
print(regr_rf.score(x_test,y_test))

0.8630070723955717


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [195]:
with open(savePath + "rf_v1", "wb") as file:
    pickle.dump(regr_rf,file)

In [8]:
print(outPutSet.median())
print(outPutSet.mean())
print(outPutSet)

5.108496179160695
5.204377988867245
Time_Start
66840.0    2.455868
67260.0    1.037779
67740.0    6.185801
68220.0    4.756671
68820.0    9.092865
             ...   
81938.0    4.898056
82089.0    4.953729
82240.0    4.953729
82541.0    4.953729
82842.0    3.624277
Name: AEA, Length: 4832, dtype: float64
