# ROHP-PAZ Data modelling

### Ignacio Cobas (UAB)

## Introduction

### Imports

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import torch

from imblearn.over_sampling import SMOTE


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures


from sklearn.linear_model import TweedieRegressor, PassiveAggressiveRegressor, LogisticRegression, Ridge
from sklearn.linear_model import Lasso, ElasticNet, LassoLars, LinearRegression, BayesianRidge, ARDRegression
from sklearn.linear_model import SGDRegressor, RANSACRegressor, HuberRegressor, QuantileRegressor

from sklearn.cluster import KMeans

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


from sklearn.metrics import mean_squared_error

from time import time
from tabulate import tabulate

import copy

### Data loading

#### Data transformation moved to PAZ-Data_trans

In [2]:
full_data = pd.read_pickle(r'../data/full_data.pkl')
full_data = full_data.drop(['startimeUTC', 'az_surf', 'precipBelow12', 'irTemp_2', 'irTemp_below', 'dphi_0010', 'height_flag_comb'], axis=1)

In [3]:
full_data.head()

Unnamed: 0,roid,lat,lon,meanP_2,region,profiles_h036,profiles_h038,profiles_h040,profiles_h042,profiles_h044,...,Vp_h142,Vp_h144,Vp_h146,Vp_h148,Vp_h150,Vp_h152,Vp_h154,Vp_h156,Vp_h158,Vp_h160
0,PAZ1.2018.130.20.55.G18,37.48,126.1,0.0,1,-0.106622,-0.169008,1.89789,1.59365,2.53151,...,0.001481,0.001213,0.000993,0.000813,0.000667,0.00057,0.000541,0.000513,0.000486,0.000461
1,PAZ1.2018.130.21.01.G19,4.68,153.15,0.03,2,-0.358579,0.625408,-0.823484,-0.457033,0.424057,...,0.003695,0.002991,0.002366,0.001922,0.001534,0.001213,0.000965,0.000817,0.000645,0.000518
2,PAZ1.2018.130.21.10.G24,-16.88,135.77,0.0,0,-0.722735,-0.492319,0.358507,-0.264436,0.682555,...,0.002862,0.00237,0.001966,0.00163,0.001351,0.00112,0.000946,0.000843,0.000752,0.00067
3,PAZ1.2018.130.21.16.G10,-36.43,112.66,0.0,1,-0.855325,-0.502248,-0.352526,-0.611806,-1.13704,...,0.000987,0.000854,0.000739,0.000641,0.00056,0.00052,0.00049,0.000462,0.000436,0.00041
4,PAZ1.2018.130.21.39.G29,-53.91,-64.58,0.01,1,-0.208949,-0.162441,0.41712,0.42887,0.740477,...,0.000535,0.000502,0.000477,0.000461,0.000444,0.000428,0.000412,0.000397,0.000384,0.000385


In [4]:
full_data.describe()

Unnamed: 0,lat,lon,meanP_2,region,profiles_h036,profiles_h038,profiles_h040,profiles_h042,profiles_h044,profiles_h046,...,Vp_h142,Vp_h144,Vp_h146,Vp_h148,Vp_h150,Vp_h152,Vp_h154,Vp_h156,Vp_h158,Vp_h160
count,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,...,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0,79626.0
mean,-4.601046,-11.826029,0.12437,1.199558,0.377428,0.372485,0.369929,0.363114,0.356734,0.351934,...,0.001504,0.001303,0.001135,0.000993,0.000875,0.000778,0.000702,0.000646,0.000596,0.000552
std,35.73541,107.680083,0.559441,1.163198,1.49752,1.484715,1.470275,1.466265,1.454742,1.444821,...,0.001083,0.000861,0.000682,0.000539,0.000426,0.000336,0.000265,0.000218,0.00018,0.00015
min,-54.99,-180.0,0.0,0.0,-4.86702,-4.49009,-4.37159,-4.94831,-5.19117,-4.04266,...,0.000117,0.000126,4.6e-05,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,2e-06
25%,-37.93,-105.4975,0.0,1.0,-0.350406,-0.335246,-0.321802,-0.321125,-0.314243,-0.301621,...,0.000663,0.000626,0.000592,0.000561,0.000536,0.000515,0.000496,0.000478,0.00046,0.000441
50%,-12.56,-25.47,0.0,1.0,0.134893,0.127959,0.131544,0.122093,0.119715,0.119037,...,0.001062,0.000956,0.000862,0.000777,0.000705,0.000649,0.000608,0.000574,0.000542,0.000509
75%,28.79,84.6175,0.02,1.0,0.700736,0.673924,0.658615,0.641563,0.624132,0.601545,...,0.002103,0.001806,0.001561,0.001347,0.001162,0.001,0.000871,0.000783,0.000706,0.000639
max,54.99,180.0,26.66,5.0,35.6476,33.7817,34.0975,32.9811,34.8729,34.8365,...,0.011836,0.009289,0.007452,0.006239,0.005084,0.004143,0.003376,0.002886,0.00241,0.001999


In [5]:
val_data = full_data.iloc[int(full_data.shape[0]*0.7):]
val_data = val_data.reset_index(drop = True)

In [6]:
full_data = full_data.iloc[:int(full_data.shape[0]*0.7)]

### Selecting columns

In [7]:
def get_data_height(full_data, min_height, n_cols=1):
    # Returns a Dataframe containing only variables on selected heights
    # If n_cols > 1 it returns varibles on column min_height and the next n_cols heights
    start = list(full_data.columns).index(next(filter(lambda c: c[:8] == 'profiles', full_data.columns)))
    data_columns = []
    for col in range(n_cols):
        if min_height < 100:
            height = '0' + str(int(min_height))
        else:
            height = str(int(min_height))

        data_columns += [c for c in full_data.columns if c[-3:] == height]
        min_height += 2
    
    data_columns += list(full_data.columns[1:start])
    data = full_data.loc[:, data_columns] 
    
    return data

## Model

### Regressors

Moved them to archive/PAZ-Regressors

### Classifiers

In [8]:
def classify_data(data, height=12, profile_threshold=4):
    # Adds a column that indicates wether the profile is higher or lower than profile_threshold
    # and deletes the profile column
    
    data_classified = pd.DataFrame(data[data.columns].values, columns=data.columns)
    
    data_classified['rain'] = 0
    data_classified.loc[data_classified.iloc[:, 0] > profile_threshold, 'rain'] = 1
    
    data_classified = data_classified.drop(data.columns[0], axis=1)
    
    return data_classified

In [9]:
profile_threshold = 4
height = 80

In [10]:
data = get_data_height(full_data, height)
data = classify_data(data, height=height, profile_threshold=profile_threshold)

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
classifiers = [
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    VotingClassifier(estimators=[('gnb1', GaussianNB()),
                                  ('gnb2', GaussianNB()),
                                  ('gnb3', GaussianNB()),
                                  ('gnb4', GaussianNB()),
                                  ('gnb5', GaussianNB()),
                                  ('qda1', QuadraticDiscriminantAnalysis(reg_param=0.1)),
                                 ], voting='hard')
    ]

In [12]:
error_classifiers = []
times = []
for clf in classifiers:
    st = time()
    clf.fit(X_train, y_train)
    error_classifiers.append(np.round(clf.score(X_test, y_test), 3))
    times.append(np.round(time() - st, 3))
print('Error:', error_classifiers)
print('Time:', times)

Error: [0.992, 0.992, 0.992, 0.941, 0.96, 0.941]
Time: [0.144, 0.146, 1.734, 0.011, 0.018, 0.245]


In [13]:
best_clf = classifiers[np.argmax(error_classifiers)]
print(best_clf)
print(best_clf.score(X_test, y_test))

DecisionTreeClassifier(max_depth=5)
0.9924650161463939


The good results are probably a consequence of the imbalanced dataset, so it should be a good idea to upsample the rain class.

#### Upsampling with SMOTE

https://towardsdatascience.com/upsampling-with-smote-for-classification-projects-e91d7c44e4bf

In [14]:
def upsample_SMOTE(X_train, y_train, ratio=1.0):
    # Upsamples minority class using SMOTE.
    # Ratio argument is the percentage of the upsampled minority class in relation
    # to the majority class. Default is 1.0
    
    sm = SMOTE(random_state=23, sampling_strategy=ratio)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
    return X_train_sm, y_train_sm

In [15]:
profile_threshold = 4
height = 80

In [16]:
data = get_data_height(full_data, height)
data = classify_data(data, height=height, profile_threshold=profile_threshold)

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train, y_train = upsample_SMOTE(X_train, y_train, 1.0)

In [17]:
error_classifiers = []
times = []
for clf in classifiers:
    st = time()
    clf.fit(X_train, y_train)
    error_classifiers.append(np.round(clf.score(X_test, y_test), 3))
    times.append(np.round(time() - st, 3))
print('Error:', error_classifiers)
print('Time:', times)

Error: [0.948, 0.954, 0.959, 0.904, 0.926, 0.904]
Time: [0.325, 0.317, 3.982, 0.015, 0.035, 0.272]


In [18]:
best_clf = classifiers[np.argmax(error_classifiers)]
print(best_clf)
print(best_clf.score(X_test, y_test))

AdaBoostClassifier()
0.9588087549336204


### One model per height

In [19]:
def model_every_height(full_data, parameters):
    # creats and trains one model for every height and returns them and the scores
    
    # parameters
    profile_threshold = parameters['profile_threshold']
    max_height = parameters['max_height']
    min_height = parameters['min_height']
    step = parameters['step']
    classifier = parameters['classifier']

    predictions = dict.fromkeys(range(min_height, max_height, step))
    # scores = dict.fromkeys(range(min_height, max_height, step))

    scores = dict.fromkeys(['Total', 'Rain1', 'Rain0'], np.array([]))
    classifiers = dict.fromkeys(range(min_height, max_height, step))
   
    # height loop
    for height in range(min_height, max_height, step):
        classifier = parameters['classifier']
        try:
            # get data
            data = get_data_height(full_data, height)
            data = classify_data(data, height, profile_threshold)

            X = data.iloc[:, :-1]
            y = data.iloc[:, -1]
            
            X_train, X_test, y_train, y_test = train_test_split(X, y)
            
            # upsample
            X_train, y_train = upsample_SMOTE(X_train, y_train, 0.5)
            
            # Train
            classifier.fit(X_train, y_train)

            # Test
            total = classifier.predict(X_test)
            rain1 = classifier.predict(X_test[y_test == 1])
            rain0 = classifier.predict(X_test[y_test == 0])
            
            predictions[height] = {'Total': total, 'Rain1': rain1, 'Rain0': rain0}
            
            total = (total == y_test).mean()
            rain1 = (rain1 == 1).mean()
            rain0 = (rain0 == 0).mean()
            
            # scores[height] = {'Total': total, 'Rain1': rain1, 'Rain0': rain0}
            
            scores['Total'] = np.append(scores['Total'], total)
            scores['Rain1'] = np.append(scores['Rain1'], rain1)
            scores['Rain0'] = np.append(scores['Rain0'], rain0)
            
            classifiers[height] = copy.deepcopy(classifier)

        except Exception as e:
            predictions[height] = {'Total': -1, 'Rain1': -1, 'Rain0': -1}
            # scores[height] = {'Total': 0, 'Rain1': 0, 'Rain0': 0}
            
            scores['Total'] = np.append(scores['Total'], 0)
            scores['Rain1'] = np.append(scores['Rain1'], 0)
            scores['Rain0'] = np.append(scores['Rain0'], 0)
            
            classifiers[height] = None
            
    scores = pd.DataFrame(scores)
    scores.index = range(min_height, max_height, step)
    
    return scores, classifiers, predictions

In [20]:
# by setting min height to 60 and max height to 140, precision goes to 80%
# at very low heights (or very high) there probably are very few rain clouds,
# which means that upsampling does not have much to start with, so it doesn't do very well

parameters = {
    'profile_threshold': 2,
    'max_height': 140,
    'min_height': 60,
    'step': 2,
    'classifier': best_clf
}

In [21]:
scores, classifiers, predictions = model_every_height(full_data, parameters)

In [22]:
scores.Total.mean()

0.9599228561176893

In [23]:
scores.Rain1.mean()

0.8129090003867023

In [24]:
scores.Rain0.mean()

0.9627229539859534

In [25]:
predictions[60]['Rain0']

array([0, 0, 0, ..., 0, 0, 0])

### Building a regression

In [26]:
precision_parameters = {
    'min_profile_threshold': 1,
    'max_profile_threshold': 5,
    'precision': 10,
    
    'max_height': 140,
    'min_height': 60,
    'step': 2,
    'classifier': best_clf
}

In [27]:
def precision_model_every_height(full_data, precision_parameters):
    # builds many models for each height, one for each profile_threshold
    # with these predictions a regression can be built
    
    min_profile_threshold = precision_parameters['min_profile_threshold']
    max_profile_threshold = precision_parameters['max_profile_threshold']
    precision = precision_parameters['precision']
    classifier = precision_parameters['classifier']
    
    results = dict.fromkeys(np.linspace(min_profile_threshold, max_profile_threshold, precision))
    
    for profile_threshold in np.linspace(min_profile_threshold, max_profile_threshold, precision):
        
        parameters = {
            'profile_threshold': profile_threshold,
            'max_height': precision_parameters['max_height'],
            'min_height': precision_parameters['min_height'],
            'step': precision_parameters['step'],
            'classifier': classifier
        }
        
        scores, classifiers, predictions = model_every_height(full_data, parameters)
        
        results[profile_threshold] = {'Score': scores, 'Classifiers': classifiers, 'Predictions': predictions}

    return results

In [28]:
results = precision_model_every_height(full_data, precision_parameters)

In [29]:
headers = ['Profile threshold', 'Total', 'Rain1', 'Rain0']
mean_scores = [[i]+list(results[i]['Score'].mean().values) for i in list(results.keys())]

print(tabulate(mean_scores, headers=headers))

  Profile threshold     Total     Rain1     Rain0
-------------------  --------  --------  --------
            1        0.903127  0.565576  0.919026
            1.44444  0.942253  0.75496   0.947898
            1.88889  0.95706   0.827938  0.96016
            2.33333  0.963312  0.833154  0.965246
            2.77778  0.968581  0.796861  0.970036
            3.22222  0.971697  0.789391  0.972803
            3.66667  0.975237  0.799842  0.976107
            4.11111  0.977607  0.768017  0.978386
            4.55556  0.979874  0.806794  0.980464
            5        0.957079  0.75822   0.957589


### Multiheight

In [30]:
def multiheight(full_data, height, extra):
    # creats and trains one model for a specific height but adds data from higher and lower heights (extra)
    # extra=0 doesn't add data from any other heights
    # extra=1 means adding one above and one below
    # extra=2 means adding two above and two below
    
    metadata = get_data_height(full_data, height, 1)
    metadata['height'] = 0
    columns = [x for x in list(metadata.columns[:5])] + list(metadata.columns[5:])
    
    data = pd.DataFrame(columns=columns)
    
    for i in range(1, extra+1):
        data2 = get_data_height(full_data, height+2*i, 1)
        data2['height'] = i
        data2 = data2.rename(columns=dict(zip(data2.columns, columns)))

        data0 = get_data_height(full_data, height-2*i, 1)
        data0['height'] = -i
        data0 = data0.rename(columns=dict(zip(data0.columns, columns)))
        
        data = pd.concat([data, data0, data2])

    data = pd.concat([metadata, data])
    
    return data

In [31]:
data = multiheight(full_data, 60, 1)
data = classify_data(data, height=60, profile_threshold=1.5)

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train, y_train = upsample_SMOTE(X_train, y_train, 1.0)

In [32]:
classifiers = [
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    VotingClassifier(estimators=[('gnb1', GaussianNB()),
                                  ('gnb2', GaussianNB()),
                                  ('gnb3', GaussianNB()),
                                  ('gnb4', GaussianNB()),
                                  ('gnb5', GaussianNB()),
                                  ('qda1', QuadraticDiscriminantAnalysis(reg_param=0.1)),
                                 ], voting='hard')
    ]

In [33]:
error_classifiers = []
times = []
for clf in classifiers:
    st = time()
    clf.fit(X_train, y_train)
    error_classifiers.append(np.round(clf.score(X_test, y_test), 3))
    times.append(np.round(time() - st, 3))
print('Error:', error_classifiers)
print('Time:', times)

Error: [0.885, 0.878, 0.87, 0.881, 0.893, 0.881]
Time: [0.941, 1.082, 12.205, 0.057, 0.123, 0.672]


In [34]:
best_clf = classifiers[np.argmax(error_classifiers)]
print(best_clf)
print(best_clf.score(X_test, y_test))

QuadraticDiscriminantAnalysis()
0.8929049851688834


In [35]:
def model_every_height(full_data, parameters):
    # update function model_every_height by changing what data it uses
    # this update uses data=multiheight(full_data, height, extra)
    # when extra=0 it behaves just like the original model_every_height
    
    # parameters
    profile_threshold = parameters['profile_threshold']
    max_height = parameters['max_height']
    min_height = parameters['min_height']
    step = parameters['step']
    extra = parameters['extra']
    classifier = parameters['classifier']

    predictions = dict.fromkeys(range(min_height, max_height, step))
    # scores = dict.fromkeys(range(min_height, max_height, step))

    scores = dict.fromkeys(['Total', 'Rain1', 'Rain0'], np.array([]))
    classifiers = dict.fromkeys(range(min_height, max_height, step))
   
    # height loop
    for height in range(min_height, max_height, step):
        classifier = parameters['classifier']
        try:
            # get data
            data = multiheight(full_data, height, extra)
            data = classify_data(data, height, profile_threshold)

            X = data.iloc[:, :-1]
            y = data.iloc[:, -1]
            
            X_train, X_test, y_train, y_test = train_test_split(X, y)
            
            # upsample
            X_train, y_train = upsample_SMOTE(X_train, y_train, 0.5)
            
            # Train
            classifier.fit(X_train, y_train)

            # Test
            total = classifier.predict(X_test)
            rain1 = classifier.predict(X_test[y_test == 1])
            rain0 = classifier.predict(X_test[y_test == 0])
            
            predictions[height] = {'Total': total, 'Rain1': rain1, 'Rain0': rain0}
            
            total = (total == y_test).mean()
            rain1 = (rain1 == 1).mean()
            rain0 = (rain0 == 0).mean()
            
            # scores[height] = {'Total': total, 'Rain1': rain1, 'Rain0': rain0}
            
            scores['Total'] = np.append(scores['Total'], total)
            scores['Rain1'] = np.append(scores['Rain1'], rain1)
            scores['Rain0'] = np.append(scores['Rain0'], rain0)
            
            classifiers[height] = copy.deepcopy(classifier)

        except Exception as e:
            predictions[height] = {'Total': -1, 'Rain1': -1, 'Rain0': -1}
            # scores[height] = {'Total': 0, 'Rain1': 0, 'Rain0': 0}
            
            scores['Total'] = np.append(scores['Total'], 0)
            scores['Rain1'] = np.append(scores['Rain1'], 0)
            scores['Rain0'] = np.append(scores['Rain0'], 0)
            
            classifiers[height] = None
            
    scores = pd.DataFrame(scores)
    scores.index = range(min_height, max_height, step)
    
    return scores, classifiers, predictions

In [36]:
# by setting min height to 60 and max height to 140, precision goes to 80%
# at very low heights (or very high) there probably are very few rain clouds,
# which means that upsampling does not have much to start with, so it doesn't do very well

parameters = {
    'profile_threshold': 2,
    'max_height': 100, # 142
    'min_height': 80, # 60
    'step': 2,
    'extra': 2,
    'classifier': best_clf
}

In [41]:
scores, classifiers, predictions = model_every_height(full_data, parameters)

In [42]:
scores.Total.mean()

0.895896545290141

In [43]:
scores.Rain1.mean()

0.8831963729291248

In [44]:
scores.Rain0.mean()

0.8963052702958816

In [45]:
def precision_model_per_layer(full_data, precision_parameters):
    # update function precision_model_every_height to include the new feature in model_every_height
    # when extra=0 it behaves just like the original precision_model_every_height
    
    min_profile_threshold = precision_parameters['min_profile_threshold']
    max_profile_threshold = precision_parameters['max_profile_threshold']
    precision = precision_parameters['precision']
    
    classifier = precision_parameters['classifier']
    
    # list(np.linspace(min_profile_threshold, max_profile_threshold, precision))
    results = dict.fromkeys(np.linspace(min_profile_threshold, max_profile_threshold, precision))
    
    for profile_threshold in np.linspace(min_profile_threshold, max_profile_threshold, precision):
        
        parameters = {
            'profile_threshold': profile_threshold,
            'max_height': precision_parameters['max_height'],
            'min_height': precision_parameters['min_height'],
            'step': precision_parameters['step'],
            'extra': precision_parameters['extra'],
            'classifier': classifier
        }
        
        scores, classifiers, predictions = model_every_height(full_data, parameters)
        
        results[profile_threshold] = {'Score': scores, 'Classifiers': classifiers, 'Predictions': predictions}

    return results

In [46]:
precision_parameters = {
    'min_profile_threshold': 1,
    'max_profile_threshold': 5,
    'precision': 10,
    
    'max_height': 140,
    'min_height': 60,
    'step': 2,
    'extra': 0,
    'classifier': best_clf
}

In [47]:
results = precision_model_per_layer(full_data, precision_parameters)

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.d

In [53]:
results[list(results.keys())[2]]['Score'].Rain1

60     0.0
62     0.0
64     0.0
66     0.0
68     0.0
70     0.0
72     0.0
74     0.0
76     0.0
78     0.0
80     0.0
82     0.0
84     0.0
86     0.0
88     0.0
90     0.0
92     0.0
94     0.0
96     0.0
98     0.0
100    0.0
102    0.0
104    0.0
106    0.0
108    0.0
110    0.0
112    0.0
114    0.0
116    0.0
118    0.0
120    0.0
122    0.0
124    0.0
126    0.0
128    0.0
130    0.0
132    0.0
134    0.0
136    0.0
138    0.0
Name: Rain1, dtype: float64

In [52]:
results[list(results.keys())[4]]['Score'].Total.mean()

0.9896358091137423

In [54]:
precision_parameters = {
    'min_profile_threshold': 1,
    'max_profile_threshold': 5,
    'precision': 10,
    
    'max_height': 140,
    'min_height': 60,
    'step': 2,
    'extra': 1,
    'classifier': best_clf
}

In [55]:
results = precision_model_per_layer(full_data, precision_parameters)

In [56]:
results[list(results.keys())[2]]['Score'].Rain1.mean()

0.8699316661712249

In [57]:
results[list(results.keys())[4]]['Score'].Total.mean()

0.8847400966414696

In [68]:
precision_parameters = {
    'min_profile_threshold': 1,
    'max_profile_threshold': 5,
    'precision': 10,
    
    'max_height': 140,
    'min_height': 60,
    'step': 2,
    'extra': 2,
    'classifier': classifier
}

In [69]:
results = precision_model_per_layer(full_data, precision_parameters)

In [49]:
results[list(results.keys())[2]]['Score']

Unnamed: 0,Total,Rain1,Rain0
60,0.953211,0.0,1.0
62,0.953355,0.0,1.0
64,0.953427,0.0,1.0
66,0.950341,0.0,1.0
68,0.954647,0.0,1.0
70,0.954503,0.0,1.0
72,0.96211,0.0,1.0
74,0.963402,0.0,1.0
76,0.967851,0.0,1.0
78,0.971511,0.0,1.0


In [71]:
results[list(results.keys())[4]]['Score'].Total.mean()

0.8908285849611758

In [72]:
precision_parameters = {
    'min_profile_threshold': 1,
    'max_profile_threshold': 5,
    'precision': 10,
    
    'max_height': 140,
    'min_height': 60,
    'step': 2,
    'extra': 3,
    'classifier': classifier
}

In [73]:
results = precision_model_per_layer(full_data, precision_parameters)

In [74]:
results[list(results.keys())[2]]['Score'].Rain1.mean()

0.8246236773084773

In [75]:
results[list(results.keys())[4]]['Score'].Total.mean()

0.8966921941317587