In [1]:
import numpy as np
import gzip, pickle
import pandas as pd

# Read just the 'target' column of the gzip-compressed CSV; the remaining
# columns are not needed in this notebook section.
patients = pd.read_csv(
    'internacoes_charlson_zero.csv.gz',
    compression='gzip',
    nrows=None,            # None -> read every row
    usecols=['target'],
)

# Plain NumPy view of the target column, used later for splitting and scoring.
target = patients['target'].values

# Displayed as the cell output: (rows, columns) of the loaded frame.
patients.shape

(48907, 1)

In [2]:
# Disabled cell: when enabled it unpickles the object stored in
# "data_10k_multigram.npy.gz" into `data` and displays its shape.
# NOTE(review): the explicit close() is redundant inside a `with` block, and
# the trailing "#Pickling" remark actually describes an unpickling step.
#with gzip.open("data_10k_multigram.npy.gz", "rb") as wfp:   #Pickling
#    data = pickle.load(wfp)
#    wfp.close()
#data.shape

In [3]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Target values as an ndarray (np.asarray does not copy an existing ndarray).
target1 = np.asarray(target)

# LDA configured for 100 components. The estimator is only constructed here;
# the fit_transform call that would use it is commented out below.
lda = LatentDirichletAllocation(
    n_components=100,
    max_iter=10,
    learning_method='batch',
    learning_offset=50.0,
    verbose=1,
    n_jobs=-1,
)
#data1 = lda.fit_transform(data)
#data1.shape, target1.shape

In [4]:
# Unpickle the feature matrix used by the benchmark below from a
# gzip-compressed file (presumably the cached output of the commented-out
# `lda.fit_transform` call — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files produced by a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with gzip.open("data_100_lda.npy.gz", "rb") as lda_file:
    data1 = pickle.load(lda_file)

In [5]:
import copy
import sys
import time
import warnings

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings the estimators
# emit while fitting; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [6]:
from collections import OrderedDict
# Estimators to benchmark, keyed by display name, in evaluation order.
# Built from (name, estimator) pairs instead of a dict literal so that the
# ordering is guaranteed by OrderedDict itself on every Python version
# (a plain dict literal only preserves insertion order on Python >= 3.7).
# NOTE(review): LogisticRegression is a classifier; its predicted class labels
# are scored with MAE alongside the regressors — confirm this is intended.
regressors = OrderedDict([
    ('KNeighborsRegressor', KNeighborsRegressor(n_jobs=5)),
    ('LogisticRegression', LogisticRegression(n_jobs=5)),
    ('LinearSVR', LinearSVR()),
    #('GradientBoostingRegressor', GradientBoostingRegressor()),
    ('RandomForestRegressor', RandomForestRegressor(n_jobs=5)),
])

In [7]:
# Benchmark every estimator in `regressors` with stratified K-fold splits of
# (data1, target1) and record, per estimator, the mean MAE and the mean
# wall-clock time per evaluated fold in `model_result`.

SEED = 42          # fixed seed: shuffled folds are reproducible and identical for every estimator
N_SPLITS = 6       # number of stratified folds the data is cut into
N_EVAL_FOLDS = 3   # only the first folds are evaluated (presumably to bound run time — TODO confirm)

#kfold1 = StratifiedKFold(n_splits=2, shuffle=True)
# With an integer random_state every call to kfold2.split() yields the same
# folds, so all estimators are compared on the same train/test partitions.
kfold2 = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

model_result = {}

#for i1, (train, test) in enumerate(kfold1.split(data1, target1)):

    #if i1 > 0: break

# The outer (disabled) split would have sub-sampled the data; with it turned
# off the whole data set is used.
data2 = data1
target2 = target1

print ('Shape:' , data2.shape)
sys.stdout.flush()

for m_name, model in regressors.items():

    values = []       # true targets of every evaluated test fold, concatenated
    prediction = []   # matching predictions, same order as `values`
    models = []       # one fitted estimator snapshot per evaluated fold
    maes_sk = []      # MAE per evaluated fold
    times = []        # seconds per evaluated fold (fit + predict + scoring)

    print('')
    print(m_name)
    sys.stdout.flush()

    for i2, (train2, test2) in enumerate(kfold2.split(data2, target2)):

        if i2 >= N_EVAL_FOLDS: break

        start = time.time()

        model.fit(data2[train2], target2[train2])
        target_pred = model.predict(data2[test2])
        mae = mean_absolute_error(target2[test2], target_pred)

        values.extend(target2[test2])
        prediction.extend(target_pred)
        maes_sk.append(mae)

        # Measure once so the stored and the printed durations agree.
        elapsed = time.time() - start
        times.append(elapsed)

        # `model` is refit in place on the next fold, so appending it directly
        # would leave `models` holding several references to one object.
        # Store an independent snapshot instead (outside the timed section).
        models.append(copy.deepcopy(model))

        print (i2, 'mea: ', round(mae,4), round(elapsed,2))
        sys.stdout.flush()

    print('MEA Mean: ', round(np.mean(maes_sk),4), round(np.mean(times),4))
    # Stored as the string "<mean MAE>, <mean seconds per fold>".
    model_result[m_name] = str(round(np.mean(maes_sk),4)) + ', ' + str(round(np.mean(times),4))
    sys.stdout.flush()

print('')
print ('Shape:' , data2.shape)
print(model_result)

Shape: (48907, 100)

KNeighborsRegressor
0 mea:  2.6629 21.71
1 mea:  2.6615 20.31
2 mea:  2.6598 21.85
MEA Mean:  2.6614 21.291

LogisticRegression
0 mea:  2.8116 4.06
1 mea:  2.8101 3.96
2 mea:  2.8058 3.94
MEA Mean:  2.8092 3.9866

LinearSVR
0 mea:  2.4066 0.21
1 mea:  2.4013 0.15
2 mea:  2.3976 0.13
MEA Mean:  2.4018 0.1635

RandomForestRegressor
0 mea:  2.4258 0.33
1 mea:  2.4257 0.35
2 mea:  2.4223 0.36
MEA Mean:  2.4246 0.347

Shape: (48907, 100)
{'KNeighborsRegressor': '2.6614, 21.291', 'LogisticRegression': '2.8092, 3.9866', 'LinearSVR': '2.4018, 0.1635', 'RandomForestRegressor': '2.4246, 0.347'}


### Shape: (24457, UniGram)
Each entry below is `'mean MAE, mean time per fold in seconds'`.
- 'RandomForestRegressor': '1.3853, 138.317'
- 'GradientBoostingRegressor': '1.3947, 529.5245'
- 'LogisticRegression': '2.7489, 14.9533'
- 'LinearSVR': '1.9028, 0.9797'
- 'KNeighborsRegressor': '1.8279, 245.6863'

### Shape: (24457, MultiGram)
- 'RandomForestRegressor': '1.4462, 130.0258'
- 'GradientBoostingRegressor': (no result recorded)
- 'LogisticRegression': '2.8139, 11.429'
- 'LinearSVR': '2.1496, 0.8715'
- 'KNeighborsRegressor': (no result recorded)