In [7]:
import numpy as np
import gzip, pickle
import pandas as pd

# Read just the 'target' column from the gzip-compressed CSV; nrows=None keeps every row.
patients = pd.read_csv(
    'internacoes_charlson_zero.csv.gz',
    compression='gzip',
    nrows=None,
    usecols=['target'],
)
# Underlying array of the 'target' column, reused by the later cells.
target = patients['target'].values
patients.shape

(48907, 1)

In [8]:
# Unpickle the feature matrix stored in the gzip-compressed file.
# Despite the ".npy" in the file name, the payload is read with pickle, not np.load.
# NOTE(security): pickle.load can execute arbitrary code — only open files from a trusted source.
with gzip.open("data_5k_unigram.npy.gz", "rb") as features_fp:
    # The context manager closes the handle on exit, so no explicit close() is needed.
    data = pickle.load(features_fp)
data.shape

(48907, 5000)

In [9]:
# Number of leading feature columns kept from the loaded matrix.
N_FEATURES = 5000

target1 = np.asarray(target)
# Keep one feature row per target and densify.
# .toarray() returns a plain ndarray, whereas .todense() returns np.matrix,
# which current scikit-learn releases refuse as estimator input.
data1 = data[:len(target1), :N_FEATURES].toarray()
# The row slice above only truncates; fail loudly if the matrix has fewer rows than targets.
assert data1.shape[0] == target1.shape[0], "feature rows and targets are misaligned"
data1.shape, target1.shape

((48907, 5000), (48907,))

In [10]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import sys
import time
import warnings
# Suppress every warning category from this point on in the kernel session.
# NOTE(review): this blanket filter would also hide any warnings the estimators
# raise in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [11]:
from collections import OrderedDict
# Fixed seed so the stochastic estimators give repeatable scores between runs.
MODEL_SEED = 42
# Degree of parallelism for the estimators that accept n_jobs.
N_JOBS = 5

# Candidate models keyed by display name; they are benchmarked in insertion order.
# NOTE(review): LogisticRegression is a classifier, so it predicts one of the target
# values seen in training rather than a continuous estimate — confirm this is intended.
regressors = OrderedDict({
    'KNeighborsRegressor': KNeighborsRegressor(n_jobs=N_JOBS),
    'LogisticRegression': LogisticRegression(n_jobs=N_JOBS),
    'LinearSVR': LinearSVR(random_state=MODEL_SEED),
    # Currently disabled; uncomment to include it in the benchmark.
    #'GradientBoostingRegressor': GradientBoostingRegressor(random_state=MODEL_SEED),
    'RandomForestRegressor': RandomForestRegressor(n_jobs=N_JOBS, random_state=MODEL_SEED),
})

In [12]:
# Benchmark every estimator in `regressors` on a stratified subsample of the data,
# reporting the mean absolute error and the wall-clock time per evaluated fold.

# Seed for both shufflers: without it every run (and every model within a run)
# would be scored on different folds, making the numbers incomparable.
CV_SEED = 42
# Only the first MAX_FOLDS of the six inner folds are evaluated to bound the runtime.
MAX_FOLDS = 3

kfold1 = StratifiedKFold(n_splits=2, shuffle=True, random_state=CV_SEED)
kfold2 = StratifiedKFold(n_splits=6, shuffle=True, random_state=CV_SEED)

# Maps model name -> "mean MAE, mean seconds" string.
model_result = {}

# The test half of the first 2-way stratified split is the working subsample.
train, test = next(kfold1.split(data1, target1))
data2 = data1[test]
target2 = target1[test]

print('Shape:', data2.shape, flush=True)

for m_name, model in regressors.items():

    # Per-model accumulators, reset for every estimator.
    values = []
    prediction = []
    models = []
    maes_sk = []
    times = []

    print('')
    print(m_name, flush=True)

    # The seeded splitter yields the same folds on every call, so all models share them.
    for i2, (train2, test2) in enumerate(kfold2.split(data2, target2)):

        if i2 >= MAX_FOLDS:
            break

        start = time.perf_counter()

        model.fit(data2[train2], target2[train2])
        target_pred = model.predict(data2[test2])
        mae = mean_absolute_error(target2[test2], target_pred)

        # Measure once so the stored and the printed duration are the same number.
        elapsed = time.perf_counter() - start

        values.extend(target2[test2])
        prediction.extend(target_pred)
        # NOTE: the same estimator instance is refit on every fold, so `models` holds
        # repeated references to one object (the last fit). Fit a sklearn.base.clone per
        # fold if independent fitted models are needed — at the cost of keeping each
        # fold's fitted model in memory.
        models.append(model)
        maes_sk.append(mae)
        times.append(elapsed)

        print(i2, 'mea: ', round(mae, 4), round(elapsed, 2), flush=True)

    mean_mae = round(np.mean(maes_sk), 4)
    mean_time = round(np.mean(times), 4)
    print('MEA Mean: ', mean_mae, mean_time, flush=True)
    model_result[m_name] = str(mean_mae) + ', ' + str(mean_time)

print('')
print('Shape:', data2.shape)
print(model_result)

Shape: (24458, 5000)

KNeighborsRegressor
0 mea:  1.8941 232.53
1 mea:  1.859 232.42
2 mea:  1.8483 231.7
MEA Mean:  1.8671 232.2175

LogisticRegression
0 mea:  2.7573 10.91
1 mea:  2.7547 10.88
2 mea:  2.7414 10.31
MEA Mean:  2.7511 10.6976

LinearSVR
0 mea:  1.8832 0.88
1 mea:  1.8848 0.88
2 mea:  1.8725 0.88
MEA Mean:  1.8802 0.8823

RandomForestRegressor
0 mea:  1.4057 153.09
1 mea:  1.3811 168.42
2 mea:  1.3932 139.38
MEA Mean:  1.3933 153.6297

Shape: (24458, 5000)
{'KNeighborsRegressor': '1.8671, 232.2175', 'LogisticRegression': '2.7511, 10.6976', 'LinearSVR': '1.8802, 0.8823', 'RandomForestRegressor': '1.3933, 153.6297'}


### Shape: (24457, UniGram)

Each entry is `mean MAE, mean seconds per fold` over the evaluated folds.

- 'RandomForestRegressor': '1.3853, 138.317'
- 'GradientBoostingRegressor': '1.3947, 529.5245'
- 'LogisticRegression': '2.7489, 14.9533'
- 'LinearSVR': '1.9028, 0.9797'
- 'KNeighborsRegressor': '1.8279, 245.6863'

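### Shape: (24457, MultiGram)

Each entry is the mean MAE followed by the mean seconds per fold; blank entries have no recorded result.
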
- 'RandomForestRegressor':  1.4462 130.0258
- 'GradientBoostingRegressor': 
- 'LogisticRegression': 2.8139 11.429
- 'LinearSVR': 2.1496 0.8715
- 'KNeighborsRegressor': 