In [8]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import mean_absolute_error
from nltk.corpus import stopwords
import nltk.stem

In [9]:
corpus = pd.read_csv('training_data.csv')
stopwords = stopwords.words("portuguese")

In [10]:
corpus.groupby('Target').count()

Unnamed: 0_level_0,Unnamed: 0,Evolucao,Condições do paciente antes da queda,Dano não classificado adequadamente,Dúvida para Janete,Fator de Risco - Comportamental,Fator de Risco - Extrínseco,Fator de Risco - Intrínseco,Foi necessário algum procedimento. Qual?,Grau do Dano - Grave,...,Tipo de Queda I - Acidental,Tipo de Queda I - Fisiológica Antecipada,Tipo de Queda I - Não Antecipada,Tipo de Queda II - Ao Solo,Tipo de Queda II - Nível inferior,Tipo de Queda III - Outro,Tipo de Queda III - Própria Altura,Tipo do Dano - Físico,Tipo do Dano - Psicológico,Tipo do Dano - Social
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,442,442,442,442,442,442,442,442,442,442,...,442,442,442,442,442,442,442,442,442,442
1,195,195,195,195,195,195,195,195,195,195,...,195,195,195,195,195,195,195,195,195,195


In [11]:
corpus.shape

(637, 34)

In [12]:
target = corpus['Target']
corpus = corpus['Evolucao']
model = RandomForestClassifier(n_jobs=16)

In [13]:
portuguese_stemmer = nltk.stem.RSLPStemmer()
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [14]:
for i in range(100,2000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus)
    
    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1, 4)) 
          + '), acc(' + str(round(acc, 4)) 
          + '), precision(' + str(round(precision, 4)) 
          + '), recall(' + str(round(recall,4)) + ')')

100: f1(0.6511), acc(0.8134), precision(0.7383), recall(0.6368)
200: f1(0.6535), acc(0.8322), precision(0.7732), recall(0.5837)
300: f1(0.6969), acc(0.8541), precision(0.791), recall(0.6426)
400: f1(0.7117), acc(0.8462), precision(0.783), recall(0.6466)
500: f1(0.6554), acc(0.8368), precision(0.8144), recall(0.6626)
600: f1(0.6753), acc(0.8292), precision(0.776), recall(0.6308)
700: f1(0.6792), acc(0.8211), precision(0.8061), recall(0.6318)
800: f1(0.669), acc(0.8401), precision(0.7744), recall(0.6487)
900: f1(0.701), acc(0.8323), precision(0.7734), recall(0.6584)
1000: f1(0.6784), acc(0.8322), precision(0.8147), recall(0.6113)
1100: f1(0.7195), acc(0.8321), precision(0.7495), recall(0.6061)
1200: f1(0.6673), acc(0.829), precision(0.8048), recall(0.6411)
1300: f1(0.6598), acc(0.8321), precision(0.79), recall(0.6318)
1400: f1(0.6982), acc(0.8194), precision(0.762), recall(0.6413)
1500: f1(0.6351), acc(0.835), precision(0.7549), recall(0.5797)
1600: f1(0.7263), acc(0.8243), precision(0.7

In [15]:
vectorizer = TfidfVectorizer(max_features=700, strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus)

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()

print(str(700) + ': ' + 'f1(' + str(round(f1, 4)) 
          + '), acc(' + str(round(acc, 4)) 
          + '), precision(' + str(round(precision, 4)) 
          + '), recall(' + str(round(recall,4)) + ')')

700: f1(0.6826), acc(0.8337), precision(0.7751), recall(0.6511)


In [None]:
feature_names = vectorizer.get_feature_names()

In [None]:
model.fit(data.toarray(),target)

In [18]:
import pandas as pd
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = feature_names,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [19]:
feature_importances[:15]

Unnamed: 0,importance
queda,0.078792
solo,0.069953
plantao,0.03617
cama,0.024594
chao,0.020743
tentar,0.020648
nao,0.018866
coren,0.01883
apresentou,0.017708
hnsc,0.014754
