How To Compare Machine Learning Algorithms in Python with scikit-learn

https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines

https://dziganto.github.io/Sparse-Matrices-For-Efficient-Machine-Learning/

## load dataset

In [1]:
import pandas as pd

colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv('../../data/ementas_pre-processadas.csv', header=0, sep=',', quotechar='"', usecols=colunas)

In [2]:
X_train = df['EMENTA_NORM'].values.astype('U')
Y = df['ROTULO_MANUAL'].values

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

tfidf_transformer = TfidfTransformer(use_idf=False)
X = tfidf_transformer.fit_transform(X_train_counts)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [5]:
# prepare configuration for cross validation test harness
seed = 7

## evaluate each model in turn

In [6]:
from sklearn import model_selection

results = []
names = []
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X.toarray(), Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



LR: 0.673547 (0.068416)




LDA: 0.195267 (0.068142)
KNN: 0.585213 (0.082045)
CART: 0.610348 (0.077508)
NB: 0.496135 (0.068898)




SVM: 0.172840 (0.085355)
