# Constantes y Funciones Auxiliares

In [1]:
# Semilla
SEED = 333

# Preparamos el lematizado
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Lematizar un string
import re
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

## Columnas de guardado para los algortimos 
COLUMNS = ['mean_fit_time','std_fit_time','mean_test_neg_log_loss','std_test_neg_log_loss','rank_test_neg_log_loss',
           'mean_test_accuracy','rank_test_accuracy',
           'mean_test_f1_macro','rank_test_f1_macro',
           'mean_test_roc_auc_ovr','rank_test_roc_auc_ovr']

# Funcion de guardado de resultados que es un subconjunto de cv_results. 
# Guarda los resultados de los parametros del algoritmo y las metricas que le pasamos como parametro.
def save_results(rs,params_to_evaluate,columns=COLUMNS):
    aux = pd.DataFrame(rs.cv_results_)
    gs_res = pd.DataFrame()
    for col in params_to_evaluate:
        gs_res[col] = aux[col]
    for col in columns:
        gs_res[col] = aux[col]
    return gs_res

# Carga

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

train_variants_df = pd.read_csv(r"C:\Users\Junio\Libretas\data-c\training_variants", engine='python')
train_txt_df = pd.read_csv(r"C:\Users\Junio\Libretas\data-c/training_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
train_txt_df['Class'] = train_variants_df['Class']
train_txt_df.sample(10,random_state=SEED)

Unnamed: 0,ID,Text,Class
3198,3198,Inactivation of Ras GTPase activating proteins...,1
2161,2161,The PTEN (phosphatase and tensin homolog) phos...,4
1083,1083,"EZH2, the catalytic subunit of the PRC2 comple...",9
113,113,Endometrial cancer is the most common gynecolo...,4
885,885,Purpose: Platelet-derived growth factor recept...,7
3210,3210,Hereditary predisposition to retinoblastoma is...,1
153,153,Many studies have reported the EGFR mutations ...,2
2312,2312,A systematic characterization of the genetic a...,7
1107,1107,"Multiple endocrine neoplasia type 1 (MEN1, OM...",4
1648,1648,"In acute myeloid leukemia (AML), two clusters ...",7


# Preparacion del Dataframe

In [3]:
# Inicializamos el dataframe que vamos a utilizar
W = pd.DataFrame()

# Añadimos una columna que nos indica el tamaño del texto de cada instancia
W['Text_count']  = train_txt_df["Text"].apply(lambda x: len(str(x).split()))

# Copiamos la clase y el texto
W['Class'] = train_txt_df['Class'].copy()
W['Text'] = train_txt_df["Text"].copy()

# Nos quedamos con las instancias que no tengan el texto nulo
W = W[W['Text_count']!=1]

# Mostramos el dataframe
W.sample(10,random_state=SEED)

Unnamed: 0,Text_count,Class,Text
3198,3343,1,Inactivation of Ras GTPase activating proteins...
2084,3005,7,MYD88 L265P is a somatic mutation that has bee...
3185,11198,6,We describe the case of a patient presenting w...
113,19533,4,Endometrial cancer is the most common gynecolo...
3133,15135,7,Transforming mutations in NRAS and KRAS are th...
1110,14851,1,Abstract Fanconi anemia is characterized by c...
153,3837,2,Many studies have reported the EGFR mutations ...
1242,65740,7,The development of array comparative genomic h...
2768,6895,7,Over 30 mutations of the B-RAF gene associated...
1045,10154,1,Tuberous sclerosis (TSC) is an autosomal domin...


# Preparando la Clasificacion 

In [4]:
# Separacion training/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(W['Text'], W['Class'], test_size=0.2, random_state=SEED)

In [5]:
# Preparando stop_words
from wordcloud import STOPWORDS

# Capturando las palabras que no se encuentran en STOPWORDS por ser una contraccion de la palabra original (salida de stemming_tokenizer)
contract_words = {'abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 
              'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 'dure', 'els', 
              'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'formerli', 
              'forti', 'ha', 'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'latterli', 
              'mani', 'meanwhil', 'moreov', 'mostli', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 
              'otherwis', 'ourselv', 'perhap', 'pleas', 'seriou', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 
              'someth', 'sometim', 'somewher', 'themselv', 'thenc', 
              'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 
              'twenti', 'veri', 'wa', 'whatev', 'whenc', 'whenev', 'wherea', 
              'whereaft', 'wherebi', 'wherev', 'whi', 'yourselv'}

l2 = {'anywh', 'aren', 'becau', 'couldn', 'd', 'didn', 'doe', 
      'doesn', 'don', 'el', 'elsewh', 'everywh', 'hadn', 'hasn', 
      'haven', 'ind', 'isn', 'let', 'll', 'm', 'mustn', 
      'otherwi', 'plea', 're', 's', 'shan', 'shouldn', 
      'somewh', 't', 've', 'wasn', 'weren', 'won', 'wouldn'}

custom_words = {"fig", "figure", "et", "al", "al.", "also",
                "data", "analyze", "study", "table", "using",
                "method", "result", "conclusion", "author", 
                "find", "found", "show", "casita","non","name","image",
                'analyz', 'conclus', 'figur','conclu', 'imag', 'studi', 'tabl', 'use'}
# Unimos ambas listas
stop_words = STOPWORDS.union(contract_words).union(l2).union(custom_words)

from sklearn.pipeline import Pipeline
#from sklearn.feature_selection import RFECV
#from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
def create_pipeline(clf):
    return Pipeline([('tfidf', TfidfVectorizer(analyzer="word", tokenizer=stemming_tokenizer,stop_words= stop_words,max_features=1000)),
                     ('sel',SelectKBest(chi2, k=100)),
                     ('clf', clf)])


# Validacion Cruzada Stratificada(n_splits=3):
from sklearn.model_selection import StratifiedKFold
CV = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)

from sklearn.model_selection import RandomizedSearchCV
def create_rscv(pipeline,params,n_iterations = 10,scoring = ["neg_log_loss","accuracy","f1_macro","roc_auc_ovr"],cv = CV):
    return RandomizedSearchCV(
            pipeline,
            params,
            n_iter = n_iterations,
            verbose = 1,
            random_state = SEED,
            cv = cv,
            n_jobs = -1,
            scoring = scoring,
            refit = "neg_log_loss" 
            )
# Importamos la metrica principal de evaluacion
from sklearn import metrics

# Dataframe de guardado del test
df_results = pd.DataFrame(columns = ["clf","log_loss","accuracy","f1-macro","ROC"])

# Funcion de guardado del resultado del test
def add_res(clf,name,X_test = X_test):   
    # Guardamos las predicciones
    y_predict_proba = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    
    # Guardamos los resultados de las distintas metricas
    log_loss = metrics.log_loss(y_test,y_predict_proba)
    acc = metrics.accuracy_score(y_test,y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test,y_predict_proba,multi_class='ovr')
    
    # Actualizamos el dataframe
    df_results.loc[len(df_results)]=[name,log_loss,acc,f1,roc]

* https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214

* https://towardsdatascience.com/the-triune-pipeline-for-three-major-transformers-in-nlp-18c14e20530

* https://towardsdatascience.com/text-analysis-feature-engineering-with-nlp-502d6ea9225d

* https://towardsdatascience.com/feature-selection-on-text-classification-1b86879f548e

## Clasificacion

## Naive Bayes Multinomial

In [6]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
parameters = {
    'tfidf__ngram_range': ((1, 1),(2, 2),(1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')
 }
pipeline = create_pipeline(clf)
rs_NB_M = create_rscv(pipeline,parameters)

In [7]:
rs_NB_M.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 222.3min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('tfidf',
                                              TfidfVectorizer(max_features=1000,
                                                              stop_words={'a',
                                                                          'about',
                                                                          'abov',
                                                                          'above',
                                                                          'after',
                                                                          'afterward',
                                                                          'again',
                                                                          'against',
                                                                          'al',
                                    

In [9]:
params_to_evaluate = ["param_tfidf__ngram_range","param_tfidf__use_idf",
                      "param_tfidf__norm"]
NB_M_res = save_results(rs_NB_M,params_to_evaluate)
NB_M_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_tfidf__ngram_range,param_tfidf__use_idf,param_tfidf__norm,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
5,"(1, 1)",True,l2,832.66977,7.31561,-1.352277,0.018803,1,0.496983,1,0.27934,1,0.820865,2
8,"(1, 2)",True,l2,867.496801,5.077653,-1.360769,0.018256,2,0.486425,2,0.273254,2,0.823043,1
9,"(2, 2)",True,l2,712.081686,95.545264,-1.440547,0.023772,3,0.460407,4,0.249352,4,0.800748,3
3,"(2, 2)",False,l2,861.374049,11.606753,-1.44823,0.024598,4,0.465686,3,0.249962,3,0.767584,6
6,"(2, 2)",True,l1,875.210688,14.031734,-1.777598,0.006466,5,0.305807,5,0.072405,5,0.768752,5


In [10]:
# Exportamos los resultados
NB_M_res.to_csv('NB_M.csv')

# Testing
add_res(rs_NB_M,'NB_M')

## Complement Naive Bayes

In [11]:
from sklearn.naive_bayes import ComplementNB
clf2 = ComplementNB()
parameters = {
    'tfidf__ngram_range': ((1, 1),(2, 2),(1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')
 }
pipeline2 = create_pipeline(clf2)
rs_NB_C = create_rscv(pipeline2,parameters)

In [12]:
rs_NB_C.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 219.8min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('tfidf',
                                              TfidfVectorizer(max_features=1000,
                                                              stop_words={'a',
                                                                          'about',
                                                                          'abov',
                                                                          'above',
                                                                          'after',
                                                                          'afterward',
                                                                          'again',
                                                                          'against',
                                                                          'al',
                                    

In [14]:
params_to_evaluate = ["param_tfidf__ngram_range","param_tfidf__use_idf",
                      "param_tfidf__norm"]
NB_C_res = save_results(rs_NB_C,params_to_evaluate)
NB_C_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_tfidf__ngram_range,param_tfidf__use_idf,param_tfidf__norm,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
8,"(1, 2)",True,l2,872.700056,5.901097,-1.929047,0.00205,1,0.487934,5,0.346247,3,0.81142,4
5,"(1, 1)",True,l2,838.188001,10.77816,-1.936767,0.001177,2,0.496229,3,0.353996,1,0.817023,3
9,"(2, 2)",True,l2,698.338261,110.145135,-1.952468,0.002028,3,0.458899,9,0.317487,9,0.790708,7
3,"(2, 2)",False,l2,866.117339,6.514185,-1.961592,0.002667,4,0.476621,7,0.310295,10,0.798124,6
6,"(2, 2)",True,l1,866.282724,4.560122,-2.163175,0.001538,5,0.444947,10,0.325764,7,0.802587,5


In [15]:
# Exportamos los resultados
NB_C_res.to_csv('NB_C.csv')

# Testing
add_res(rs_NB_C,'NB_C')

## KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
parameters = {
    'tfidf__ngram_range': ((1, 1),(2, 2),(1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_neighbors': tuple(range(1,60,2))
 }
pipeline5 = create_pipeline(knn)
rs_KNN = create_rscv(pipeline5,parameters)

In [17]:
rs_KNN.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 217.0min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('tfidf',
                                              TfidfVectorizer(max_features=1000,
                                                              stop_words={'a',
                                                                          'about',
                                                                          'abov',
                                                                          'above',
                                                                          'after',
                                                                          'afterward',
                                                                          'again',
                                                                          'against',
                                                                          'al',
                                    

In [18]:
params_to_evaluate = ["param_tfidf__ngram_range","param_tfidf__use_idf",
                      "param_tfidf__norm","param_clf__n_neighbors"]
KNN_res = save_results(rs_KNN,params_to_evaluate)
KNN_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_tfidf__ngram_range,param_tfidf__use_idf,param_tfidf__norm,param_clf__n_neighbors,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
9,"(1, 1)",True,l1,57,677.003482,114.817929,-1.795803,0.088022,1,0.49736,6,0.303354,7,0.806368,5
3,"(1, 1)",True,l1,47,841.448306,11.810896,-1.837525,0.090814,2,0.500754,5,0.304371,6,0.810575,4
7,"(2, 2)",False,l1,53,855.881713,9.259894,-1.940506,0.04716,3,0.471719,10,0.249374,10,0.768794,9
2,"(1, 2)",True,l2,43,858.314894,8.914673,-2.002848,0.167654,4,0.491704,8,0.295591,8,0.81681,3
1,"(2, 2)",False,l1,39,844.940996,3.430423,-2.100618,0.063853,5,0.473228,9,0.262192,9,0.774652,8


In [19]:
# Exportamos los resultados
KNN_res.to_csv('KNN.csv')

# Testing
add_res(rs_KNN,'KNN')

## Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
clf3 = RandomForestClassifier(random_state=SEED)
parameters = {
    'tfidf__ngram_range': ((1, 1),(2, 2),(1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__criterion':('giny','entropy')
 }
pipeline3 = create_pipeline(clf3)
rs_RF = create_rscv(pipeline3,parameters)

In [21]:
rs_RF.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 152.1min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('tfidf',
                                              TfidfVectorizer(max_features=1000,
                                                              stop_words={'a',
                                                                          'about',
                                                                          'abov',
                                                                          'above',
                                                                          'after',
                                                                          'afterward',
                                                                          'again',
                                                                          'against',
                                                                          'al',
                                    

In [22]:
params_to_evaluate = ["param_tfidf__ngram_range","param_tfidf__use_idf",
                      "param_tfidf__norm","param_clf__criterion"]
RF_res = save_results(rs_RF,params_to_evaluate)
RF_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_tfidf__ngram_range,param_tfidf__use_idf,param_tfidf__norm,param_clf__criterion,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
8,"(1, 1)",False,l2,entropy,846.342358,6.367995,-1.716357,0.109455,1,0.636878,1,0.509188,2,0.860015,4
3,"(1, 2)",False,l1,entropy,875.439546,7.584559,-1.733858,0.079476,2,0.630468,3,0.500353,3,0.871041,1
0,"(1, 2)",True,l1,entropy,864.009314,6.41285,-1.79154,0.091041,3,0.632353,2,0.509479,1,0.868033,2
5,"(2, 2)",False,l1,entropy,871.857783,6.314054,-1.99469,0.064708,4,0.625566,4,0.497413,4,0.86195,3
1,"(1, 1)",False,l1,giny,825.540492,6.7193,,,5,,5,,5,,5


In [23]:
# Exportamos los resultados
RF_res.to_csv('RF.csv')

# Testing
add_res(rs_RF,'RF')

## SVC

In [24]:
from sklearn.svm import SVC
clf4 = SVC(probability=True)
parameters = {
    'tfidf__ngram_range': ((1, 1),(2, 2),(1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__kernel':('linear','rbf','sigmoid','poly')
 }
pipeline4 = create_pipeline(clf4)
rs_SVC = create_rscv(pipeline4,parameters)

In [25]:
rs_SVC.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 218.1min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('tfidf',
                                              TfidfVectorizer(max_features=1000,
                                                              stop_words={'a',
                                                                          'about',
                                                                          'abov',
                                                                          'above',
                                                                          'after',
                                                                          'afterward',
                                                                          'again',
                                                                          'against',
                                                                          'al',
                                    

In [26]:
params_to_evaluate = ["param_tfidf__use_idf","param_tfidf__ngram_range","param_tfidf__norm","param_clf__kernel"]
SVC_res = save_results(rs_SVC,params_to_evaluate)
SVC_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_tfidf__use_idf,param_tfidf__ngram_range,param_tfidf__norm,param_clf__kernel,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
7,True,"(1, 1)",l2,rbf,845.883572,3.057758,-1.198334,0.019055,1,0.556184,2,0.41302,2,0.844356,2
8,False,"(1, 2)",l2,poly,876.303451,7.451146,-1.199707,0.016758,2,0.571644,1,0.450205,1,0.849967,1
5,True,"(1, 2)",l2,rbf,869.909851,10.996822,-1.209642,0.011874,3,0.554676,3,0.408786,3,0.841868,3
4,False,"(1, 2)",l2,linear,871.635556,5.475896,-1.232457,0.03732,4,0.532051,4,0.328426,5,0.836723,4
0,True,"(2, 2)",l2,rbf,861.795021,2.723015,-1.312631,0.010962,5,0.5181,5,0.357043,4,0.792663,8


In [27]:
# Exportamos los resultados
SVC_res.to_csv('SVC.csv')

# Testing
add_res(rs_SVC,'SVC')

In [28]:
# Exportamos resultados totales
df_results.to_csv('df_rs.csv')
df_results

Unnamed: 0,clf,log_loss,accuracy,f1-macro,ROC
0,NB_M,1.330832,0.5,0.262989,0.82732
1,NB_C,1.916608,0.495482,0.338819,0.802271
2,KNN,1.829654,0.490964,0.291027,0.817899
3,RF,1.622715,0.623494,0.525726,0.848051
4,SVC,1.172317,0.560241,0.421162,0.845402
