# Constantes y Funciones Auxiliares

In [1]:
# Semilla
SEED = 333

# Preparamos el lematizado
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Lematizar un string
import re
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

## Columnas de guardado para los algortimos 
COLUMNS = ['mean_fit_time','std_fit_time','mean_test_neg_log_loss','std_test_neg_log_loss','rank_test_neg_log_loss',
           'mean_test_accuracy','rank_test_accuracy',
           'mean_test_f1_macro','rank_test_f1_macro',
           'mean_test_roc_auc_ovr','rank_test_roc_auc_ovr']

# Funcion de guardado de resultados que es un subconjunto de cv_results. 
# Guarda los resultados de los parametros del algoritmo y las metricas que le pasamos como parametro.
def save_results(rs,params_to_evaluate,columns=COLUMNS):
    aux = pd.DataFrame(rs.cv_results_)
    gs_res = pd.DataFrame()
    for col in params_to_evaluate:
        gs_res[col] = aux[col]
    for col in columns:
        gs_res[col] = aux[col]
    return gs_res

# Carga

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Carga de los archivos 
train_variants_df = pd.read_csv(r"C:\Users\Junio\Libretas\data-c\training_variants", index_col='ID',engine='python')
train_txt_df = pd.read_csv(r"C:\Users\Junio\Libretas\data-c/training_text", sep="\|\|", index_col='ID',engine='python', header=None, skiprows=1, names=["ID","Text"])

# Union de ambos archivos en un dataframe
df_all = pd.merge(train_variants_df, train_txt_df, how='left', on='ID')
df_all.head()

Unnamed: 0_level_0,Gene,Variation,Class,Text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


# Preparacion del Dataframe

In [3]:
# Eliminamos Variation pues no aporta mucha informacion (hay casi un valor para cada caso)
df = df_all.drop(["Variation"], axis=1)

# Añadimos una columna que nos indica el tamaño del texto de cada instancia
df['Text_count']  = df_all["Text"].apply(lambda x: len(str(x).split()))

# Nos quedamos con las instancias que no tengan el texto nulo
df = df[df['Text_count']!=1]

# Mostramos el dataframe
df.sample(10,random_state=SEED)

Unnamed: 0_level_0,Gene,Class,Text,Text_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3198,RASA1,1,Inactivation of Ras GTPase activating proteins...,3343
2084,MYD88,7,MYD88 L265P is a somatic mutation that has bee...,3005
3185,RARA,6,We describe the case of a patient presenting w...,11198
113,MSH6,4,Endometrial cancer is the most common gynecolo...,19533
3133,KRAS,7,Transforming mutations in NRAS and KRAS are th...,15135
1110,FANCA,1,Abstract Fanconi anemia is characterized by c...,14851
153,EGFR,2,Many studies have reported the EGFR mutations ...,3837
1242,YAP1,7,The development of array comparative genomic h...,65740
2768,BRAF,7,Over 30 mutations of the B-RAF gene associated...,6895
1045,TSC2,1,Tuberous sclerosis (TSC) is an autosomal domin...,10154


# Preparando la Clasificacion 

In [4]:
# Separacion training/test
from sklearn.model_selection import train_test_split
X = df.drop(["Class","Text_count"], axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=SEED)

In [5]:
# Preparando stop_words
from wordcloud import STOPWORDS

# Capturando las palabras que no se encuentran en STOPWORDS por ser una contraccion de la palabra original (salida de stemming_tokenizer)
contract_words = {'abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 
              'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 'dure', 'els', 
              'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'formerli', 
              'forti', 'ha', 'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'latterli', 
              'mani', 'meanwhil', 'moreov', 'mostli', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 
              'otherwis', 'ourselv', 'perhap', 'pleas', 'seriou', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 
              'someth', 'sometim', 'somewher', 'themselv', 'thenc', 
              'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 
              'twenti', 'veri', 'wa', 'whatev', 'whenc', 'whenev', 'wherea', 
              'whereaft', 'wherebi', 'wherev', 'whi', 'yourselv'}

l2 = {'anywh', 'aren', 'becau', 'couldn', 'd', 'didn', 'doe', 
      'doesn', 'don', 'el', 'elsewh', 'everywh', 'hadn', 'hasn', 
      'haven', 'ind', 'isn', 'let', 'll', 'm', 'mustn', 
      'otherwi', 'plea', 're', 's', 'shan', 'shouldn', 
      'somewh', 't', 've', 'wasn', 'weren', 'won', 'wouldn'}

custom_words = {"fig", "figure", "et", "al", "al.", "also",
                "data", "analyze", "study", "table", "using",
                "method", "result", "conclusion", "author", 
                "find", "found", "show", "casita","non","name","image",
                'analyz', 'conclus', 'figur','conclu', 'imag', 'studi', 'tabl', 'use'}
# Unimos ambas listas
stop_words = STOPWORDS.union(contract_words).union(l2).union(custom_words)

# Creacion del pipeline 
from imblearn.pipeline import make_pipeline as make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
def create_pipeline(estimator,ngram_range=(1,1)):
    # Segun la columna, elegimos un preprocesado u otro
    preprocess = ColumnTransformer(
          # Binarizado para Gene
        [('binarizado_gene', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['Gene']),
         ('tfidf_evidence', TfidfVectorizer(analyzer="word", 
                            tokenizer=stemming_tokenizer,stop_words= stop_words,ngram_range=ngram_range), 'Text')],
        remainder='passthrough')

    return make_pipeline(preprocess,estimator)

# Validacion Cruzada Stratificada(n_splits=3):
from sklearn.model_selection import StratifiedKFold
CV = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)

from sklearn.model_selection import RandomizedSearchCV
def create_rscv(pipeline,params,n_iterations = 10,scoring = ["neg_log_loss","accuracy","f1_macro","roc_auc_ovr"],cv = CV):
    return RandomizedSearchCV(
            pipeline,
            params,
            n_iter = n_iterations,
            verbose = 1,
            random_state = SEED,
            cv = cv,
            n_jobs = -1,
            scoring = scoring,
            refit = "neg_log_loss" 
            )
# Importamos la metrica principal de evaluacion
from sklearn import metrics

# Dataframe de guardado del test
df_results = pd.DataFrame(columns = ["clf","log_loss","accuracy","f1-macro","ROC"])

# Funcion de guardado del resultado del test
def add_res(clf,name,X_test = X_test):   
    # Guardamos las predicciones
    y_predict_proba = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    
    # Guardamos los resultados de las distintas metricas
    log_loss = metrics.log_loss(y_test,y_predict_proba)
    acc = metrics.accuracy_score(y_test,y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test,y_predict_proba,multi_class='ovr')
    
    # Actualizamos el dataframe
    df_results.loc[len(df_results)]=[name,log_loss,acc,f1,roc]

## Clasificacion

## Naive Bayes Multinomial

* https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f
* https://towardsdatascience.com/columntransformer-meets-natural-language-processing-da1f116dd69f
* https://towardsdatascience.com/columntransformer-meets-natural-language-processing-da1f116dd69f

In [8]:
from sklearn.naive_bayes import MultinomialNB
clf2 = MultinomialNB()
parameters = {
    'columntransformer__tfidf_evidence__ngram_range': ((1, 1),(2, 2),(1,2)),
    'columntransformer__tfidf_evidence__use_idf': (True, False),
    'columntransformer__tfidf_evidence__max_features': (1000,2000,3000,4000,5000,6000,7000,8000,9000,None),
    'columntransformer__tfidf_evidence__norm': ('l1', 'l2')
 }
pipeline = create_pipeline(clf2)
rs_NB_M = create_rscv(pipeline,parameters)

In [9]:
rs_NB_M.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 216.8min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('binarizado_gene',
                                                                               OneHotEncoder(dtype='int',
                                                                                             handle_unknown='ignore'),
                                                                               ['Gene']),
                                                                              ('tfidf_evidence',
                                                                               TfidfVectorizer(stop_words={'a',
                                                                                                           'about',
                       

In [10]:
params_to_evaluate = ["param_columntransformer__tfidf_evidence__max_features",
                      "param_columntransformer__tfidf_evidence__ngram_range",
                      "param_columntransformer__tfidf_evidence__use_idf",
                      "param_columntransformer__tfidf_evidence__norm"]
NB_M_res = save_results(rs_NB_M,params_to_evaluate)
NB_M_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_columntransformer__tfidf_evidence__max_features,param_columntransformer__tfidf_evidence__ngram_range,param_columntransformer__tfidf_evidence__use_idf,param_columntransformer__tfidf_evidence__norm,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
0,3000,"(2, 2)",True,l1,855.009356,4.248354,-1.271191,0.018626,1,0.548643,3,0.28723,3,0.840346,2
9,6000,"(2, 2)",True,l1,692.936533,118.172069,-1.283196,0.018733,2,0.54638,4,0.276995,4,0.838525,3
4,5000,"(1, 1)",True,l1,826.079748,7.797303,-1.289574,0.019074,3,0.543363,6,0.274864,5,0.836459,6
5,6000,"(1, 2)",True,l1,876.746928,15.788748,-1.29069,0.01882,4,0.543741,5,0.274754,6,0.836819,4
3,3000,"(1, 2)",False,l1,862.549008,9.225927,-1.29166,0.018897,5,0.541855,7,0.273358,7,0.836763,5


In [11]:
# Exportamos los resultados
NB_M_res.to_csv('NB_M.csv')

# Testing
add_res(rs_NB_M,'NB_M')

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
clf3 = RandomForestClassifier(random_state=SEED)
parameters = {
    'columntransformer__tfidf_evidence__ngram_range': ((1, 1),(2, 2),(1,2)),
    'columntransformer__tfidf_evidence__use_idf': (True, False),
    'columntransformer__tfidf_evidence__max_features': (1000,2000,3000,4000,5000,6000,7000,8000,9000,None),
    'columntransformer__tfidf_evidence__norm': ('l1', 'l2'),
    'randomforestclassifier__criterion':('giny','entropy')
 }
pipeline3 = create_pipeline(clf3)
rs_RF = create_rscv(pipeline3,parameters)

In [13]:
rs_RF.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 156.5min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('binarizado_gene',
                                                                               OneHotEncoder(dtype='int',
                                                                                             handle_unknown='ignore'),
                                                                               ['Gene']),
                                                                              ('tfidf_evidence',
                                                                               TfidfVectorizer(stop_words={'a',
                                                                                                           'about',
                       

In [14]:
params_to_evaluate = ["param_columntransformer__tfidf_evidence__max_features",
                      "param_columntransformer__tfidf_evidence__ngram_range",
                      "param_columntransformer__tfidf_evidence__use_idf",
                      "param_columntransformer__tfidf_evidence__norm",
                      "param_randomforestclassifier__criterion"]
RF_res = save_results(rs_RF,params_to_evaluate)
RF_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_columntransformer__tfidf_evidence__max_features,param_columntransformer__tfidf_evidence__ngram_range,param_columntransformer__tfidf_evidence__use_idf,param_columntransformer__tfidf_evidence__norm,param_randomforestclassifier__criterion,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
2,5000.0,"(1, 1)",True,l1,entropy,871.21556,8.14463,-1.688463,0.075811,1,0.643665,1,0.5058,3,0.860355,3
0,4000.0,"(1, 2)",True,l2,entropy,916.800501,5.654871,-1.723829,0.192871,2,0.637632,3,0.506809,2,0.876764,1
3,,"(1, 1)",True,l1,entropy,1152.113806,17.60498,-1.769758,0.136692,3,0.633107,4,0.493645,4,0.859302,4
1,6000.0,"(2, 2)",False,l2,entropy,897.049993,6.941904,-1.784257,0.101149,4,0.642911,2,0.520365,1,0.865104,2
4,2000.0,"(1, 1)",True,l2,giny,829.985152,8.41672,,,5,,5,,5,,5


In [15]:
# Exportamos los resultados
RF_res.to_csv('RF.csv')

# Testing
add_res(rs_RF,'RF')

## SVC

In [16]:
from sklearn.svm import SVC
clf4 = SVC(probability=True)
parameters = {
    'columntransformer__tfidf_evidence__ngram_range': ((1, 1),(2, 2),(1,2)),
    'columntransformer__tfidf_evidence__use_idf': (True, False),
    'columntransformer__tfidf_evidence__max_features': (1000,2000,3000,4000,5000,6000,7000,8000,9000,None),
    'columntransformer__tfidf_evidence__norm': ('l1', 'l2'),
    'svc__kernel':('linear','rbf','sigmoid','poly')
 }
pipeline4 = create_pipeline(clf4)
rs_SVC = create_rscv(pipeline4,parameters)

In [17]:
rs_SVC.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 296.7min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=333, shuffle=True),
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('binarizado_gene',
                                                                               OneHotEncoder(dtype='int',
                                                                                             handle_unknown='ignore'),
                                                                               ['Gene']),
                                                                              ('tfidf_evidence',
                                                                               TfidfVectorizer(stop_words={'a',
                                                                                                           'about',
                       

In [20]:
params_to_evaluate = ["param_columntransformer__tfidf_evidence__max_features",
                      "param_columntransformer__tfidf_evidence__use_idf",
                      "param_columntransformer__tfidf_evidence__ngram_range",
                      "param_columntransformer__tfidf_evidence__norm",
                      "param_svc__kernel"]
SVC_res = save_results(rs_SVC,params_to_evaluate)
SVC_res.sort_values(by='mean_test_neg_log_loss',ascending=False).head(5)

Unnamed: 0,param_columntransformer__tfidf_evidence__max_features,param_columntransformer__tfidf_evidence__use_idf,param_columntransformer__tfidf_evidence__ngram_range,param_columntransformer__tfidf_evidence__norm,param_svc__kernel,mean_fit_time,std_fit_time,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,mean_test_accuracy,rank_test_accuracy,mean_test_f1_macro,rank_test_f1_macro,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
2,9000.0,False,"(2, 2)",l2,rbf,1085.558751,1.146183,-1.029789,0.009968,1,0.634992,1,0.50099,3,0.88937,2
5,2000.0,True,"(2, 2)",l2,rbf,989.852848,13.765539,-1.034687,0.010845,2,0.63273,2,0.495277,5,0.890712,1
0,1000.0,False,"(2, 2)",l2,rbf,921.3674,6.774758,-1.035005,0.01267,3,0.631599,3,0.495362,4,0.887268,4
9,,True,"(1, 2)",l2,rbf,2513.720478,155.218383,-1.044673,0.007975,4,0.631599,3,0.502675,2,0.888048,3
4,9000.0,True,"(1, 2)",l2,linear,1251.142795,11.898674,-1.055133,0.00525,5,0.625943,5,0.535414,1,0.884747,5


In [21]:
# Exportamos los resultados
SVC_res.to_csv('SVC.csv')

# Testing
add_res(rs_SVC,'SVC')

In [22]:
# Exportamos resultados totales
df_results.to_csv('df_rs.csv')
df_results

Unnamed: 0,clf,log_loss,accuracy,f1-macro,ROC
0,NB_M,1.22077,0.53012,0.259952,0.845683
1,RF,1.603281,0.623494,0.545862,0.860197
2,SVC,1.002137,0.641566,0.500013,0.885254
