In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn import metrics

## Procesamiento de datos

In [2]:
# Importamos los datos utilizando pandas
datos=pd.read_csv("./data/data.csv")

# Creo la tabla de candidatos a mano
candidatos=pd.DataFrame(
    [
        [1,'Oscar Andrade', 'Frente Amplio'],
        [2,'Mario Bergara', 'Frente Amplio'],
        [3,'Carolina Cosse', 'Frente Amplio'],
        [4,'Daniel Martínez', 'Frente Amplio'],
        [5,'Verónica Alonso', 'Partido Nacional'],
        [6,'Enrique Antía', 'Partido Nacional'],
        [8,'Carlos Iafigliola', 'Partido Nacional'],
        [9,'Luis Lacalle Pou', 'Partido Nacional'],
        [10,'Jorge Larrañaga', 'Partido Nacional'],
        [11,'Juan Sartori', 'Partido Nacional'],
        [12,'José Amorín', 'Partido Colorado'],
        [13,'Pedro Etchegaray', 'Partido Colorado'],
        [14,'Edgardo Martínez', 'Partido Colorado'],
        [15,'Héctor Rovira', 'Partido Colorado'],
        [16,'Julio María Sanguinetti', 'Partido Colorado'],
        [17,'Ernesto Talvi', 'Partido Colorado'],
        [18,'Pablo Mieres', 'La Alternativa'],
        [19,'Gonzalo Abella', 'Unidad Popular'],        
        [20,'Edgardo Novick', 'Partido de la Gente'],
        [21,'Cèsar Vega', 'PERI'],
        [22,'Rafael Fernández', 'Partido de los Trabajadores'],
        [23,'Justin Graside', 'Partido Digital'],        
        [24,'Gustavo Salle', 'Partido Verde'],
        [25,'Carlos Techera', 'Partido de Todos']
    ],
    columns=['candidatoId','name','party'],
)

datos=datos.merge(candidatos,on=['candidatoId'])

# Sólo por si necesita, cargamos un diccionario con el texto de cada pregunta
preguntas={
    '1': 'Controlar la inflación es más importante que controlar el desempleo. ',
    '2': 'Hay que reducir la cantidad de funcionarios pùblicos',
    '3': 'Deberia aumentar la carga de impuestos para los ricos.',
    '4': 'El gobierno no debe proteger la industria nacional, si las fábricas no son competitivas esta bien que desaparezcan.',
    '5': 'La ley de inclusión financiera es positiva para la sociedad. ',
    '6': 'Algunos sindicatos tienen demasiado poder. ',
    '7': 'Cuanto más libre es el mercado, más libre es la gente. ',
    '8': 'El campo es y debe ser el motor productivo de Uruguay. ',
    '9': 'La inversión extranjera es vital para que Uruguay alcance el desarrollo. ',
    '10': 'Los supermercados abusan del pueblo con sus precios excesivos. ',
    '11': 'Con la vigilancia gubernamental (escuchas telefonicas, e-mails y camaras de seguridad) el que no tiene nada que esconder, no tiene de que preocuparse. ',
    '12': 'La pena de muerte debería ser una opción para los crímenes mas serios. ',
    '13': 'Uruguay debería aprobar más leyes anti corrupción y ser más duro con los culpables. ',
    '14': 'Las FF.AA. deberían tener un rol activo en la seguridad pública. ',
    '15': 'Las carceles deberían ser administradas por organizaciones privadas. ',
    '16': 'Hay que aumentar el salario de los policias significativamente. ',
    '17': 'Para los delitos más graves hay que bajar la edad de imputabilidad a 16 años. ',
    '18': 'Uruguay no necesita un ejército. ',
    '19': 'Uruguay es demasiado generoso con los inmigrantes. ',
    '20': 'La ley trans fue un error. ',
    '21': 'El feminismo moderno no busca la igualdad sino el poder. ',
    '22': 'Para la ley no deberia diferenciarse homicidio de femicidio. ',
    '23': 'La separación de estado y religión me parece importante. ',
    '24': 'La legalización de la marihuana fue un error. ',
    '25': 'La legalización del aborto fue un error. ',
    '26': 'El foco del próximo gobierno debe ser mejorar la educación pública. '
}

# Ordeno los datos por partido y luego por candidato

datos = datos.sort_values(by=['party','name'])

# eliminate candidates with less than 1000 votes
counts_candidate = datos.name.value_counts().reset_index(name="count").query("count > 1000")
counts_candidate.columns = ["name", "count"] 
filtered_data_candidate = datos.merge(counts_candidate, on="name", how="inner")

# eliminate parties with less than 1000 votes
counts_party =  datos.party.value_counts().reset_index(name="count").query("count > 1000")
counts_party.columns = ["party", "count"] 
filtered_data_party = datos.merge(counts_party, on="party", how="inner")


data_candidate = np.array(filtered_data_candidate)
data_party = np.array(filtered_data_party)


#### Creando cropus

In [3]:
np.random.shuffle(data_party)
cant_tuples = len(data_party)
amount_training = round(cant_tuples*0.8)
training_party, test_party = data_party[:amount_training,:], data_party[amount_training:,:]

In [4]:
np.random.shuffle(data_candidate)
cant_tuples = len(data_candidate)
amount_training = round(cant_tuples*0.8)
training_candidate, test_candidate = data_candidate[:amount_training,:], data_candidate[amount_training:,:]

## Regresión logistica

In [5]:
penalization_methods = ['l1', 'l2', 'elasticnet', 'none']

In [6]:
# create classfier party

for pen in penalization_methods:
    if pen == 'elasticnet':
        classifier_party = linear_model.LogisticRegression(penalty=pen, max_iter=10000, tol=0.000001, solver='saga', multi_class='multinomial', l1_ratio='0.5')
    else:
        classifier_party = linear_model.LogisticRegression(penalty=pen, max_iter=10000, tol=0.000001, solver='saga', multi_class='multinomial')
    print("USING " + pen)
    # k fold cross validation training and evaluation
    kf = KFold(n_splits = 5, shuffle=False)
    for train_index, test_index in kf.split(training_party):
        # get the current training and validation
        party_train, party_test = data_party[train_index], data_party[test_index]
        # strip the labels and extra metainfo
        party_train_stripped = party_train[:,2:28]
        party_test_stripped = party_test[:,2:28]
        # get only the labels
        party_train_labels = party_train[:,[30]]
        party_test_labels = party_test[:,[30]]
        # train
        classifier_party.fit(party_train_stripped, party_train_labels.ravel())
        # get the accuracy
        print(classifier_party.score(party_test_stripped, party_test_labels.ravel()))
    print()

USING l1
0.6564715414219161
0.661535345351428
0.6612641815235009
0.6665316045380876
0.6559967585089141

USING l2
0.6562689892647356
0.6619404496657889
0.6610615883306321
0.6663290113452188
0.6553889789303079

USING elasticnet
0.6562689892647356
0.6619404496657889
0.6610615883306321
0.6663290113452188
0.6553889789303079

USING none
0.6562689892647356
0.6619404496657889
0.6610615883306321
0.6663290113452188
0.6553889789303079



In [7]:
# create classfier candidate

for pen in penalization_methods:
    if pen == 'elasticnet':
        classifier_candidate = linear_model.LogisticRegression(penalty=pen, max_iter=10000, tol=0.000001, solver='saga', multi_class='multinomial', l1_ratio='0.5') 
    else:
        classifier_candidate = linear_model.LogisticRegression(penalty=pen, max_iter=10000, tol=0.000001, solver='saga', multi_class='multinomial')
    print("USING " + pen)
    # k fold cross validation training and evaluation
    kf = KFold(n_splits = 5, shuffle=False)
    np.random.shuffle(data_candidate)
    for train_index, test_index in kf.split(data_candidate):
        # get the current training and validation
        candidate_train, candidate_test = data_candidate[train_index], data_candidate[test_index]
        # strip the labels and extra metainfo
        candidate_train_stripped = candidate_train[:,2:28]
        candidate_test_stripped = candidate_test[:,2:28]
        # get only the labels
        candidate_train_labels = candidate_train[:,[29]]
        candidate_test_labels = candidate_test[:,[29]]
        # train
        classifier_candidate.fit(candidate_train_stripped, candidate_train_labels.ravel())
        # get the accuracy
        print(classifier_candidate.score(candidate_test_stripped, candidate_test_labels.ravel()))
    print()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=1, penalty='l1', random_state=None, solver='saga',
          tol=1e-06, verbose=0, warm_start=False)
USING l1
0.3940911367050576
0.385411450509097
0.38607911867801703
0.3940911367050576
0.38263772954924874

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=1e-06, verbose=0, warm_start=False)
USING l2
0.4021031547320981
0.3875813720580871
0.37973627107327657
0.386246035720247
0.38447412353923205

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=1, penalty='elasticnet', random_state=None, solver='saga',
          tol=1e-06, ve

## Análisis de componentes principales

In [8]:
party_accum_dict = {'Frente Amplio': 0, 'Partido Nacional': 0, 'Partido Colorado': 0, 'La Alternativa': 0, 'Unidad Popular': 0, 'Partido de la Gente': 0, 'PERI': 0, 'Partido de los Trabajadores': 0, 'Partido Digital': 0, 'Partido Verde': 0, 'Partido de Todos': 0}
party_color_dict = {'Frente Amplio': '#15b01a', 'Partido Nacional': '#0343df', 'Partido Colorado': '#e50000', 'La Alternativa': '#029386', 'Unidad Popular': '#f97306', 'Partido de la Gente': '#ffff15', 'PERI': '#033500', 'Partido de los Trabajadores': '#ceb301', 'Partido Digital': '#0cff0c', 'Partido Verde': '#fe01b1', 'Partido de Todos': '#0485d1'}
for key in party_accum_dict:
    party_accum_dict[key] = len(datos[datos.party == key])
sorted_accum_dict_keys = sorted(party_accum_dict)
for i in range(1, len(sorted_accum_dict_keys)):
    party_accum_dict[sorted_accum_dict_keys[i]] += party_accum_dict[sorted_accum_dict_keys[i - 1]]

In [9]:
respuestas = datos[[str(i) for i in range(1,27)]]

In [10]:
matriz_respuestas = np.matrix(respuestas)
matriz_respuestas = matriz_respuestas.T

# Las filas de la matriz son los atributos. Las instancias están por columnas.

### Matriz de covarianza

In [11]:
valor_medio = np.mean(matriz_respuestas, axis=1)
matriz_respuestas_rescalada = matriz_respuestas - valor_medio

In [12]:
matriz_covarianza = np.cov(matriz_respuestas_rescalada)
valores_propios, vectores_propios = np.linalg.eig(matriz_covarianza)

In [13]:
eigen_pair = [(np.abs(valores_propios[i]), vectores_propios[:,i], i) for i in range(len(valores_propios))]
eigen_pair.sort()
eigen_pair.reverse()

In [14]:
#n is the amount of components
n = 6
matriz_w = np.hstack([(eigen_pair[i][1].reshape(26,1)) for i in range(0,n)])

In [15]:
transformed = np.dot(matriz_respuestas_rescalada.T, matriz_w).T

In [16]:
np.random.shuffle(data_candidate)
cant_tuples = len(data_candidate)
amount_training = round(cant_tuples*0.8)
training_candidate, test_candidate = data_candidate[:amount_training,:], data_candidate[amount_training:,:]

In [None]:
#Candidate

# Obtain corpus partitions, note this gets all columns
np.random.shuffle(data_candidate)
cant_tuples = len(data_candidate)
amount_training = round(cant_tuples*0.8)
training_candidate, test_candidate = data_candidate[:amount_training,:], data_candidate[amount_training:,:]

# create classifier
classifier_candidate = linear_model.LogisticRegression(penalty='l1', max_iter=10000, tol=0.000001, solver='saga', multi_class='multinomial')

# n is the amount of columns for PCA
for n in range(1, 27):
    matriz_w = np.hstack([(eigen_pair[i][1].reshape(26,1)) for i in range(0,n)])

    transformed = np.dot(matriz_respuestas_rescalada.T, matriz_w).T

    # Create 'filter' array of desired columns
    principal_indices = [eigen_pair[i][2] for i in range(n)]
    indices = np.array([j in principal_indices for j in range(26)])
    indices = np.append([False,False],indices)
    indices = np.append(indices,np.array([False,True,False,False]))
    
    # Apply filter to data
    filtered_columns_training_candidate = training_candidate[:,indices]
    filtered_columns_testing_candidate = test_candidate[:,indices]
    
    print('USING n = ' + str(n) + ':')
    # k fold crossvalidation
    kf = KFold(n_splits = 5, shuffle=False)
    for train_index, test_index in kf.split(filtered_columns_training_candidate):
        # get the current training and validation
        candidate_train, candidate_test = filtered_columns_training_candidate[train_index], filtered_columns_training_candidate[test_index]
        
        # Strip the labels
        candidate_train_stripped = candidate_train[:,0:n]
        candidate_test_stripped = candidate_test[:,0:n]

        # Obtain the labels
        candidate_train_labels = candidate_train[:,[n]]
        candidate_test_labels = candidate_test[:,[n]]
    
        #Train the classifier
        classifier_candidate.fit(candidate_train_stripped, candidate_train_labels.ravel())

        #Evaluate
        
        print(classifier_candidate.score(candidate_test_stripped, candidate_test_labels.ravel()))
    print()

USING n = 1:
0.21468808679324014
0.21281034842478613
0.20905487168787815
0.20909849749582637
0.2080550918196995

USING n = 2:
0.28958898393490506
0.2658042979344878
0.2712288754433549
0.27295492487479134
0.2773372287145242

USING n = 3:
0.3131650323388275
0.29188399749634886
0.3073231796369706
0.3013355592654424
0.3161519198664441

USING n = 4:
0.3137909451283121
0.30398497809305236
0.3131650323388275
0.31030884808013354
0.31176961602671116

USING n = 5:
0.3336115167953265
0.31295639474233256
0.32735238890047985
0.3290901502504174
0.3313856427378965

USING n = 6:
0.34174838305862715
0.3250573753390361
0.3379929063217192
0.3332637729549249
0.34286310517529217

USING n = 7:
0.34091383267264763
0.32568328812852076
0.3388274567076987
0.33931552587646074
0.3482888146911519

USING n = 8:
0.3402879198831629
0.33110786563738787
0.34759023576048403
0.34202838063439067
0.3434891485809683

USING n = 9:
0.34007928228666806


In [None]:
#%%
#Party

# Obtain corpus partitions, note this gets all columns
np.random.shuffle(data_party)
cant_tuples = len(data_party)
amount_training = round(cant_tuples*0.8)
training_party, test_party = data_party[:amount_training,:], data_party[amount_training:,:]

# create classifier
classifier_party = linear_model.LogisticRegression(penalty='l1', max_iter=1000, tol=0.00001, solver='saga', multi_class='multinomial')

# n is the amount of columns for PCA
for n in range(1, 27):
    matriz_w = np.hstack([(eigen_pair[i][1].reshape(26,1)) for i in range(0,n)])

    transformed = np.dot(matriz_respuestas_rescalada.T, matriz_w).T

    # Create 'filter' array of desired columns
    principal_indices = [eigen_pair[i][2] for i in range(n)]
    indices = np.array([j in principal_indices for j in range(26)])
    indices = np.append([False,False],indices)
    indices = np.append(indices,np.array([False,False,True,False]))
    
    # Apply filter to data
    filtered_columns_training_party = training_party[:,indices]
    filtered_columns_testing_party = test_party[:,indices]
    
    print('USING n = ' + str(n) + ':')
    # k fold crossvalidation
    kf = KFold(n_splits = 5, shuffle=False)
    for train_index, test_index in kf.split(filtered_columns_training_party):
        # get the current training and validation
        party_train, party_test = filtered_columns_training_party[train_index], filtered_columns_training_party[test_index]
        
        # Strip the labels
        party_train_stripped = party_train[:,0:n]
        party_test_stripped = party_test[:,0:n]

        # Obtain the labels
        party_train_labels = party_train[:,[n]]
        party_test_labels = party_test[:,[n]]
    
        #Train the classifier
        classifier_party.fit(party_train_stripped, party_train_labels.ravel())

        #Evaluate
        print(classifier_party.score(party_test_stripped, party_test_labels.ravel()))
    print()

## Evaluación

In [None]:
# NON PCA PARTY.
# Obtain corpus partitions, note this gets all columns
np.random.shuffle(data_party)
cant_tuples = len(data_party)
amount_training = round(cant_tuples*0.8)
training_party, test_party = data_party[:amount_training,:], data_party[amount_training:,:]
classifier_party = linear_model.LogisticRegression(penalty='l1', max_iter=1000, tol=0.00001, solver='saga', multi_class='multinomial')

# Create 'filter' array of desired columns
indices = np.array([True for _ in range(26)])
indices = np.append([False,False],indices)
indices = np.append(indices,np.array([False,False,True,False]))

# Apply filter to data
filtered_columns_training_party = training_party[:,indices]
filtered_columns_testing_party = test_party[:,indices]

party_train_stripped = filtered_columns_training_party[:,0:26]
party_test_stripped = filtered_columns_testing_party[:,0:26]

# Obtain the labels
party_train_labels = filtered_columns_training_party[:,[26]]
party_test_labels = filtered_columns_testing_party[:,[26]]
#Train the classifier
classifier_party.fit(party_train_stripped, party_train_labels.ravel())

#Get labels non repeated:
res=[]
for elem in party_test_labels:
    res.append(elem[0])
print(list(set(res)))
#Evaluate
prediction_labels = classifier_party.predict(party_test_stripped)
accuracy = metrics.accuracy_score(party_test_labels, prediction_labels)
precision = metrics.precision_score(party_test_labels, prediction_labels,average='macro')
recall = metrics.recall_score(party_test_labels, prediction_labels,average='macro')
f_score = metrics.f1_score(party_test_labels, prediction_labels,average='macro')
confusion_matrix = metrics.confusion_matrix(party_test_labels, prediction_labels, labels=list(set(res)))
print(accuracy)
print(precision)
print(recall)
print(f_score)
print(confusion_matrix)

#La precision tira warning porque la regresión tiene clases que no clasificó, esto se detalla en el informe

In [None]:
# PCA n = 25 PARTY.

classifier_party = linear_model.LogisticRegression(penalty='l1', max_iter=1000, tol=0.00001, solver='saga', multi_class='multinomial')

# Create 'filter' array of desired columns
principal_indices = [eigen_pair[i][2] for i in range(25)]
indices = np.array([j in principal_indices for j in range(26)])
indices = np.append([False,False],indices)
indices = np.append(indices,np.array([False,False,True,False]))

# Apply filter to data
filtered_columns_training_party = training_party[:,indices]
filtered_columns_testing_party = test_party[:,indices]

print('evaluating n = 25')

party_train_stripped = filtered_columns_training_party[:,0:25]
party_test_stripped = filtered_columns_testing_party[:,0:25]

# Obtain the labels
party_train_labels = filtered_columns_training_party[:,[25]]
party_test_labels = filtered_columns_testing_party[:,[25]]
#Train the classifier
classifier_party.fit(party_train_stripped, party_train_labels.ravel())

#Get labels non repeated:
res=[]
for elem in party_test_labels:
    res.append(elem[0])
print(list(set(res)))
#Evaluate
prediction_labels = classifier_party.predict(party_test_stripped)
accuracy = metrics.accuracy_score(party_test_labels, prediction_labels)
precision = metrics.precision_score(party_test_labels, prediction_labels,average='macro')
recall = metrics.recall_score(party_test_labels, prediction_labels,average='macro')
f_score = metrics.f1_score(party_test_labels, prediction_labels,average='macro')
confusion_matrix = metrics.confusion_matrix(party_test_labels, prediction_labels, labels=list(set(res)))
print(accuracy)
print(precision)
print(recall)
print(f_score)
print(confusion_matrix)

#La precision tira warning porque la regresión tiene clases que no clasificó, esto se detalla en el informe

In [None]:
# NON PCA CANDIDATE.
# Obtain corpus partitions, note this gets all columns
np.random.shuffle(data_candidate)
cant_tuples = len(data_candidate)
amount_training = round(cant_tuples*0.8)
training_candidate, test_candidate = data_candidate[:amount_training,:], data_candidate[amount_training:,:]
classifier_candidate = linear_model.LogisticRegression(penalty='l1', max_iter=1000, tol=0.00001, solver='saga', multi_class='multinomial')

# Create 'filter' array of desired columns
indices = np.array([True for _ in range(26)])
indices = np.append([False,False],indices)
indices = np.append(indices,np.array([False,True,False,False]))

# Apply filter to data
filtered_columns_training_candidate = training_candidate[:,indices]
filtered_columns_testing_candidate = test_candidate[:,indices]

party_train_stripped = filtered_columns_training_candidate[:,0:26]
party_test_stripped = filtered_columns_testing_candidate[:,0:26]

# Obtain the labels
party_train_labels = filtered_columns_training_candidate[:,[26]]
party_test_labels = filtered_columns_testing_candidate[:,[26]]
#Train the classifier
classifier_candidate.fit(party_train_stripped, party_train_labels.ravel())

#Get labels non repeated:
res=[]
for elem in candidate_test_labels:
    res.append(elem[0])
print(list(set(res)))
#Evaluate
prediction_labels = classifier_candidate.predict(party_test_stripped)
accuracy = metrics.accuracy_score(party_test_labels, prediction_labels)
precision = metrics.precision_score(party_test_labels, prediction_labels,average='macro')
recall = metrics.recall_score(party_test_labels, prediction_labels,average='macro')
f_score = metrics.f1_score(party_test_labels, prediction_labels,average='macro')
confusion_matrix = metrics.confusion_matrix(party_test_labels, prediction_labels, labels=list(set(res)))
print(accuracy)
print(precision)
print(recall)
print(f_score)
print(confusion_matrix)

In [None]:
# PCA n=25 CANDIDATE.
# Obtain corpus partitions, note this gets all columns
np.random.shuffle(data_candidate)
cant_tuples = len(data_candidate)
amount_training = round(cant_tuples*0.8)
training_candidate, test_candidate = data_candidate[:amount_training,:], data_candidate[amount_training:,:]
classifier_candidate = linear_model.LogisticRegression(penalty='l1', max_iter=1000, tol=0.00001, solver='saga', multi_class='multinomial')

principal_indices = [eigen_pair[i][2] for i in range(25)]
indices = np.array([j in principal_indices for j in range(26)])
indices = np.append([False,False],indices)
indices = np.append(indices,np.array([False,True,False,False]))

# Apply filter to data
filtered_columns_training_candidate = training_candidate[:,indices]
filtered_columns_testing_candidate = test_candidate[:,indices]

party_train_stripped = filtered_columns_training_candidate[:,0:25]
party_test_stripped = filtered_columns_testing_candidate[:,0:25]

# Obtain the labels
party_train_labels = filtered_columns_training_candidate[:,[25]]
party_test_labels = filtered_columns_testing_candidate[:,[25]]
#Train the classifier
classifier_candidate.fit(party_train_stripped, party_train_labels.ravel())

#Get labels non repeated:
res=[]
for elem in candidate_test_labels:
    res.append(elem[0])
print(list(set(res)))
#Evaluate
prediction_labels = classifier_candidate.predict(party_test_stripped)
accuracy = metrics.accuracy_score(party_test_labels, prediction_labels)
precision = metrics.precision_score(party_test_labels, prediction_labels,average='macro')
recall = metrics.recall_score(party_test_labels, prediction_labels,average='macro')
f_score = metrics.f1_score(party_test_labels, prediction_labels,average='macro')
confusion_matrix = metrics.confusion_matrix(party_test_labels, prediction_labels, labels=list(set(res)))
print(accuracy)
print(precision)
print(recall)
print(f_score)
print(confusion_matrix)

## Clasificar por partido

### Generar mejor clasificador de partidos

##### Obtener mejor clasificador de candidatos

In [None]:
# Obtain corpus partitions, note this gets all columns
np.random.shuffle(data_candidate)
cant_tuples = len(data_candidate)
amount_training = round(cant_tuples*0.8)
training_candidate, test_candidate = data_candidate[:amount_training,:], data_candidate[amount_training:,:]

# Create classifier
classifier_candidate = linear_model.LogisticRegression(penalty='l1', max_iter=10000, tol=0.000001, solver='saga', multi_class='multinomial')

# Only use the n best columns according to PCA
PCA = True
if PCA:
    best_n = 25
else:
    best_n = 26

matriz_w = np.hstack([(eigen_pair[i][1].reshape(26,1)) for i in range(0,best_n)])

transformed = np.dot(matriz_respuestas_rescalada.T, matriz_w).T

# Create 'filter' array of desired columns
principal_indices = [eigen_pair[i][2] for i in range(best_n)]
indices = np.array([j in principal_indices for j in range(26)])
indices = np.append([False,False],indices)
indices = np.append(indices,np.array([False,True,False,False]))

# Apply filter to data
filtered_columns_training_candidate = training_candidate[:,indices]
filtered_columns_testing_candidate = test_candidate[:,indices]

candidate_train, candidate_test = filtered_columns_training_candidate[train_index], filtered_columns_training_candidate[test_index]

# Strip the labels
candidate_train_stripped = candidate_train[:,0:best_n]
candidate_test_stripped = candidate_test[:,0:best_n]

# Obtain the labels
candidate_train_labels = candidate_train[:,[best_n]]
candidate_test_labels = candidate_test[:,[best_n]]

#Train the classifier
classifier_candidate.fit(candidate_train_stripped, candidate_train_labels.ravel())

In [None]:
# Generate the candidates dictionary
candidates = filtered_data_candidate.copy()
del candidates['candidatoId']
candidates_dict = {}
for elem in candidates.to_dict('records'):
    candidates_dict[elem['name']] = elem['party']

In [None]:
# Define the function that returns the party from the candidate
def classify_party(classified_candidate):
    return candidates_dict[classified_candidate]

In [None]:
classified_candidates_test = classifier_candidate.predict(candidate_test_stripped)

In [None]:
map_func = np.vectorize(classify_party)
classified_entries = map_func(classified_candidates_test)
print(classified_entries)

### Evaluate the classifier for parties that uses the candidate classifier

In [None]:
#Evaluate
prediction_labels = classified_entries
party_test_labels = map_func(candidate_test_labels)
labels = list(candidates_dict.values())
labels = np.unique(labels)
print(labels)
accuracy = metrics.accuracy_score(party_test_labels, prediction_labels)
precision = metrics.precision_score(party_test_labels, prediction_labels,average='macro')
recall = metrics.recall_score(party_test_labels, prediction_labels,average='macro')
f_score = metrics.f1_score(party_test_labels, prediction_labels,average='macro')
confusion_matrix = metrics.confusion_matrix(party_test_labels, prediction_labels, labels=labels)
print(accuracy)
print(precision)
print(recall)
print(f_score)
print(confusion_matrix)