# Comparación de métodos de ordenamiento para la clasificación de resultados deportivos por puntuación o tiempo

Algunas funciones utilizadas

In [38]:
import random

def toSeconds(time):
    """Convert an ``H:MM:SS`` time string into a whole number of seconds.

    Parameters
    ----------
    time : str or None
        Time formatted as ``hours:minutes:seconds`` (e.g. ``'0:15:47'``).
        A value containing ``'-'`` is treated as a missing time. ``None``
        and NaN are also treated as missing instead of raising.

    Returns
    -------
    int or None
        Total seconds, or ``None`` when the time is missing.

    Raises
    ------
    ValueError
        If the string does not split into exactly three integer fields.
    TypeError
        If ``time`` is a non-string value other than ``None``/NaN.
    """
    # NaN is the only value that is not equal to itself; this catches empty
    # cells without swallowing genuine numbers (which still raise below).
    if time is None or time != time:
        return None
    if '-' in time:
        return None
    hours, minutes, seconds = map(int, time.split(':'))
    return hours * 3600 + minutes * 60 + seconds

## Clasificación de resultados por tiempo

### Descarga del dataset desde Kaggle

Decidimos utilizar un conjunto de datos de Kaggle ([Finishers Boston Marathon 2015, 2016 & 2017](https://www.kaggle.com/datasets/rojour/boston-results/data?select=marathon_results_2016.csv)) para llevar a cabo pruebas de rendimiento con los métodos de ordenamiento seleccionados. Este conjunto de datos proporciona los tiempos parciales y finales de los corredores en distintas distancias de la Maratón de Boston de 2016.

In [10]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd

# Relative path to the 2016 results file downloaded from Kaggle.
ruta_archivo = 'kaggle/marathon_results_2016.csv'
marathon_results = pd.read_csv(filepath_or_buffer=ruta_archivo)

# Preview the first 200 rows as the cell's displayed output.
marathon_results.head(n=200)

Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 8,5K,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
0,5,"Hayle, Lemi Berhanu",21,M,Addis Ababa,,ETH,,,0:15:47,...,1:19:15,1:34:17,1:50:24,2:05:59,0:05:04,2:12:45,2:12:45,1,1,1
1,1,"Desisa, Lelisa",26,M,Ambo,,ETH,,,0:15:47,...,1:19:15,1:34:17,1:50:24,2:05:59,0:05:06,2:13:32,2:13:32,2,2,2
2,6,"Tsegay, Yemane Adhane",31,M,Addis Ababa,,ETH,,,0:15:46,...,1:19:15,1:34:45,1:50:48,2:06:47,0:05:07,2:14:02,2:14:02,3,3,3
3,11,"Korir, Wesley",33,M,Kitale,,KEN,,,0:15:46,...,1:19:16,1:34:45,1:50:48,2:06:47,0:05:07,2:14:05,2:14:05,4,4,4
4,14,"Lonyangata, Paul",23,M,Eldoret,,KEN,,,0:15:46,...,1:19:18,1:34:46,1:51:30,2:08:11,0:05:11,2:15:45,2:15:45,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,904,"Beauchamp, Thomas E",27,M,Smyrna,GA,USA,,,0:18:34,...,1:33:42,1:53:48,2:14:23,2:34:00,0:06:13,2:42:55,2:42:55,196,182,158
196,887,"Quick, Michael",28,M,Saint Louis,MO,USA,,,0:19:31,...,1:36:20,1:55:26,2:15:01,2:34:23,0:06:13,2:42:57,2:42:57,197,183,159
197,315,"Weston, Rob",27,M,Middletown,CT,USA,,,0:17:49,...,1:30:59,1:50:49,2:11:17,2:32:44,0:06:14,2:43:01,2:43:01,198,184,160
198,1164,"Gervais, David A Jr.",25,M,Nashua,NH,USA,,,0:18:47,...,1:33:05,1:52:59,2:13:30,2:33:55,0:06:14,2:43:02,2:43:02,199,185,161


### Preprocesamiento de los datos

Para facilitar las pruebas realizaremos un preprocesamiento de los datos, en el cual definiremos qué columnas nos son útiles y eliminaremos los valores que puedan interferir con el experimento.

Veamos el tamaño del dataset

In [21]:
# Report the (rows, columns) size of the loaded frame.
print('Marathon results shape: {}'.format(marathon_results.shape))

Marathon results shape: (26630, 24)


Veamos qué columnas tenemos y el tipo de dato almacenado en cada una de ellas

In [22]:
columns = marathon_results.columns

# One line per column: the name right-aligned in 20 characters, then its dtype.
for column in columns:
    print(f'{column:>20}', marathon_results[column].dtype)

                 Bib object
                Name object
                 Age int64
                 M/F object
                City object
               State object
             Country object
             Citizen object
          Unnamed: 8 object
                  5K object
                 10K object
                 15K object
                 20K object
                Half object
                 25K object
                 30K object
                 35K object
                 40K object
                Pace object
           Proj Time object
       Official Time object
             Overall int64
              Gender int64
            Division int64


Para realizar las pruebas con los métodos de ordenamiento seleccionados necesitaremos solo las columnas: Bib, 5K, 10K, 15K, 20K, Half, 25K, 30K, 35K y 40K. Dichas columnas representan el número asignado al corredor y los distintos tiempos que demoró en recorrer las diferentes distancias.

In [23]:
# Columns that are not needed for the sorting experiments.
columns_to_delete = ['Name', 'Age', 'M/F', 'City', 'State', 'Country', 'Citizen', 'Unnamed: 8', 'Pace', 'Proj Time',
                     'Official Time', 'Overall', 'Gender', 'Division']

# errors='ignore' drops only the labels that are still present, so this cell
# can be re-run after the columns have already been removed without a KeyError.
marathon_results = marathon_results.drop(columns=columns_to_delete, errors='ignore')

In [24]:
columns = marathon_results.columns

# One line per remaining column: right-aligned name (20 characters), then dtype.
for column in columns:
    print(f'{column:>20}', marathon_results[column].dtype)

                 Bib object
                  5K object
                 10K object
                 15K object
                 20K object
                Half object
                 25K object
                 30K object
                 35K object
                 40K object


In [25]:
# Display the first 200 rows of the reduced frame.
marathon_results.head(n=200)

Unnamed: 0,Bib,5K,10K,15K,20K,Half,25K,30K,35K,40K
0,5,0:15:47,0:31:20,0:47:07,1:03:14,1:06:45,1:19:15,1:34:17,1:50:24,2:05:59
1,1,0:15:47,0:31:21,0:47:08,1:03:14,1:06:46,1:19:15,1:34:17,1:50:24,2:05:59
2,6,0:15:46,0:31:20,0:47:07,1:03:13,1:06:44,1:19:15,1:34:45,1:50:48,2:06:47
3,11,0:15:46,0:31:21,0:47:07,1:03:14,1:06:46,1:19:16,1:34:45,1:50:48,2:06:47
4,14,0:15:46,0:31:21,0:47:08,1:03:14,1:06:46,1:19:18,1:34:46,1:51:30,2:08:11
...,...,...,...,...,...,...,...,...,...,...
195,904,0:18:34,0:37:01,0:55:36,1:14:32,1:18:42,1:33:42,1:53:48,2:14:23,2:34:00
196,887,0:19:31,0:38:58,0:58:26,1:17:38,1:21:45,1:36:20,1:55:26,2:15:01,2:34:23
197,315,0:17:49,0:35:42,0:53:51,1:12:22,1:16:25,1:30:59,1:50:49,2:11:17,2:32:44
198,1164,0:18:47,0:36:58,0:55:28,1:14:07,1:18:11,1:33:05,1:52:59,2:13:30,2:33:55


Ahora podemos ver que los tiempos están dados en formato horas:minutos:segundos, necesitamos hacer la conversión a segundos, así quedaríamos con números enteros un poco más trabajables.

In [29]:
# Columns whose values are passed through toSeconds.
time_columns = [
    '5K', '10K', '15K', '20K', 'Half',
    '25K', '30K', '35K', '40K',
]

# Replace each column with its element-wise conversion to seconds.
for column in time_columns:
    marathon_results[column] = marathon_results[column].map(toSeconds)

# Display the first 200 converted rows.
marathon_results.head(n=200)

Unnamed: 0,Bib,5K,10K,15K,20K,Half,25K,30K,35K,40K
0,5,947.0,1880.0,2827.0,3794.0,4005.0,4755.0,5657.0,6624.0,7559.0
1,1,947.0,1881.0,2828.0,3794.0,4006.0,4755.0,5657.0,6624.0,7559.0
2,6,946.0,1880.0,2827.0,3793.0,4004.0,4755.0,5685.0,6648.0,7607.0
3,11,946.0,1881.0,2827.0,3794.0,4006.0,4756.0,5685.0,6648.0,7607.0
4,14,946.0,1881.0,2828.0,3794.0,4006.0,4758.0,5686.0,6690.0,7691.0
...,...,...,...,...,...,...,...,...,...,...
195,904,1114.0,2221.0,3336.0,4472.0,4722.0,5622.0,6828.0,8063.0,9240.0
196,887,1171.0,2338.0,3506.0,4658.0,4905.0,5780.0,6926.0,8101.0,9263.0
197,315,1069.0,2142.0,3231.0,4342.0,4585.0,5459.0,6649.0,7877.0,9164.0
198,1164,1127.0,2218.0,3328.0,4447.0,4691.0,5585.0,6779.0,8010.0,9235.0


Para facilitar las pruebas eliminaremos filas que contengan valores nulos (None, NaN).

In [33]:
# Number of missing entries (None/NaN) in each column.
nan = marathon_results.isnull().sum()

print('Valores nulos por columna:', nan, sep='\n')

Valores nulos por columna:
Bib      0
5K      52
10K     29
15K     14
20K     23
Half    17
25K     10
30K     24
35K     12
40K     14
dtype: int64


In [35]:
columns = marathon_results.columns

# A single dropna() removes every row with a missing value in any column.
# This matches filtering column by column, but makes one pass instead of
# one pass (and one intermediate frame) per column.
marathon_results = marathon_results.dropna(how='any')

# Recount to confirm that no missing values remain.
nan = marathon_results.isna().sum()

print('Valores nulos por columna:')
print(nan)

Valores nulos por columna:
Bib     0
5K      0
10K     0
15K     0
20K     0
Half    0
25K     0
30K     0
35K     0
40K     0
dtype: int64


### Hora de probar

Para establecer un orden de llegada de los corredores en cada distancia según sus tiempos, generaremos listas de tuplas. Cada tupla contendrá el número del corredor y su respectivo tiempo de llegada.

In [40]:
def _bib_time_pairs(results, distance):
    """Pair every value of the 'Bib' column with the same row's `distance` value.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame containing a 'Bib' column and the requested `distance` column.
    distance : str
        Name of the column to pair with 'Bib' (e.g. '5K', 'Half').

    Returns
    -------
    list of tuple
        One ``(bib, time)`` tuple per row, in row order.
    """
    return [(bib, tiempo) for bib, tiempo in zip(results['Bib'], results[distance])]


# One list of (bib, time) tuples per distance column.
times_5k = _bib_time_pairs(marathon_results, '5K')
times_10k = _bib_time_pairs(marathon_results, '10K')
times_15k = _bib_time_pairs(marathon_results, '15K')
times_20k = _bib_time_pairs(marathon_results, '20K')
times_half = _bib_time_pairs(marathon_results, 'Half')
times_25k = _bib_time_pairs(marathon_results, '25K')
times_30k = _bib_time_pairs(marathon_results, '30K')
times_35k = _bib_time_pairs(marathon_results, '35K')
times_40k = _bib_time_pairs(marathon_results, '40K')

Los tiempos de llegada ya se encuentran desordenados, pero para evitar sesgos, procederemos a aleatorizar estas listas.

In [41]:
lists = [times_5k, times_10k, times_15k, times_20k, times_half, times_25k, times_30k, times_35k, times_40k]

# The loop variable must not be called `list`: that would shadow the builtin
# for the rest of the kernel session and break any later `list(...)` call.
# NOTE: no random seed is set here, so the shuffled order differs between runs.
for time_list in lists:
    random.shuffle(time_list)  # shuffles in place
    print("Lista aleatorizada:", time_list[:10])

Lista aleatorizada: [('31762', 1656.0), ('31065', 2002.0), ('15086', 1465.0), ('30829', 1778.0), ('7323', 1304.0), ('2541', 1246.0), ('F16', 1115.0), ('29791', 1687.0), ('30230', 1766.0), ('18135', 1498.0)]
Lista aleatorizada: [('17901', 2953.0), ('19970', 3242.0), ('18769', 2939.0), ('9207', 2756.0), ('6898', 2630.0), ('20911', 3092.0), ('2613', 2403.0), ('15066', 3430.0), ('4657', 2537.0), ('24411', 3252.0)]
Lista aleatorizada: [('6040', 4388.0), ('9804', 4310.0), ('24961', 5549.0), ('7535', 4075.0), ('25345', 5362.0), ('26544', 4486.0), ('24951', 5444.0), ('24151', 5146.0), ('4333', 3803.0), ('27123', 5429.0)]
Lista aleatorizada: [('18831', 6464.0), ('4460', 5463.0), ('5065', 5132.0), ('5437', 5399.0), ('1346', 4988.0), ('9668', 5543.0), ('358', 4625.0), ('23109', 6999.0), ('3832', 4979.0), ('10980', 5998.0)]
Lista aleatorizada: [('17615', 6420.0), ('5309', 5503.0), ('19323', 6115.0), ('25270', 7229.0), ('14400', 6242.0), ('3655', 5392.0), ('17819', 6195.0), ('15380', 6430.0), ('215