In [63]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

## 1. Загрузка данных

In [53]:
# Read the athletes table from the local CSV file and preview its first rows.
athletes_path = './Data/athletes.csv'
athletes_df = pd.read_csv(athletes_path)
athletes_df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0


In [54]:
# Print the column dtypes and non-null counts to spot missing values.
athletes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11538 entries, 0 to 11537
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           11538 non-null  int64  
 1   name         11538 non-null  object 
 2   nationality  11538 non-null  object 
 3   sex          11538 non-null  object 
 4   dob          11537 non-null  object 
 5   height       11208 non-null  float64
 6   weight       10879 non-null  float64
 7   sport        11538 non-null  object 
 8   gold         11538 non-null  int64  
 9   silver       11538 non-null  int64  
 10  bronze       11538 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 991.7+ KB


> В данных есть пропуски: один пропуск в признаке `dob` — эту строку можно просто удалить, а пропуски в `height` и `weight` можно заполнить средним значением.

## 2. Обработка пропусков

### 2.1 Заполнение средним значением

In [55]:
# Replace missing height/weight values with the mean of the respective column.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split further down, so test rows influence the imputed values.
for numeric_column in ('height', 'weight'):
    column_mean = athletes_df[numeric_column].mean()
    athletes_df[numeric_column] = athletes_df[numeric_column].fillna(column_mean)

### 2.2 Удаление пропусков

In [56]:
# Drop every row that still contains a missing value. Rebinding the name
# instead of using inplace=True keeps the cell a plain expression over its
# input and avoids mutating the frame behind the reader's back.
athletes_df = athletes_df.dropna()

> Получился набор данных без пропусков:

In [57]:
# Re-check the non-null counts after the imputation and row removal above.
athletes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11537 entries, 0 to 11537
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           11537 non-null  int64  
 1   name         11537 non-null  object 
 2   nationality  11537 non-null  object 
 3   sex          11537 non-null  object 
 4   dob          11537 non-null  object 
 5   height       11537 non-null  float64
 6   weight       11537 non-null  float64
 7   sport        11537 non-null  object 
 8   gold         11537 non-null  int64  
 9   silver       11537 non-null  int64  
 10  bronze       11537 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 1.1+ MB


## 3. Кодировка категориальных переменных

В моём случае я буду предсказывать пол спортсмена (возьмём его как целевую переменную).

In [58]:
# Single LabelEncoder instance; the encoding cell below refits it for each
# column it encodes.
label_encoder = LabelEncoder()

In [59]:
# Integer-encode the target ('sex') and the categorical features, writing the
# codes to '<column>_encoded'.
# The shared `label_encoder` is refit for every column, so after this cell it
# only holds the last mapping (nationality). Each column's classes are saved in
# `encoded_classes` so the integer codes can still be translated back to
# labels: code i corresponds to encoded_classes[column][i].
encoded_classes = {}
for column in ('sex', 'sport', 'nationality'):
    athletes_df[f'{column}_encoded'] = label_encoder.fit_transform(athletes_df[column])
    encoded_classes[column] = label_encoder.classes_

## 4. Разделение данных на train и test

In [60]:
# Columns excluded from the feature matrix: the target (raw and encoded),
# name and id, the raw categorical columns, and dob.
non_feature_columns = ['sex', 'sex_encoded', 'name', 'id', 'sport', 'nationality', 'dob']

# Feature matrix: every remaining column.
X = athletes_df.drop(columns=non_feature_columns)

# Target vector: the integer-encoded sex.
y = athletes_df['sex_encoded']

In [61]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 5. Обучение модели логистической регрессии 

In [62]:
# Build the logistic regression (iteration cap raised to 1000) and fit it on
# the training split; fit() returns the estimator itself.
log_reg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Display the fitted estimator as the cell output.
log_reg_model

## 6. Построение ROC-AUC кривой