In [None]:
import kagglehub
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Download the OpenPowerlifting dataset from Kaggle via kagglehub and get its local directory.
path = kagglehub.dataset_download("open-powerlifting/powerlifting-database")
print("Path to dataset files:", path)
print("Files in dataset:", os.listdir(path))

# Build the CSV path with os.path.join rather than string concatenation so the
# separator is always correct, whatever form `path` is returned in.
filepath = os.path.join(path, "openpowerlifting.csv")
print("Path to main dataset:", filepath)

# NOTE(review): the saved output of this cell shows a warning raised on this
# read_csv line -- presumably pandas' DtypeWarning about mixed-type columns; confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of value types.
ds = pd.read_csv(filepath, low_memory=False)

Using Colab cache for faster access to the 'powerlifting-database' dataset.
Path to dataset files: /kaggle/input/powerlifting-database
Files in dataset: ['openpowerlifting-2024-01-06-4c732975.csv', 'openpowerlifting.csv']
Path to main dataset: /kaggle/input/powerlifting-database/openpowerlifting.csv


  ds = pd.read_csv(filepath)


In [None]:
# Dataset size (rows) and number of features (columns)
n_rows, n_features = ds.shape
print("Dataset size:", n_rows)
print("Number of features:", n_features)

Dataset size: 1423354
Number of features: 37


In [None]:
# Number of missing values, and their share of all cells in the frame
total_missing = ds.isna().sum().sum()
total_cells = ds.size  # rows * columns
missing_percentage = total_missing / total_cells * 100
print("Number of missing values:", total_missing)
print("Percentage of missing values:", round(missing_percentage, 2))

Number of missing values: 18215965
Percentage of missing values: 34.59


In [None]:
# Categorical columns: every object-dtype column except 'Date' and 'Name'.
# (Original author's note: 'categorical' could also be added to `include`,
# but this dataset has no such columns.)
object_columns = ds.select_dtypes(include='object').columns
print("Categorial columns:")
categorical_columns = object_columns.drop(labels=['Date', 'Name'])
print(list(categorical_columns))
print("Number of categorial columns:", categorical_columns.size)



Categorial columns:
['Sex', 'Event', 'Equipment', 'AgeClass', 'Division', 'WeightClassKg', 'Place', 'Tested', 'Country', 'Federation', 'MeetCountry', 'MeetState', 'MeetName']
Number of categorial columns: 13


In [None]:
# Compute the class balance of every categorical feature

def check_balance(column_name, data=None):
  """Return the percentage of rows taken by each class of a column.

  Args:
    column_name: Name of the column to inspect.
    data: DataFrame to inspect. Defaults to the notebook-level `ds`, so
      existing calls `check_balance(col)` behave as before.

  Returns:
    A list of percentages rounded to 2 decimals, ordered as
    `value_counts()` orders the classes (most frequent first). The
    denominator is the total number of rows, including rows where the
    column is missing, so the list sums to less than 100 when the column
    has missing values.
  """
  frame = ds if data is None else data
  total = len(frame)
  # value_counts() ignores missing values; tolist() yields plain Python ints.
  counts = frame[column_name].value_counts().tolist()
  return [round(count / total * 100, 2) for count in counts]

# Print the list of class shares for each categorical column, one column per line.
for col in categorical_columns:
  shares = check_balance(col)
  print(col, shares)

Sex [74.49, 25.51]
Event [75.4, 18.07, 3.98, 1.94, 0.34, 0.17, 0.09]
Equipment [55.3, 32.84, 7.29, 4.57, 0.0]
AgeClass [17.16, 9.6, 4.66, 4.5, 4.49, 3.24, 3.21, 2.53, 1.8, 1.54, 1.08, 0.61, 0.41, 0.2, 0.17, 0.08]
Division [23.74, 19.69, 8.36, 3.54, 2.08, 1.93, 1.79, 1.55, 1.37, 1.33, 1.24, 1.24, 0.97, 0.68, 0.67, 0.61, 0.61, 0.51, 0.5, 0.48, 0.47, 0.47, 0.44, 0.42, 0.4, 0.38, 0.38, 0.35, 0.34, 0.32, 0.3, 0.27, 0.26, 0.25, 0.25, 0.24, 0.24, 0.24, 0.23, 0.22, 0.22, 0.21, 0.21, 0.21, 0.2, 0.2, 0.18, 0.18, 0.18, 0.18, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.13, 0.12, 0.12, 0.12, 0.12, 0.12, 0.11, 0.11, 0.11, 0.1, 0.1, 0.1, 0.1, 0.1, 0.09, 0.09, 0.09, 0.09, 0.09, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06,

In [None]:
# Count outliers using the interquartile range (IQR)
def quantile(column_name, data=None, k=1.5):
  """Count the values of a column lying outside the IQR fences.

  A value is an outlier when it is below `Q1 - k * IQR` or above
  `Q3 + k * IQR`, where Q1/Q3 are the 25th/75th percentiles of the column.

  Args:
    column_name: Name of the numeric column to inspect.
    data: DataFrame to inspect. Defaults to the notebook-level
      `ds_positive`, so existing calls `quantile(col)` behave as before.
    k: Fence multiplier; the default 1.5 matches the original hard-coded value.

  Returns:
    The number of outlying values as a plain int. Missing values are
    never counted.
  """
  values = (ds_positive if data is None else data)[column_name]
  q1 = values.quantile(0.25)
  q3 = values.quantile(0.75)
  iqr = q3 - q1
  lower_bound = q1 - k * iqr
  upper_bound = q3 + k * iqr

  # Count through a boolean mask instead of materialising the filtered rows of
  # the whole frame. NaN compares False on both sides, so it is not counted.
  is_outlier = (values < lower_bound) | (values > upper_bound)
  return int(is_outlier.sum())

# The quantitative (numeric) features have not been listed yet, so do it here.
number_columns = ds.select_dtypes(include=['float64', 'int64']).columns
print(list(number_columns))
print("Number of number columns:", number_columns.size)

# Original author's note: a negative number denotes a failed attempt, which
# shifts the median -- so work on a copy whose numeric columns hold absolute values.
ds_positive = ds.assign(**{name: ds[name].abs() for name in number_columns})

# Now count the outliers: per-column percentage, then the mean and the maximum.
# The accumulators deliberately avoid the names `sum` and `max`: rebinding those
# at notebook level would shadow the built-ins for every later cell.
percentage_sum = 0
column_numbers = 0
max_percentage = 0
for col in number_columns:
  total_count = len(ds_positive[col].dropna())
  if total_count == 0:
    # A column with no non-missing values has no outlier share to report;
    # skipping it avoids a ZeroDivisionError.
    continue
  outsiders_count = quantile(col)
  percentage = round(outsiders_count / total_count * 100, 2)
  if percentage > max_percentage:
    max_percentage = percentage
  percentage_sum += percentage
  column_numbers += 1
  print(col, percentage)
# Guard against an empty column list before averaging.
if column_numbers:
  print(round(percentage_sum / column_numbers, 2))
  print(max_percentage)

['Age', 'BodyweightKg', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg', 'Best3SquatKg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg', 'Best3BenchKg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg', 'Best3DeadliftKg', 'TotalKg', 'Wilks', 'McCulloch', 'Glossbrenner', 'IPFPoints']
Number of number columns: 22
Age 1.33
BodyweightKg 1.02
Squat1Kg 1.05
Squat2Kg 0.9
Squat3Kg 0.86
Squat4Kg 1.41
Best3SquatKg 1.04
Bench1Kg 1.47
Bench2Kg 1.13
Bench3Kg 1.11
Bench4Kg 0.54
Best3BenchKg 1.15
Deadlift1Kg 0.04
Deadlift2Kg 0.05
Deadlift3Kg 0.08
Deadlift4Kg 0.38
Best3DeadliftKg 0.15
TotalKg 0.22
Wilks 0.06
McCulloch 0.07
Glossbrenner 0.05
IPFPoints 0.83
0.68
1.47
