In [1]:
import os

import pandas as pd

# Load the three sample tables (accidents, participants, vehicles) from CSV.
sample_accidents, sample_participants, sample_vehicles = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/sample/sample_accidents.csv',
        '../data/sample/sample_participants.csv',
        '../data/sample/sample_vehicles.csv',
    )
)

# Sanity check: print the (rows, columns) shape of each table.
for label, frame in (
    ("Accidents shape:", sample_accidents),
    ("Participants shape:", sample_participants),
    ("Vehicles shape:", sample_vehicles),
):
    print(label, frame.shape)

Accidents shape: (1000, 18)
Participants shape: (2521, 8)
Vehicles shape: (1578, 7)


In [3]:
# Drop columns that are not relevant to the analysis.
# errors='ignore' means a column that is already absent is skipped instead of raising.
sample_accidents = sample_accidents.drop(
    ['county', 'address', 'nearby'], axis='columns', errors='ignore'
)
sample_vehicles = sample_vehicles.drop(['color'], axis='columns', errors='ignore')

# Show which columns remain in each table.
for label, frame in (
    ("Accidents columns after dropping:", sample_accidents),
    ("Participants columns:", sample_participants),
    ("Vehicles columns after dropping:", sample_vehicles),
):
    print(label, frame.columns)

Accidents columns after dropping: Index(['id', 'tags', 'category', 'region', 'longitude', 'latitude', 'datetime',
       'light', 'weather', 'road_conditions', 'participants_count',
       'participant_categories', 'severity', 'dead_count', 'injured_count'],
      dtype='object')
Participants columns: Index(['accident_id', 'vehicle_id', 'participant_id', 'role', 'gender',
       'violations', 'health_status', 'years_of_driving_experience'],
      dtype='object')
Vehicles columns after dropping: Index(['accident_id', 'vehicle_id', 'category', 'brand', 'model', 'year'], dtype='object')


In [5]:
# Count missing values per column in each table.
# Nothing is imputed or dropped here: gaps are left to be handled before modeling.
for label, frame in (
    ("Accidents missing values:\n", sample_accidents),
    ("Participants missing values:\n", sample_participants),
    ("Vehicles missing values:\n", sample_vehicles),
):
    print(label, frame.isnull().sum())

Accidents missing values:
 id                         0
tags                       0
category                   0
region                     0
longitude                 10
latitude                  10
datetime                   0
light                      0
weather                    0
road_conditions            0
participants_count         0
participant_categories     0
severity                   0
dead_count                 0
injured_count              0
dtype: int64
Participants missing values:
 accident_id                       0
vehicle_id                      342
participant_id                    0
role                              0
gender                           47
violations                     1349
health_status                     5
years_of_driving_experience    1120
dtype: int64
Vehicles missing values:
 accident_id     0
vehicle_id      0
category        0
brand          48
model          48
year           57
dtype: int64


In [7]:
# Inspect the dtype pandas inferred for every column of each table.
for label, frame in (
    ("Accidents dtypes:\n", sample_accidents),
    ("Participants dtypes:\n", sample_participants),
    ("Vehicles dtypes:\n", sample_vehicles),
):
    print(label, frame.dtypes)

Accidents dtypes:
 id                          int64
tags                       object
category                   object
region                     object
longitude                 float64
latitude                  float64
datetime                   object
light                      object
weather                    object
road_conditions            object
participants_count          int64
participant_categories     object
severity                   object
dead_count                  int64
injured_count               int64
dtype: object
Participants dtypes:
 accident_id                      int64
vehicle_id                      object
participant_id                  object
role                            object
gender                          object
violations                      object
health_status                   object
years_of_driving_experience    float64
dtype: object
Vehicles dtypes:
 accident_id      int64
vehicle_id      object
category        object
brand           object

In [11]:
# Fix column dtypes.
# Parse the accident timestamp strings into datetime64 values.
sample_accidents['datetime'] = pd.to_datetime(sample_accidents['datetime'])

# Rename the vehicles' 'year' column to 'manufacture_year' so it is not confused with the
# accident-date 'year' column derived below, and cast it to the nullable 'Int64' dtype
# (the column contains missing values, which a plain int64 cannot hold).
# This is written as one rename + cast so the cell is safe to re-run: rename() ignores a
# 'year' key that is already gone and casting an Int64 column to Int64 is a no-op, whereas
# reading and then dropping 'year' raised KeyError on a second run.
sample_vehicles = (
    sample_vehicles
    .rename(columns={'year': 'manufacture_year'})
    .astype({'manufacture_year': 'Int64'})
)

# Split the accident timestamp into calendar components.
sample_accidents['year'] = sample_accidents['datetime'].dt.year
sample_accidents['month'] = sample_accidents['datetime'].dt.month
sample_accidents['day'] = sample_accidents['datetime'].dt.day

# Sanity check: show the timestamp next to its derived components.
print("Аварии с элементами даты:\n", sample_accidents[['datetime', 'year', 'month', 'day']].head())

Аварии с элементами даты:
              datetime  year  month  day
0 2018-03-11 12:45:00  2018      3   11
1 2019-09-06 02:20:00  2019      9    6
2 2015-04-28 13:10:00  2015      4   28
3 2022-11-14 10:00:00  2022     11   14
4 2018-10-28 10:30:00  2018     10   28


In [13]:
# Save the cleaned tables as CSV (without the index column).
# to_csv does not create missing directories, so make sure the output folder exists first;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs('../data/processed', exist_ok=True)

sample_accidents.to_csv('../data/processed/cleaned_accidents.csv', index=False)
sample_participants.to_csv('../data/processed/cleaned_participants.csv', index=False)
sample_vehicles.to_csv('../data/processed/cleaned_vehicles.csv', index=False)

print("Очищенные данные сохранены в data/processed/")

Очищенные данные сохранены в data/processed/
