In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Первичная предобработка данных и визуализация

Импорт исходного датасета в формате GeoDataFrame (библиотека geopandas).
Основное отличие GeoDataFrame - наличие специального столбца 'geometry', который содержит геометрические объекты, такие как точки, линии и полигоны. Это позволяет представлять сложные географические структуры в табличном формате, сохраняя при этом их пространственные характеристики.

In [None]:
# Read the stations shapefile
shapefile = gpd.read_file("/kaggle/input/rzhd-hack/selected1.shp")
# Read the track segments that connect the stations
shapefile2 = gpd.read_file("/kaggle/input/rzhd-hack/all_routes_v2.shp")

В столбце maxspeed оказалось много пустых значений. Учитывая, что 90 — максимальная разрешённая скорость для товарных поездов, по умолчанию взяли 80, так как на валидационной выборке большинство результатов было сильно занижено (обратите внимание: в коде ниже пропуски заполняются значением 90).

Это привело к увеличению целевой метрики на 2%.

In [None]:
# Fallback speed used for segments that have no maxspeed value.
# NOTE(review): the markdown above mentions 80 as the chosen default, but the
# code has always filled gaps with 90 — confirm which value is intended.
DEFAULT_MAXSPEED = 90


def _parse_maxspeed(value, default=DEFAULT_MAXSPEED):
    """Convert one raw ``maxspeed`` entry into a single number.

    Handled inputs:
      * missing value (None / NaN)            -> ``default``
      * list of speeds                         -> arithmetic mean of the items
      * stringified list, e.g. "['60', '80']" -> arithmetic mean of the items
      * plain numeric string, e.g. "60"       -> that number
      * anything already numeric               -> returned unchanged

    Raises:
        ValueError: if a string item cannot be parsed as a number.
    """
    if isinstance(value, str):
        text = value.strip()
        if not text.startswith('['):
            return float(text)
        # Turn "['60', '80']" into ['60', '80'] and fall through to the list branch.
        value = [item.strip().strip("'\"") for item in text.strip('][').split(',')]
        value = [item for item in value if item]
    if isinstance(value, (list, tuple)):
        speeds = [float(item) for item in value]
        return sum(speeds) / len(speeds) if speeds else default
    if pd.isna(value):
        return default
    return value


# Assign the whole column at once: element-wise writes through
# shapefile2['maxspeed'].iloc[i] are chained assignments and are not guaranteed
# to reach the underlying frame (they are a no-op under pandas Copy-on-Write).
shapefile2['maxspeed'] = shapefile2['maxspeed'].map(_parse_maxspeed)

In [None]:
# Keep the geometry column aside and build a copy of the table without it,
# so that the attribute columns can be aggregated separately from the shapes
geometry = shapefile2.geometry
non_geometry = shapefile2.drop(columns='geometry')

# Length-weighted harmonic mean of the speed limit
def harmonic_mean(group):
    """Return total length divided by the sum of ``length / maxspeed``.

    ``group`` must expose ``'length'`` and ``'maxspeed'`` columns. When the
    summed ``length / maxspeed`` term is zero (for example, an empty group)
    the function returns 0 instead of dividing by zero.
    """
    time_like_sum = group['length'].div(group['maxspeed']).sum()
    if time_like_sum == 0:
        return 0
    return group['length'].sum() / time_like_sum

# Aggregate the attribute columns per (origin, destination) pair:
# lengths are summed, and maxspeed becomes the length-weighted harmonic mean
# of the segments in the group (x.index selects the group's rows so that
# harmonic_mean can see both 'length' and 'maxspeed').
aggregated_non_geometry = non_geometry.groupby(['origin', 'destinatio']).agg({
    'length': 'sum',
    'maxspeed': lambda x: harmonic_mean(non_geometry.loc[x.index])
}).reset_index()

# Merge the geometries of each group into a single shape with unary_union
aggregated_geometry = shapefile2.groupby(['origin', 'destinatio'])['geometry'].apply(lambda x: x.unary_union).reset_index()

# Join the aggregated attributes with the aggregated geometry
aggregated_data = aggregated_non_geometry.merge(aggregated_geometry, on=['origin', 'destinatio'])

# Convert back to a GeoDataFrame. The CRS is passed explicitly because the
# groupby/merge round trip yields a plain DataFrame and the coordinate
# reference system of the source layer would otherwise be lost.
aggregated_data = gpd.GeoDataFrame(aggregated_data, geometry='geometry', crs=shapefile2.crs)

print(aggregated_data)

Отдельные графики точек и путей

In [None]:
# Stations map
shapefile.plot()
plt.show()

# Aggregated routes map (shape2 is just another name for aggregated_data)
shape2 = aggregated_data
shape2.plot()
plt.show()

In [None]:
# Geometric length of every aggregated route divided by 1000.
# NOTE(review): this is kilometres only if the layer's coordinates are in metres — confirm the CRS.
aggregated_data['length_km'] = aggregated_data.geometry.length.div(1000)

preview_columns = ['origin', 'destinatio', 'geometry', 'length_km', "maxspeed"]
print(aggregated_data[preview_columns])

Импорт тренировочного датасета, который состоит из времени поезда в пути и пунктов прибытия и отбытия.

In [None]:
# Training table; later cells read its 'route_start', 'route_end', 'index' and 'value' columns
train_df = pd.read_csv("/kaggle/input/rzhd-hack/dataset_external.csv")
train_df

Создание графа, по которому вычисляются признаки маршрута для модели (кратчайшее расстояние и минимальное время в пути).

In [None]:
all_routes = aggregated_data

# Every station that appears as either endpoint of a route
stations = set(all_routes['origin']).union(set(all_routes['destinatio']))

# G: edge weight = route length in km (used for the shortest route by distance)
# F: edge weight = length_km / maxspeed (used for the fastest route)
G = nx.Graph()
F = nx.Graph()
G.add_nodes_from(stations)
F.add_nodes_from(stations)

# nx.Graph is undirected, so a single add_edge call already makes the edge
# usable in both directions; adding the reversed pair with the same weight
# would change nothing. If the table holds both (a, b) and (b, a), the row
# that comes later overwrites the weight of the earlier one.
for index, row in all_routes.iterrows():
    G.add_edge(row['origin'], row['destinatio'], weight=row['length_km'])
    F.add_edge(row['origin'], row['destinatio'], weight=row['length_km'] / row['maxspeed'])

In [None]:
# Shortest route by distance between two stations
def find_shortest_path(start_station, end_station):
    """Find the minimum-weight route between two stations in graph ``G``.

    Returns a tuple ``(path, distance, num_edges)``:
      * ``(list_of_nodes, total_weight, len(path) - 1)`` when a route exists;
      * ``(0, 0, 0)`` when either station is not a node of ``G``;
      * ``(None, inf, inf)`` when both stations exist but are not connected.

    The path and its length come from one Dijkstra run, so they always
    describe the same route and the graph is searched only once.
    """
    if start_station not in G or end_station not in G:
        return 0, 0, 0
    try:
        # With an explicit target, single_source_dijkstra returns (distance, path)
        shortest_distance, shortest_path = nx.single_source_dijkstra(
            G, start_station, target=end_station, weight='weight'
        )
    except nx.NetworkXNoPath:
        return None, float('inf'), float('inf')  # No path found
    num_stations = len(shortest_path) - 1
    return shortest_path, shortest_distance, num_stations


def find_fastest_path(start_station, end_station):
    """Find the minimum-weight route between two stations in graph ``F``.

    Returns a tuple ``(path, total_weight, num_edges)``:
      * ``(list_of_nodes, total_weight, len(path) - 1)`` when a route exists;
      * ``(0, 0, 0)`` when either station is not a node of ``F``;
      * ``(None, inf, inf)`` when both stations exist but are not connected.

    A single Dijkstra run yields both the path and its total weight, instead
    of searching the graph once for the path and once more for the length.
    """
    if start_station not in F or end_station not in F:
        return 0, 0, 0
    try:
        # With an explicit target, single_source_dijkstra returns (distance, path)
        fastest_distance, fastest_path = nx.single_source_dijkstra(
            F, start_station, target=end_station, weight='weight'
        )
    except nx.NetworkXNoPath:
        return None, float('inf'), float('inf')  # No path found
    num_stations_fast = len(fastest_path) - 1
    return fastest_path, fastest_distance, num_stations_fast
    
    
# Add graph-derived features to every training row: weight and edge count of
# the shortest-by-distance route and of the fastest route.
for row_label, route in train_df.iterrows():
    origin, destination = route['route_start'], route['route_end']

    km_path, km_total, km_edges = find_shortest_path(origin, destination)
    time_path, time_total, time_edges = find_fastest_path(origin, destination)

    # A falsy path (None or 0) means no usable route was found; store zeros then
    if not km_path:
        km_total, km_edges = 0, 0
    if not time_path:
        time_total, time_edges = 0, 0

    train_df.at[row_label, 'optimal_distance_km'] = km_total
    train_df.at[row_label, 'num_stations'] = km_edges
    train_df.at[row_label, 'optimal_distance_hours'] = time_total
    train_df.at[row_label, 'num_stations_fast'] = time_edges
print(train_df)

Загружаем тестовый датасет:

In [None]:
# Test table; the next cell reads its 'route_start' and 'route_end' columns
test_df = pd.read_csv("/kaggle/input/rzhd-hack/dataset_internal.csv")

In [None]:
# Add the same graph-derived route features to every test row
for row_label, route in test_df.iterrows():
    origin, destination = route['route_start'], route['route_end']

    km_path, km_total, km_edges = find_shortest_path(origin, destination)
    time_path, time_total, time_edges = find_fastest_path(origin, destination)

    # A falsy path (None or 0) means no usable route was found; store zeros then
    if not km_path:
        km_total, km_edges = 0, 0
    if not time_path:
        time_total, time_edges = 0, 0

    test_df.at[row_label, 'optimal_distance_km'] = km_total
    test_df.at[row_label, 'num_stations'] = km_edges
    test_df.at[row_label, 'optimal_distance_hours'] = time_total
    test_df.at[row_label, 'num_stations_fast'] = time_edges

# Show the updated test_df
print(test_df)

## Построение предсказательной модели

In [None]:
# 'value' is the prediction target; every other column except 'index' is a feature
y = train_df['value']
X = train_df.drop(columns=["index", 'value'])

# Hold out 0.1% of the rows as a validation set (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.001)
X

In [None]:
catboost_params = {
    'iterations': 1500,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'RMSE',
    'l2_leaf_reg': 3,
    'leaf_estimation_iterations': 10,
    'random_seed': 42
}

# Wrap the data in CatBoost Pools; the station columns are categorical features
train_pool = Pool(data=X_train, label=y_train, cat_features=['route_start', "route_end"])
val_pool = Pool(data=X_val, label=y_val, cat_features=['route_start', "route_end"])

# 5-fold cross-validation on the training pool
cv_results = cv(pool=train_pool,
                params=catboost_params,
                fold_count=5,
                verbose=100,
                plot=True)

# Take the best iteration from the CV curve instead of a hard-coded number, so
# the reported optimum and the iteration count reused for the final model
# always match the current data. Rows of cv_results are 0-based iterations,
# hence the +1 to obtain a number of trees.
best_row = cv_results['test-RMSE-mean'].idxmin()
best_iteration = int(best_row) + 1

best_rmse_mean = cv_results.loc[best_row, 'test-RMSE-mean']
best_rmse_std = cv_results.loc[best_row, 'test-RMSE-std']  # spread at the best iteration, not averaged over all of them
print(f"Best RMSE: {best_rmse_mean:.2f} ± {best_rmse_std:.2f} at iteration {best_iteration}")

In [None]:
# Final CatBoostRegressor: same hyper-parameters as in cross-validation,
# with the number of iterations taken from best_iteration
best_model = CatBoostRegressor(iterations=best_iteration,
                               learning_rate=0.1,
                               depth=6,
                               l2_leaf_reg=3,
                               leaf_estimation_iterations=10,
                               loss_function='RMSE',
                               random_seed=42)

# Train once on the full dataset (X, y) through a Pool that declares the
# categorical station columns. A second fit(X, y) without cat_features must not
# follow: it would either fail on non-numeric station ids or silently replace
# this model with one that treats the ids as plain numbers.
train_pool = Pool(data=X, label=y, cat_features=['route_start', "route_end"])
best_model.fit(train_pool, verbose=100)  # log progress every 100 iterations

Метрики:

In [None]:
# Predict on the validation split.
# NOTE(review): best_model was fitted on the full X, which includes X_val, so
# these numbers are optimistic and are not a true hold-out estimate.
y_pred = best_model.predict(X_val)

# RMSE is computed as the square root of MSE rather than via squared=False,
# because that keyword is deprecated and removed in recent scikit-learn releases.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
mae = mean_absolute_error(y_val, y_pred)

print(f"RMSE: {rmse:.2f} hours")
print(f"MAE: {mae:.2f} hours")

Предсказания:

In [None]:
# Feed the model exactly the columns it was trained on, in the same order.
# The training features were built by dropping 'index' and 'value' from
# train_df, so passing the whole test_df could misalign or break prediction.
# A missing feature column raises KeyError here instead of failing later.
feature_columns = list(best_model.feature_names_)
test_pool = Pool(data=test_df[feature_columns], cat_features=['route_start', "route_end"])

test_pred = best_model.predict(test_pool)

# Show the raw predictions
print(test_pred)

In [None]:
# Re-read the test file so the submission has only the file's own columns
# (test_df now also carries the graph features added above)
test_csv = pd.read_csv("/kaggle/input/rzhd-hack/dataset_internal.csv")
test_csv

In [None]:
# Attach the predictions by position; test_csv comes from the same file as
# test_df, so the row order is expected to match
test_csv["value_predict"] = test_pred
# Saved to the working directory; by default pandas also writes the row index as the first column
test_csv.to_csv("cb_predict.csv")