In [188]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import folium
import geojson
from IPython.display import HTML
from ipywidgets import widgets, fixed
from IPython.display import HTML
import datetime
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# 1. Подготовка данных

Загрузим данные по регионам

In [6]:
# Region table; the file is semicolon-separated.
regions_df = pd.read_csv('regions.csv', delimiter=';')

Составим объект GeoJSON, по которому будет строиться распределение числа поездок на картах Folium

In [11]:
# Bounds passed to folium.Map as min/max longitude and latitude.
# NOTE(review): assumes columns 1..4 of regions_df are west, east, south, north
# and that the regions form a 50-wide grid -- TODO confirm against regions.csv.
lonPoints = np.append(
    regions_df.iloc[::50, 1].values,  # column 1 of every 50th row
    regions_df.iloc[-1, 2],           # column 2 of the last row closes the range
)
latPoints = np.append(
    regions_df.iloc[:50, 3].values,   # column 3 of the first 50 rows
    regions_df.iloc[-1, 4],           # column 4 of the last row closes the range
)

regions_geo = 'map.geojson'


def region2geojson(reg_df, save_fnm='map.geojson'):
    """Write the rectangular regions of `reg_df` to a GeoJSON file.

    Every row becomes one Feature: its geometry is a closed rectangular
    Polygon built from the row's `west`, `east`, `south` and `north` values,
    and its `id` is the row's `region` value.

    Parameters
    ----------
    reg_df : pandas.DataFrame
        Must provide the columns `region`, `west`, `east`, `south`, `north`.
    save_fnm : str
        Path of the GeoJSON file to create or overwrite.
    """
    # Iterate explicitly instead of using `DataFrame.apply` for its side effect:
    # older pandas releases could invoke the applied function twice on the first
    # row, which would append that region's polygon twice.
    features = []
    for _, row in reg_df.iterrows():
        # Single linear ring; the first corner is repeated to close it.
        ring = [
            [row['west'], row['south']],
            [row['east'], row['south']],
            [row['east'], row['north']],
            [row['west'], row['north']],
            [row['west'], row['south']],
        ]
        features.append(
            geojson.Feature(geometry=geojson.Polygon([ring]), id=row['region'])
        )

    with open(save_fnm, 'w') as f:
        geojson.dump(geojson.FeatureCollection(features), f, sort_keys=True)


region2geojson(regions_df, regions_geo)

Обрабатываем агрегированные данные за июнь 2016 года (ноутбук с первой недели, который это делает, приложен в архиве)

In [26]:
# Aggregated trip counts, tab-separated.
agg_data = pd.read_csv('Yellow_taxi_aggregated_data_06.csv', delimiter='\t')

# Reshape to one row per `pickup_datehour` and one column per `region_number`,
# holding `trips` (pivot_table's default aggregation is applied to duplicates).
pivot_data = agg_data.pivot_table(
    values='trips',
    index='pickup_datehour',
    columns='region_number',
)

Загружаем данные предсказаний с прошлой недели за июнь (файл приложен в архиве)

In [85]:
# Forecasts table; comma is read_csv's default separator.
predicted_data = pd.read_csv('06_pred_data.csv')

In [217]:
# Preview the first rows to check the `region_<id>_pred_<h>` column layout.
predicted_data.head()

Unnamed: 0,Time,region_1075_pred_1,region_1075_pred_2,region_1075_pred_3,region_1075_pred_4,region_1075_pred_5,region_1075_pred_6,region_1076_pred_1,region_1076_pred_2,region_1076_pred_3,...,region_2119_pred_3,region_2119_pred_4,region_2119_pred_5,region_2119_pred_6,region_2168_pred_1,region_2168_pred_2,region_2168_pred_3,region_2168_pred_4,region_2168_pred_5,region_2168_pred_6
0,2016-06-01 00:00:00,16.275969,8.395596,5.22324,5.900229,6.075602,21.104652,18.784014,8.166911,6.418157,...,9.836746,4.118093,1.596856,16.479183,48.286045,14.661408,5.560575,13.777409,68.21791,100.520355
1,2016-06-01 01:00:00,10.199077,2.086159,1.090883,8.195955,20.094337,49.345028,9.194854,7.260076,10.103031,...,1.302887,6.501968,16.845543,29.875998,20.469501,14.912552,13.649523,72.135506,99.14274,111.85985
2,2016-06-01 02:00:00,2.969585,1.596949,7.932816,19.107397,42.493942,71.383575,5.818487,8.218873,24.205297,...,4.312404,12.373288,25.853992,18.350248,2.733718,18.607346,68.401474,93.75214,103.3594,118.80896
3,2016-06-01 03:00:00,2.906151,5.563781,16.212353,44.39297,68.58996,60.99687,7.041464,26.46824,67.04717,...,16.355448,29.822271,12.96048,23.071472,9.922137,66.78038,88.696945,116.331635,114.75919,92.33834
4,2016-06-01 04:00:00,7.579117,15.689338,46.95088,70.25183,57.88121,55.656567,22.201027,59.806683,121.40197,...,31.661474,16.935438,20.14892,33.026173,51.728554,92.13661,114.037056,130.32745,91.39245,91.33082


# 2. Визуализация реального и предсказанного числа поездок на карте

Зададим функции для отображения распределения числа поездок на карте и используем ipywidgets для интерактивного управления датой и временем

Т.к. в html-версии ноутбука карты не всегда отображаются хорошо, я приложил в архиве 7 картинок с распределениями (реальные и предсказанные на 1-6 часов для момента времени 1 июня 2016 года, 19 часов)

In [207]:
def show_real_june_data(date):
    """Render the observed trip counts for one hour as a choropleth map.

    The row of `pivot_data` is chosen by position, `(day - 1) * 24 + hour`.
    The resulting map is also written to
    `interactive_NYmap_real_data_<day>_<hour>.html`.

    Parameters
    ----------
    date : datetime.datetime
        Moment to display; only its `day` and `hour` are used.

    Returns
    -------
    folium.Map
        The map with the choropleth layer added.
    """
    row_position = 24 * (date.day - 1) + date.hour

    # Map centre (presumably the Empire State Building, judging by the
    # original variable names).
    centre_lat, centre_lon = 40.748817, -73.985428
    ny_map = folium.Map(
        location=[centre_lat, centre_lon],
        min_lon=lonPoints[0],
        max_lon=lonPoints[-1],
        min_lat=latPoints[0],
        max_lat=latPoints[-1],
        zoom_start=11,
    )

    trips_at_moment = pivot_data.iloc[row_position, :]
    ny_map.choropleth(
        geo_data=regions_geo,
        name='choropleth',
        data=trips_at_moment,
        columns=['region', 'trip_numbers'],
        key_on='feature.id',
        fill_color='OrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='Real number of trips',
        reset=True,
    )

    output_name = 'interactive_NYmap_real_data_{}_{}.html'.format(date.day, date.hour)
    ny_map.save(output_name)
    return ny_map

In [210]:
def show_predicted_data(date, hour):
    """Render the trips predicted `hour` hours ahead for `date` as a choropleth map.

    The forecast is read from the row of `predicted_data` located `hour` rows
    before the position of `date` (`(day - 1) * 24 + hour_of_day`), using the
    `region_<id>_pred_<hour>` columns. Regions 1..2500 that have no such
    column are shown with a value of 0. The map is also written to
    `interactive_NYmap_predicted_<hour>_data_june_<day>_<hour_of_day>.html`.

    Parameters
    ----------
    date : datetime.datetime
        Moment to display; only its `day` and `hour` are used.
    hour : int
        Forecast horizon, i.e. the `<h>` suffix of the columns to read.

    Returns
    -------
    folium.Map
        The map with the choropleth layer added.

    Raises
    ------
    ValueError
        If `date` is so early that the forecast row would lie before the
        start of the table.
    """
    index = (date.day - 1) * 24 + date.hour - hour
    if index < 0:
        # A negative position would silently wrap around to the end of the table.
        raise ValueError(
            'no ' + str(hour) + '-hour-ahead forecast is available for ' + str(date)
        )

    # One column label per grid cell, in region-id order.
    all_cells = list(range(1, 2501))
    all_columns = ['region_' + str(cell) + '_pred_' + str(hour) for cell in all_cells]

    # `reindex` keeps the columns that exist and creates the missing ones as NaN,
    # which are then zero-filled. (Selecting missing labels with `.loc` raises
    # KeyError on current pandas.) Only the needed row is reindexed.
    predicted_row = (
        predicted_data.iloc[[index]]
        .reindex(columns=all_columns)
        .fillna(0)
        .iloc[0]
    )

    # Key the values by region id so they line up with `feature.id` in the
    # GeoJSON; a default 0-based index would shift every region by one.
    ser = pd.Series(predicted_row.values, index=all_cells)

    latESB, lonESB = 40.748817, -73.985428
    mapNY = folium.Map(
        location=[latESB, lonESB],
        min_lon=lonPoints[0],
        max_lon=lonPoints[-1],
        min_lat=latPoints[0],
        max_lat=latPoints[-1],
        zoom_start=11,
    )
    mapNY.choropleth(
        geo_data=regions_geo,
        name='choropleth',
        data=ser,
        key_on='feature.id',
        fill_color='OrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='Predicted number of trips',
        reset=True,
    )
    mapNY.save(
        'interactive_NYmap_predicted_' + str(hour) + '_data_june_'
        + str(date.day) + '_' + str(date.hour) + '.html'
    )
    return mapNY

In [68]:
# Every hour of June 2016 (30 days x 24 hours), in chronological order.
datetimes = [
    datetime.datetime(2016, 6, day, hour_of_day)
    for day in range(1, 31)
    for hour_of_day in range(24)
]

Отображаем реальные данные за июнь

In [208]:
# Interactive date picker over `datetimes`; each selection calls
# show_real_june_data, which draws the map and saves it to an HTML file.
widgets.interact(show_real_june_data, date=datetimes)

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDAsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_real_june_data>

Отображаем предсказанные данные за июнь (по предсказаниям от 1 до 6 часов вперед) в тот же момент времени (1 июня 19 часов)

In [211]:
# Skip the first hour of June: show_predicted_data reads the row 1 hour before
# the selected moment, which does not exist for it.
datetimes_for_pred = datetimes[1:]
# `fixed(1)` pins the forecast horizon, so only the date gets a control.
widgets.interact(show_predicted_data, date=datetimes_for_pred, hour=fixed(1))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDEsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_predicted_data>

In [212]:
# Skip the first 2 hours of June: show_predicted_data reads the row 2 hours
# before the selected moment, which does not exist for them.
datetimes_for_pred = datetimes[2:]
# `fixed(2)` pins the forecast horizon, so only the date gets a control.
widgets.interact(show_predicted_data, date=datetimes_for_pred, hour=fixed(2))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDIsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_predicted_data>

In [213]:
# Skip the first 3 hours of June: show_predicted_data reads the row 3 hours
# before the selected moment, which does not exist for them.
datetimes_for_pred = datetimes[3:]
# `fixed(3)` pins the forecast horizon, so only the date gets a control.
widgets.interact(show_predicted_data, date=datetimes_for_pred, hour=fixed(3))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDMsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_predicted_data>

In [214]:
# Skip the first 4 hours of June: show_predicted_data reads the row 4 hours
# before the selected moment, which does not exist for them.
datetimes_for_pred = datetimes[4:]
# `fixed(4)` pins the forecast horizon, so only the date gets a control.
widgets.interact(show_predicted_data, date=datetimes_for_pred, hour=fixed(4))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDQsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_predicted_data>

In [215]:
# Skip the first 5 hours of June: show_predicted_data reads the row 5 hours
# before the selected moment, which does not exist for them.
datetimes_for_pred = datetimes[5:]
# `fixed(5)` pins the forecast horizon, so only the date gets a control.
widgets.interact(show_predicted_data, date=datetimes_for_pred, hour=fixed(5))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDUsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_predicted_data>

In [216]:
# Skip the first 6 hours of June: show_predicted_data reads the row 6 hours
# before the selected moment, which does not exist for them.
datetimes_for_pred = datetimes[6:]
# `fixed(6)` pins the forecast horizon, so only the date gets a control.
widgets.interact(show_predicted_data, date=datetimes_for_pred, hour=fixed(6))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnZGF0ZScsIG9wdGlvbnM9KGRhdGV0aW1lLmRhdGV0aW1lKDIwMTYsIDYsIDEsIDYsIDApLCBkYXRldGltZS5kYXTigKY=


<function __main__.show_predicted_data>

# 3. Визуализация временных рядов

Визуализируем графики реального и предсказанного (на 1-6 часов) рядов в зависимости от номера региона. В архив добавлены сохраненные графики для одной из ячеек (1228) для всех 6 случаев

In [220]:
# Every 6th column starting at position 1, i.e. one column per region.
# NOTE(review): relies on column 0 being a non-forecast column followed by
# six forecast columns per region -- TODO confirm against 06_pred_data.csv.
columns = predicted_data.columns[1::6]
# Region id is the second '_'-separated token of 'region_<id>_pred_<h>'.
active_cells = list(map(lambda label: int(label.split('_')[1]), columns))

In [243]:
def plot_june_data(region, hour):
    """Plot the observed and the `hour`-ahead predicted trip series of one region.

    Both series cover June 2016 from the 7th hour on. The figure is also
    saved as `<region>_<hour>pred.png`.

    Parameters
    ----------
    region : int
        Region id: a column of `pivot_data` and the `<id>` part of the
        `region_<id>_pred_<hour>` column in `predicted_data`.
    hour : int
        Forecast horizon. Must be positive, because the slice end `-hour`
        would select nothing for 0.
    """
    # The first 6 hours are dropped from both series, so the plotted window is
    # the same whichever horizon `hour` is used.
    dates = [
        datetime.datetime(2016, 6, day, hour_of_day)
        for day in range(1, 31)
        for hour_of_day in range(24)
    ][6:]
    real_data = pivot_data[region][6:]

    # The slice is shifted back by `hour` rows, so the value plotted at a given
    # moment is the one stored `hour` rows earlier.
    pred_ = predicted_data['region_' + str(region) + '_pred_' + str(hour)][6 - hour:-hour]

    plt.figure(figsize=(15, 8))
    plt.plot(dates, real_data, color='b', lw=2, label='Real data')
    plt.plot(dates, pred_, color='r', lw=2, label=str(hour) + ' hour prediction')
    plt.title('Region ' + str(region))
    plt.xlabel("Date")
    plt.ylabel("Number of trips")
    plt.legend()
    # The keyword is `bbox_inches`; the former `bbox_to_inches` was a typo, so
    # the tight bounding box was never applied.
    plt.savefig(str(region) + '_' + str(hour) + 'pred.png', bbox_inches='tight')

In [244]:
# Region picker over `active_cells`; plots real vs. 1-hour-ahead series.
widgets.interact(plot_june_data, region=active_cells, hour=fixed(1))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVnaW9uJywgb3B0aW9ucz0oMTA3NSwgMTA3NiwgMTA3NywgMTEyNSwgMTEyNiwgMTEyNywgMTEyOCwgMTEyOSzigKY=


<function __main__.plot_june_data>

In [245]:
# Region picker over `active_cells`; plots real vs. 2-hour-ahead series.
widgets.interact(plot_june_data, region=active_cells, hour=fixed(2))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVnaW9uJywgb3B0aW9ucz0oMTA3NSwgMTA3NiwgMTA3NywgMTEyNSwgMTEyNiwgMTEyNywgMTEyOCwgMTEyOSzigKY=


<function __main__.plot_june_data>

In [246]:
# Region picker over `active_cells`; plots real vs. 3-hour-ahead series.
widgets.interact(plot_june_data, region=active_cells, hour=fixed(3))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVnaW9uJywgb3B0aW9ucz0oMTA3NSwgMTA3NiwgMTA3NywgMTEyNSwgMTEyNiwgMTEyNywgMTEyOCwgMTEyOSzigKY=


<function __main__.plot_june_data>

In [247]:
# Region picker over `active_cells`; plots real vs. 4-hour-ahead series.
widgets.interact(plot_june_data, region=active_cells, hour=fixed(4))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVnaW9uJywgb3B0aW9ucz0oMTA3NSwgMTA3NiwgMTA3NywgMTEyNSwgMTEyNiwgMTEyNywgMTEyOCwgMTEyOSzigKY=


<function __main__.plot_june_data>

In [248]:
# Region picker over `active_cells`; plots real vs. 5-hour-ahead series.
widgets.interact(plot_june_data, region=active_cells, hour=fixed(5))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVnaW9uJywgb3B0aW9ucz0oMTA3NSwgMTA3NiwgMTA3NywgMTEyNSwgMTEyNiwgMTEyNywgMTEyOCwgMTEyOSzigKY=


<function __main__.plot_june_data>

In [249]:
# Region picker over `active_cells`; plots real vs. 6-hour-ahead series.
widgets.interact(plot_june_data, region=active_cells, hour=fixed(6))

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVnaW9uJywgb3B0aW9ucz0oMTA3NSwgMTA3NiwgMTA3NywgMTEyNSwgMTEyNiwgMTEyNywgMTEyOCwgMTEyOSzigKY=


<function __main__.plot_june_data>