In [1]:
import pandas
import json

import zipfile
import numpy as np
import pandas as pd

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import re

In [2]:
data_frame = pandas.read_csv(
    './data_in/BixiMontrealRentals/2017/Stations_2017.csv',
#    parse_dates=[
#        'last_update',
#        'input_date',
#        'date_photo',
#    ],
    infer_datetime_format=True,
    na_values=['']
)

In [3]:
data_frame[data_frame.columns[:10]].head()

Unnamed: 0,code,name,latitude,longitude
0,7015,LaSalle / 4e avenue,45.43074,-73.591911
1,6714,LaSalle / Sénécal,45.434434,-73.586694
2,6712,LaSalle / Crawford,45.437914,-73.58274
3,6715,Natatorium (LaSalle / Rolland),45.444408,-73.575568
4,7048,Métro Angrignon,45.446534,-73.603541


In [4]:
data_frame.dtypes

code           int64
name          object
latitude     float64
longitude    float64
dtype: object

In [5]:
json_result_string = data_frame.to_json(
    orient='records', 
    double_precision=12,
    date_format='iso'
)
json_result = json.loads(json_result_string)

In [6]:
geojson = {
    'type': 'FeatureCollection',
    'features': []
}
for record in json_result:
    geojson['features'].append({
        'type': 'Feature',
        'id': record['code'],
        'geometry': {
            'type': 'Point',
            'coordinates': [record['longitude'], record['latitude']],
        },
        'properties': record,
    })

In [7]:
with open('./data_out/Stations_2017.geojson', 'w') as f:
    f.write(json.dumps(geojson, indent=2))

In [14]:
resp = urlopen('http://montreal.bixi.com/c/bixi/file_db/data_all.file/BixiMontrealRentals2017.zip')
zipfiles = ZipFile(BytesIO(resp.read()))
zipfiles.namelist()
# ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']

['2017/',
 '2017/OD_2017-04.csv',
 '2017/OD_2017-05.csv',
 '2017/OD_2017-06.csv',
 '2017/OD_2017-07.csv',
 '2017/OD_2017-08.csv',
 '2017/OD_2017-09.csv',
 '2017/OD_2017-10.csv',
 '2017/OD_2017-11.csv',
 '2017/Stations_2017.csv']

In [22]:
data_bixi_concatenate=[]

pattern = re.compile("^[0-9]{4}/OD_[0-9]{4}-[0-9]{2}.csv$")

count=0;
for file in (zipfiles.namelist()):
    if(pattern.match(file)):
        print(file)
        if (count == 0):
            data_bixi_concatenate = pd.read_csv(zipfiles.open(file))
            count = 1
        else:
            my_df = pd.read_csv(zipfiles.open(file))
            data_bixi_concatenate = (pd.DataFrame(np.concatenate((data_bixi_concatenate,my_df),axis=0), columns=list(my_df.columns.values)))

2017/OD_2017-04.csv
2017/OD_2017-05.csv
2017/OD_2017-06.csv
2017/OD_2017-07.csv
2017/OD_2017-08.csv
2017/OD_2017-09.csv
2017/OD_2017-10.csv
2017/OD_2017-11.csv


In [23]:
data_bixi_concatenate[data_bixi_concatenate.columns[:10]].head()

Unnamed: 0,start_date,start_station_code,end_date,end_station_code,duration_sec,is_member
0,2017-04-15 00:00,7060,2017-04-15 00:31,7060,1841,1
1,2017-04-15 00:01,6173,2017-04-15 00:10,6173,553,1
2,2017-04-15 00:01,6203,2017-04-15 00:04,6204,195,1
3,2017-04-15 00:01,6104,2017-04-15 00:06,6114,285,1
4,2017-04-15 00:01,6174,2017-04-15 00:11,6174,569,1


In [24]:
data_bixi_concatenate[data_bixi_concatenate.columns[:10]].tail()

Unnamed: 0,start_date,start_station_code,end_date,end_station_code,duration_sec,is_member
4740352,2017-11-15 23:57,6182,2017-11-16 00:11,6159,840,0
4740353,2017-11-15 23:58,6013,2017-11-16 00:04,6023,363,1
4740354,2017-11-15 23:58,6128,2017-11-16 00:15,6411,1014,1
4740355,2017-11-15 23:58,6748,2017-11-16 00:27,6349,1752,1
4740356,2017-11-15 23:59,6112,2017-11-16 00:06,6173,460,1


In [52]:
data_bixi_concatenate_flow = data_bixi_concatenate[['start_station_code','end_station_code']]
data_bixi_concatenate_flow['flow']=1
data_bixi_concatenate_flow[data_bixi_concatenate_flow.columns[:10]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,start_station_code,end_station_code,flow
0,7060,7060,1
1,6173,6173,1
2,6203,6204,1
3,6104,6114,1
4,6174,6174,1


In [72]:
col_flow = data_bixi_concatenate_flow.groupby(['start_station_code','end_station_code'])['flow'].sum() #multiple index serie

#make dataframe from preceding multiple index serie
df_flow=pd.DataFrame(col_flow)
df_flow.reset_index(inplace=True) 

In [73]:
df_flow[df_flow.columns[:10]].tail()

Unnamed: 0,start_station_code,end_station_code,flow
194173,10002,7076,14
194174,10002,7077,7
194175,10002,7079,4
194176,10002,7080,11
194177,10002,10002,249


In [76]:
df_flow.to_csv('data_out/bixi_flow.csv', sep=',', index=False)