# Data exploration

## Imports

In [1]:
import os
import pandas as pd

## Simplifying the `status.csv` data

In [2]:
def status_file(f):
    """Return the path of block file `f` inside the "data/status" directory."""
    return os.path.join("data/status", f)


# Names of every entry in "data/status", in sorted order.
status_blocks = os.listdir("data/status")
status_blocks.sort()

In [3]:
# Load the first block on its own; its first row is used as the header.
first_block_path = status_file(status_blocks[0])
status_0 = pd.read_csv(first_block_path, header=0)

In [4]:
# "date" is the part of "time" that precedes the first space.
status_0["date"] = status_0["time"].map(lambda stamp: stamp.partition(" ")[0])

In [5]:
# For each (station, day) pair: the sum and the number of readings of every
# remaining column. Displayed only; nothing is assigned.
(
    status_0
    .drop(columns="time")
    .groupby(["station_id", "date"])
    .agg(["sum", "size"])
    .reset_index()
)

Unnamed: 0_level_0,station_id,date,bikes_available,bikes_available,docks_available,docks_available
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,size,sum,size
0,2,2013/08/29,1439,642,15895,642
1,2,2013/08/30,6674,1288,28102,1288
2,2,2013/08/31,16069,1315,19436,1315
3,2,2013/09/01,13605,1293,21306,1293
4,2,2013/09/02,14296,1272,20048,1272
5,2,2013/09/03,13777,1273,20594,1273
6,2,2013/09/04,12143,1290,22687,1290
7,2,2013/09/05,18295,1317,17264,1317
8,2,2013/09/06,4635,309,3708,309


In [6]:
from itertools import pairwise

# Column names passed as `names=` when the status block files are read.
# "date" is excluded: it is a derived column added to `status_0` after loading,
# not a column of the CSV files, so including it would hand `read_csv` one name
# more than the files have fields. errors="ignore" keeps this line working when
# `status_0` has no "date" column (e.g. the cell that adds it was not run).
columns = status_0.columns.drop("date", errors="ignore")

In [20]:
# Build the per-(station, day) sums and counts of the status readings.
#
# The block files are read `n_blocks_per_read` at a time. Each span is reduced
# to partial (sum, size) aggregates, and the partial aggregates are merged once
# at the end, so a (station, day) pair whose readings are split across two
# spans still ends up as a single row.
n_blocks_per_read = 100
status_keys = ["station_id", "date"]

# One partially aggregated frame per span. They are concatenated once after the
# loop instead of growing `status` on every iteration.
parts = []
n_partial_rows = 0
for span_begin, span_end in pairwise(range(0, len(status_blocks) + n_blocks_per_read, n_blocks_per_read)):
    print(f"Concatenating blocks {span_begin:4}-{span_end:4} | Total rows: {n_partial_rows:>6}", end="\r")
    tmp = pd.concat(
        [
            # The first file "xaa" has a header, we can just skip it since we already know it
            pd.read_csv(status_file(f), names=columns, skiprows=1 if f == "xaa" else 0)
            for f in status_blocks[span_begin:span_end]
        ],
        axis=0,
        ignore_index=True,
    )

    tmp["date"] = tmp["time"].map(lambda v: v.split(" ")[0])
    tmp = (tmp
        .drop("time", axis=1)
        .groupby(status_keys)
        # We have to aggregate each part with the sum
        # (and number of values of that sum) because each span
        # may not have all the data for each day-station pair,
        # so we can't just do the mean yet
        .agg(["sum", "size"])
        .reset_index()
    )
    # Flatten the two-level labels: ("bikes_available", "sum") becomes
    # "bikes_available_sum", and the key labels ("station_id", "") and
    # ("date", "") become "station_id" and "date".
    tmp.columns = ["_".join(level for level in label if level) for label in tmp.columns]

    parts.append(tmp)
    n_partial_rows += tmp.shape[0]

if parts:
    # Merge the partial aggregates: sums add up and sizes add up, which leaves
    # exactly one row per (station_id, date), sorted by those keys.
    status = (
        pd.concat(parts, axis=0, ignore_index=True)
        .groupby(status_keys, as_index=False)
        .sum()
    )
else:
    # No block files were found: keep an empty frame with the expected columns.
    status = pd.DataFrame([], columns=["station_id", "date", "bikes_available_sum", "bikes_available_size", "docks_available_sum", "docks_available_size"])

Concatenating blocks 7100-7200 | Total rows:  49774

In [24]:
# Replace each metric's (sum, size) pair with its average, in place.
# `pop` removes the source columns as they are consumed, so afterwards only the
# key columns and the two "*_avg" columns remain.
for metric in ("bikes_available", "docks_available"):
    status[f"{metric}_avg"] = status.pop(f"{metric}_sum") / status.pop(f"{metric}_size")

In [29]:
# Save the reduced status table, without the row index.
status_small_path = "data/status_small.csv"
status.to_csv(status_small_path, index=False)

## Reading the other data

In [11]:
# Load the three remaining tables with pandas' default CSV settings.
station, weather, trip = map(
    pd.read_csv,
    ("data/station.csv", "data/weather.csv", "data/trip.csv"),
)

In [3]:
# Distinct zip codes in the weather table, in order of first appearance.
weather["zip_code"].unique()

array([94107, 94063, 94301, 94041, 95113])

In [4]:
# Zip code -> city name, one entry per zip code found in weather["zip_code"].
city_zipcode_map = dict([
    (94107, "San Francisco"),
    (94063, "Redwood City"),
    (94301, "Palo Alto"),
    (94041, "Mountain View"),
    (95113, "San Jose"),
])

In [5]:
# Number of distinct non-null zip codes in the trip table.
trip_zip_codes = trip["zip_code"].drop_duplicates()
trip_zip_codes.count()

7439

In [6]:
# (number of rows, number of columns) of the station table.
station.shape

(70, 7)

In [7]:
# Distinct city names in the station table, in order of first appearance.
station["city"].unique()

array(['San Jose', 'Redwood City', 'Mountain View', 'Palo Alto',
       'San Francisco'], dtype=object)

In [8]:
# Distinct values of the weather "events" column, missing values included.
# NOTE(review): the recorded output lists both 'Rain' and 'rain'; these look
# like the same event spelled two ways. Confirm before normalizing the case.
weather["events"].unique()

array([nan, 'Fog', 'Rain', 'Fog-Rain', 'rain', 'Rain-Thunderstorm'],
      dtype=object)