In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta 
import json
from geopy.distance import geodesic
from itertools import combinations
import mlrose
import time
from sklearn.cluster import KMeans
import math



In [2]:
#load clustered data
input_dir = '../3-output_data_cluster/data/'
# input data file here
# ex : 201808_data_cluster_4.csv
month = '201808' #input year & month here ex: 201808
input_file_name = input_dir+month+ '_data_cluster_4.csv'#input filename here ex: _data_cluster_4.csv
data = pd.read_csv(input_file_name, sep = ",")
len(data)

414

In [3]:
#load station_status.json for inventory at each station
file = '../1-raw_data/2018/station_status.json'
with open(file) as json_file:
    jdata = json.load(json_file)
file2= '../1-raw_data/2018/station_information.json'
with open(file2) as json_file2:
    jdata2 = json.load(json_file2) 

In [4]:
#record timestamp
timestamp1=jdata['last_updated']
timestamp2=jdata2['last_updated']
print("timestamp:",timestamp1,timestamp2,"; station_status, station_info ready")

timestamp: 1574476618 1574481682 ; station_status, station_info ready


In [5]:
#change dict to dataframe for json file and subsetting
station_data_json=jdata['data']['stations']
station_data=pd.DataFrame.from_dict(station_data_json, orient='columns')
station_data=station_data[['station_id','num_bikes_available','num_docks_available']]

#change dict to dataframe for json2 file and subsetting
station_info_json=jdata2['data']['stations']
station_info=pd.DataFrame.from_dict(station_info_json, orient='columns')
station_info=station_info[['station_id','short_name']]

#merge to retain full station_data
merged_data=pd.merge(station_data,station_info,how='outer',on='station_id')
merged_data.rename(columns={'short_name':'Station'},inplace=True)
merged_data['Station'] = merged_data['Station'].astype(int)
print("check length:",len(station_data),len(station_info),"==>",len(merged_data))
merged_data.head(2)

check length: 574 574 ==> 574


Unnamed: 0,station_id,num_bikes_available,num_docks_available,Station
0,72,1,18,31215
1,91,0,23,31227


In [6]:
optim_data=pd.merge(merged_data,data,how='right',on='Station')
print("check length:",len(data),len(merged_data),"==>",len(optim_data))

check length: 414 574 ==> 414


In [7]:
# Divide data into cluster and filter
cluster_data= {}
for i in np.unique(optim_data['Cluster Label']):
    cluster_data[i] = optim_data[optim_data['Cluster Label'] == i]
evaluation_data= cluster_data.copy()

In [8]:
# filter balanced stations
cluster_data[0]
for i in cluster_data:
#     rent_imbalance = cluster_data[i][cluster_data[i]['num_bikes_available'] <= 3] 
#     return_imbalance = cluster_data[i][cluster_data[i]['num_bikes_available'] >= cluster_data[i]['num_docks_available']]
    cluster_data[i] = cluster_data[i][(cluster_data[i]['num_docks_available'] <= 2) | (cluster_data[i]['num_bikes_available'] <= 2)] 
    print(len(cluster_data[i]))

21
1
52
9
46


In [9]:
# subclustering
for i in cluster_data:
    if len(cluster_data[i]) > 25: 
        k = cluster_data[i][['longitude', 'latitude']]
        kmeans = KMeans(n_clusters = math.ceil(len(cluster_data[i])/20), random_state= 2).fit(k)
        kmeans_label = kmeans.labels_
        cluster_data[i]['new_label'] = kmeans_label
        cluster_data[i]['Cluster Label'] = cluster_data[i]['Cluster Label'].astype(str)+'-' + cluster_data[i]['new_label'].astype(str)
        del cluster_data[i]['new_label']
        print(cluster_data[i])
#             print(cluster_data[i][cluster_data[i]['new_label']== j])

        #         for j in np.unique(cluster_data[i]['new_label']):
#             if len(cluster_data[i][cluster_data[i]['new_label']== j]) > 25:
#                 print(cluster_data[i][cluster_data[i]['new_label']==j])


    station_id  num_bikes_available  num_docks_available  Station  Unnamed: 0  \
3            1                    2                   13    31000           0   
25          24                   14                    1    31202         126   
26          25                   22                    1    31203         127   
29          28                    1                   13    31400         244   
33          32                    2                   12    31601         282   
41          43                   15                    0    31111         105   
45          47                   13                    0    31109         103   
56          59                   19                    0    31214         137   
62          65                    0                   11    31402         246   
68          73                    1                   10    31501         259   
83          88                   17                    0    31609         290   
88          94              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view

In [10]:
# for i in cluster_data:
#     print(len(np.unique(cluster_data[i]['Cluster Label'])))
print(len(cluster_data[0]))    

21


In [11]:
# distance matrix
for m in cluster_data:
    best_route = {}
    for uni in np.unique(cluster_data[m]['Cluster Label']):
        temp = cluster_data[m][cluster_data[m]['Cluster Label'] == uni].reset_index(drop=True).reset_index()
        k = list(zip(temp['longitude'], temp['latitude']))
        if len(k) > 3:
            distance = []
            for i in range(len(k)):
                for j in range(len(k)):
                    if i < j:
                        d = geodesic(k[i],k[j]).miles
                        distance.append((i,j,d))
            fitness_dists= mlrose.TravellingSales(distances = distance)
            problem_fit = mlrose.TSPOpt(length = len(temp), fitness_fn= fitness_dists,maximize= False)
            best_state,best_fitness = mlrose.genetic_alg(problem_fit, random_state = 2,mutation_prob= 0.20, max_attempts=70)
            print(best_state)
            print(best_fitness)
            output_csv = temp.reindex(list(best_state))
            output_csv.to_csv(f'cluster{uni}.csv', index=False) 

[ 1  3  5 10  8  4  9 12 19 20 18 16 17  2  0 15 14 11  7  6 13]
70.26456499388892
[16 17  6  9  4 20 19  5 12  1  0 15 18 11  7 13  3 14 10  8  2]
19.30845921445249
[12 10  5  0  2 17 15  1  8 14  6 16 13  4  7 11  3  9]
22.80588080821402
[ 1  9  5 10  8  7 12  6 11  4  0  3  2]
15.3955350792274
[2 0 5 7 1 4 6 3 8]
36.596255832370744
[ 1  0  7  3  4  6 10  8  9  5  2]
18.478238599248545
[ 4  8  1 10  2  0  3  9  7  5  6]
6.3505127307582905
[ 6 13  1  0 19 12 20  8  4 10  7  5 23  2  9 15 22 11 21 18 14 17  3 16]
28.368825409950713


In [None]:
#evaluation 
# distance calculator         
for m in evaluation_data:
    best_route = {}
    for uni in np.unique(evaluation_data[m]['Cluster Label']):
        temp = evaluation_data[m][evaluation_data[m]['Cluster Label'] == uni].reset_index(drop=True).reset_index()
        k = list(zip(temp['longitude'], temp['latitude']))
        if len(k) > 3:
            distane = []
            for i in range(len(k)):
                for j in range(len(k)):
                    if i < j:
                        d = geodesic(k[i],k[j]).miles
                        distance.append((i,j,d))
            fitness_dists= mlrose.TravellingSales(distances = distance)
            problem_fit = mlrose.TSPOpt(length = len(temp), fitness_fn= fitness_dists,maximize= False)
            best_state,best_fitness = mlrose.genetic_alg(problem_fit, random_state = 2) 
            print(uni, best_state)
            print(best_fitness)