In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os 
%matplotlib inline
plt.style.use('ggplot')  #使用该风格
plt.rcParams['axes.unicode_minus']=False #正常显示负号
plt.rcParams['font.family'] = ['Arial Unicode MS'] #正常显示中文
os.chdir('/Users/lyn/Library/Mobile Documents/com~apple~CloudDocs/Documents/jupyter/')  #定义相对路径

In [4]:
#读取数据
mo=pd.read_csv('mobike.csv')
mo.info()
mo.isnull().sum() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6427 entries, 0 to 6426
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6427 non-null   int64  
 1   user_id            6427 non-null   int64  
 2   start_time         6427 non-null   object 
 3   end_time           6427 non-null   object 
 4   timeduration       6427 non-null   int64  
 5   bikeid             6427 non-null   int64  
 6   tripduration       6427 non-null   int64  
 7   from_station_id    6427 non-null   int64  
 8   from_station_name  6427 non-null   object 
 9   to_station_id      6427 non-null   int64  
 10  to_station_name    6427 non-null   object 
 11  usertype           6427 non-null   object 
 12  gender             5938 non-null   object 
 13  birthyear          5956 non-null   float64
 14  age                6427 non-null   object 
dtypes: float64(1), int64(7), object(7)
memory usage: 753.3+ KB


Unnamed: 0             0
user_id                0
start_time             0
end_time               0
timeduration           0
bikeid                 0
tripduration           0
from_station_id        0
from_station_name      0
to_station_id          0
to_station_name        0
usertype               0
gender               489
birthyear            471
age                    0
dtype: int64

- gender和birthday有少量空值，需drop；
- birthyear跟age重复，可删除birthyear数据；age为类别性变量，需改为int；
- start_time、end time为类别型变量，需更改为时间变量；
- 多余变量Unnamed,user_id,bikeid,from_station_id,to_station_id可删除；

In [111]:
mo1=mo.dropna().copy()     #去掉空值
mo1=mo1.drop(['Unnamed: 0','user_id','bikeid','from_station_id',
              'to_station_id','from_station_id','to_station_id','from_station_name','to_station_name','birthyear'],axis=1)
#mo1.loc[:,'age']=mo1['age'].astype('int')    #改类型为int
mo1['age']=mo1['age'].astype('int') 
mo1.loc[:,'start_time']=pd.to_datetime(mo1['start_time'])  #改类型为datetime64
mo1.loc[:,'end_time']=pd.to_datetime(mo1['end_time'])        #改类型为datetime64
mo1['start_hour']=mo1['start_time'].apply(lambda x:x.hour-0)  #新增每日开始骑行的小时数
mo1=pd.get_dummies(mo1)    #哑变量转换
mo1.head()


Unnamed: 0,start_time,end_time,timeduration,tripduration,age,start_hour,usertype_Customer,usertype_Subscriber,gender_Female,gender_Male
0,2018-11-14 07:37:00,2018-11-14 07:44:00,7,436,37,7,0,1,0,1
1,2018-12-18 19:02:00,2018-12-18 19:10:00,7,445,31,19,0,1,0,1
2,2018-10-09 12:37:00,2018-10-09 12:55:00,18,1090,30,12,1,0,0,1
3,2018-11-12 12:30:00,2018-11-12 12:40:00,9,581,30,12,0,1,1,0
4,2018-11-07 07:29:00,2018-11-07 07:35:00,6,390,40,7,0,1,0,1


In [112]:
mo2=mo1.drop(['start_time','end_time','gender_Male','usertype_Customer'],axis=1) #去掉多余信息
mo2.head()

Unnamed: 0,timeduration,tripduration,age,start_hour,usertype_Subscriber,gender_Female
0,7,436,37,7,1,0
1,7,445,31,19,1,0
2,18,1090,30,12,0,0
3,9,581,30,12,1,1
4,6,390,40,7,1,0


In [134]:
#开始聚类分析模型
from sklearn import cluster
from sklearn import preprocessing
mo3=pd.DataFrame(preprocessing.scale(mo2))     #数据标准化
model=cluster.KMeans(n_clusters=4,random_state=1)  #分成4个群，设置随机数种子1
model.fit(mo3)  
#评估模型
from sklearn import metrics
mo_cluster=model.fit_predict(mo2)    #生成输出集
score=metrics.silhouette_score(mo2,mo_cluster)# 计算轮廓系数
print(score)  #聚类效果很好；

0.9351433162052254


In [136]:
#调整K簇优化模型
from sklearn import cluster
from sklearn import preprocessing
mo3=pd.DataFrame(preprocessing.scale(mo2))     #数据标准化
model=cluster.KMeans(n_clusters=3,random_state=1)  #分成3个群，设置随机数种子1
model.fit(mo3)  
mo2['cluster']=model.labels_  #添加分群标签
from sklearn import metrics
mo_cluster=model.fit_predict(mo2)    #生成输出集
score=metrics.silhouette_score(mo2,mo_cluster)# 计算轮廓系数
print(score)  #聚类效果更好，则取此模型；

0.983432351580415


In [137]:
#数据透视表，age取平均，gender_Female取count用来计算人数，start_hour取平均，usertype_Subscriber取sum计算订阅者人数,timeduration去平均；
mo2.pivot_table(index=['cluster','gender_Female'],
                values=['age','start_hour','usertype_Subscriber','gender_Female','timeduration','tripduration'],
                aggfunc={'age':'mean','start_hour':'mean','usertype_Subscriber':'sum',
                         'gender_Female':'count','timeduration':'mean','tripduration':'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,age,gender_Female,start_hour,timeduration,tripduration,usertype_Subscriber
cluster,gender_Female,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,33.196078,153,13.555556,20.03268,3655.601307,2.0
0,1,35.464286,56,13.214286,21.625,2359.142857,0.0
1,0,37.054639,4484,13.146744,10.246655,682.916369,4484.0
2,1,34.93253,1245,13.376707,11.204016,761.816867,1245.0


- 性别上，0群体均为男性，1群体均为女性，2群体男女皆有且男性人数大于女性人数
- 订阅上，0和1群体均为订阅者，2群体几乎没有订阅者；
- 年龄上，0群体年龄比1和2均高；
- 骑行时长和距离上，0和1的骑行时长和骑行距离均显著小于2，说明订阅用户多用于短距高频出行；
- 出行时刻上，3个群体start_hour基本一致；

## 总结
- 用户群0：单次骑行时长和距离均较小的男性订阅用户，年龄相对较大；
- 用户群1：单次骑行时长和距离均较小的女性订阅用户，年龄较1群体大；
- 用户群2：单次单次骑行时长和距离都较长的非订阅用户，年龄较0群体小，与1群体接近；
- 对于用户群2，低频长距用户，次数包比时间包对他们来说更具吸引力；
- 0群体和1群体都是短途高频用户，包月对于他们来说更优惠，可推出更长周期套餐和其他本地生活类APP共同会员提升用户粘性；