# 抖音用户浏览行为数据分析挖掘与可视化

数据集记录了抖音用户浏览视频的行为，包括浏览记录、用户相关信息、行为描述等。

## 1.数据预处理

In [1]:
import pandas as pd

In [2]:
# Load the raw Douyin browsing log from the working directory.
DATASET_PATH = r'douyin_dataset.txt'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,uid,user_city,item_id,author_id,item_city,channel,finish,like,music_id,duration_time,real_time,H,date
0,3,15692,109.0,691661,18212,213.0,0,0,0,11513.0,10,2019-10-28 21:55:10,21,2019-10-28
1,5,44071,80.0,1243212,34500,68.0,0,0,0,1274.0,9,2019-10-21 22:27:03,22,2019-10-21
2,16,10902,202.0,3845855,634066,113.0,0,0,0,762.0,10,2019-10-26 00:38:51,0,2019-10-26
3,19,25300,21.0,3929579,214923,330.0,0,0,0,2332.0,15,2019-10-25 20:36:25,20,2019-10-25
4,24,3656,138.0,2572269,182680,80.0,0,0,0,238.0,9,2019-10-21 20:46:29,20,2019-10-21


In [4]:
# (rows, columns) of the raw table.
df.shape

(1737357, 14)

In [5]:
# List the column names of the raw table.
df.columns

Index(['Unnamed: 0', 'uid', 'user_city', 'item_id', 'author_id', 'item_city',
       'channel', 'finish', 'like', 'music_id', 'duration_time', 'real_time',
       'H', 'date'],
      dtype='object')

#### 字段解释

用户信息：uid(用户id),user_city(用户城市)

作品信息：item_id(作品id),item_city(作品发布城市，即作者所在城市),channel(作品频道),music_id(音乐id),duration_time(作品时长),real_time(具体发布时间),H、date(发布的小时、日期)

作者信息：author_id

浏览行为描述：finish(是否看完),like(是否点赞)

In [6]:
# 'Unnamed: 0' is not a meaningful field, so remove it in place.
df.drop(columns="Unnamed: 0", inplace=True)

In [7]:
# Re-list the columns to confirm 'Unnamed: 0' is gone.
df.columns

Index(['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
       'finish', 'like', 'music_id', 'duration_time', 'real_time', 'H',
       'date'],
      dtype='object')

In [8]:
# Count missing values in each column.
df.isna().sum()

uid              0
user_city        0
item_id          0
author_id        0
item_city        0
channel          0
finish           0
like             0
music_id         0
duration_time    0
real_time        0
H                0
date             0
dtype: int64

In [9]:
# No missing values were found above.
# Print the overall structure of the table (columns, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1737357 entries, 0 to 1737356
Data columns (total 13 columns):
 #   Column         Dtype  
---  ------         -----  
 0   uid            int64  
 1   user_city      float64
 2   item_id        int64  
 3   author_id      int64  
 4   item_city      float64
 5   channel        int64  
 6   finish         int64  
 7   like           int64  
 8   music_id       float64
 9   duration_time  int64  
 10  real_time      object 
 11  H              int64  
 12  date           object 
dtypes: float64(3), int64(8), object(2)
memory usage: 172.3+ MB


In [10]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

45

In [11]:
# Remove duplicated rows, keeping the first occurrence of each,
# then re-count to confirm that none remain.
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

0

## 2.特征指标的构建

### 用户指标分析：

浏览量、点赞量、完整观看数量、观看作品数、观看作者数、观看作品的平均时长、观看配乐数、去过的城市数、观看作品城市数

### 作者指标分析：

创作活跃度（日）、去过的城市数量、发布作品的日期数、使用配乐数量、总浏览量、发布作品的点赞数、总观完量、总作品数、作品的平均时长

### 作品指标分析：

点赞量、浏览量、背景音乐、发布城市

## 3.特征指标统计分析

### 3.1用户指标统计分析

In [12]:
# Create an empty frame that will collect the per-user features.
user_df=pd.DataFrame()

In [13]:
# One row per distinct user id.
user_df['uid'] = df['uid'].unique()
# Use uid as the index (so later per-user aggregates align on it)
# and sort the table by that index in ascending order.
user_df = user_df.set_index('uid').sort_index(ascending=True)

In [14]:
# Group the browsing log once per user; every aggregate below is indexed by
# uid and therefore aligns with user_df's index on assignment.
by_user = df.groupby('uid')

# Insertion order of this mapping is the column order of user_df.
user_metrics = {
    # Views: number of log rows for the user
    '浏览量': by_user['like'].count(),
    # Likes: sum of the 0/1 like flag
    '点赞量': by_user['like'].sum(),
    # Distinct authors watched
    '观看作者数': by_user['author_id'].nunique(),
    # Distinct works watched
    '观看作品数': by_user['item_id'].nunique(),
    # Mean duration of the works watched
    '观看作品平均时长': by_user['duration_time'].mean(),
    # Distinct background-music ids encountered
    '观看配乐数': by_user['music_id'].nunique(),
    # Works watched to the end: sum of the 0/1 finish flag
    '完整观看数': by_user['finish'].sum(),
    # Distinct cities the user appeared in
    '去过的城市数': by_user['user_city'].nunique(),
    # Distinct cities of the works watched
    '观看作品所在的城市数': by_user['item_city'].nunique(),
}
for column_name, per_user_values in user_metrics.items():
    user_df[column_name] = per_user_values

In [15]:
# Summary statistics of the per-user features.
user_df.describe()

Unnamed: 0,浏览量,点赞量,观看作者数,观看作品数,观看作品平均时长,观看配乐数,完整观看数,去过的城市数,观看作品所在的城市数
count,59232.0,59232.0,59232.0,59232.0,59232.0,59232.0,59232.0,59232.0,59232.0
mean,29.330632,0.283175,28.337942,29.330311,11.331897,25.611392,11.759995,1.155372,20.32268
std,49.589417,2.24018,46.642395,49.588854,3.301528,38.459314,16.970857,0.529922,24.595297
min,1.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0
25%,4.0,0.0,4.0,4.0,9.705882,4.0,2.0,1.0,4.0
50%,12.0,0.0,12.0,12.0,10.833333,12.0,6.0,1.0,11.0
75%,34.0,0.0,33.0,34.0,12.088944,31.0,15.0,1.0,28.0
max,1951.0,183.0,1740.0,1951.0,42.0,1197.0,284.0,10.0,279.0


In [16]:
# Preview the first rows of the per-user feature table.
user_df.head()

Unnamed: 0_level_0,浏览量,点赞量,观看作者数,观看作品数,观看作品平均时长,观看配乐数,完整观看数,去过的城市数,观看作品所在的城市数
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,34,0,31,34,12.058824,31,18,1,28
1,28,1,28,28,12.357143,26,14,1,25
2,56,0,56,56,10.357143,47,19,3,45
3,117,1,116,117,9.982906,89,60,1,76
4,123,0,117,123,10.853659,94,77,1,84


In [17]:
# Persist the per-user features so later processing/visualisation steps can
# read them back; 'utf_8_sig' is UTF-8 with a leading BOM.
user_df.to_csv(
    r"用户特征.csv",
    encoding='utf_8_sig',
)

### 3.2作者特征统计分析

In [23]:
# Start an empty per-author feature table.
author_df = pd.DataFrame()
# One row per distinct author id, used as the sorted index.
author_df['author_id'] = df['author_id'].unique()
author_df = author_df.set_index('author_id').sort_index()

In [37]:
# Parse both time columns up front. `date` is loaded as a string column, and
# subtracting strings raises TypeError, so it must be a datetime before the
# activity span below can be computed. Converting an already-datetime column
# is a no-op, so re-running this cell is safe.
df['real_time'] = pd.to_datetime(df['real_time'])
df['date'] = pd.to_datetime(df['date'])

# Group once per author; every aggregate below is indexed by author_id and
# therefore aligns with author_df's index on assignment.
by_author = df.groupby('author_id')

# Total works: distinct item ids
author_df['总作品数'] = by_author['item_id'].nunique()
# Total views: number of log rows for the author's works
author_df['总浏览量'] = by_author['like'].count()
# Total likes: sum of the 0/1 like flag
author_df['总点赞量'] = by_author['like'].sum()
# Total complete views: sum of the 0/1 finish flag
author_df['总观完量'] = by_author['finish'].sum()
# Distinct background-music ids used
author_df['使用的配乐数量'] = by_author['music_id'].nunique()
# Distinct cities the author's works were posted from
author_df['去过的城市数'] = by_author['item_city'].nunique()
# Number of publishing dates.
# NOTE(review): this counts distinct `real_time` timestamps, not distinct
# calendar days; use the `date` column instead if day-level counts are meant.
author_df['发布作品的日期数'] = by_author['real_time'].nunique()
# Mean duration of the author's works
author_df['作品的平均时长'] = by_author['duration_time'].mean()
# Creative activity span in days: last date minus first date, inclusive.
# Computed on the whole grouped result instead of a per-group Python lambda.
first_day = by_author['date'].min()
last_day = by_author['date'].max()
author_df['创造活跃度(日)'] = (last_day - first_day).dt.days + 1

In [38]:
# Display the per-author feature table.
author_df

Unnamed: 0_level_0,总作品数,总浏览量,总点赞量,总观完量,使用的配乐数量,去过的城市数,发布作品的日期数,作品的平均时长,创造活跃度(日)
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,1,0,0,1,1,1,10.000000,1
1,3,16,0,8,3,1,3,8.875000,9
3,1,311,3,203,1,1,1,9.000000,1
5,5,1054,33,485,4,1,5,7.040797,28
8,1,4,0,3,1,1,1,19.000000,1
...,...,...,...,...,...,...,...,...,...
850274,1,1,0,0,1,1,1,10.000000,1
850276,1,1,0,0,1,1,1,9.000000,1
850279,1,1,0,0,1,1,1,10.000000,1
850287,1,1,0,0,1,1,1,9.000000,1


In [39]:
# Print the overall structure of the per-author table (columns, non-null counts, dtypes).
author_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 208187 entries, 0 to 850307
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   总作品数      208187 non-null  int64  
 1   总浏览量      208187 non-null  int64  
 2   总点赞量      208187 non-null  int64  
 3   总观完量      208187 non-null  int64  
 4   使用的配乐数量   208187 non-null  int64  
 5   去过的城市数    208187 non-null  int64  
 6   发布作品的日期数  208187 non-null  int64  
 7   作品的平均时长   208187 non-null  float64
 8   创造活跃度(日)  208187 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 15.9 MB


In [40]:
# Summary statistics of the per-author features.
author_df.describe()

Unnamed: 0,总作品数,总浏览量,总点赞量,总观完量,使用的配乐数量,去过的城市数,发布作品的日期数,作品的平均时长,创造活跃度(日)
count,208187.0,208187.0,208187.0,208187.0,208187.0,208187.0,208187.0,208187.0,208187.0
mean,2.158992,8.344959,0.080567,3.345877,1.942475,1.044787,2.15891,10.673132,5.176279
std,3.504402,38.481736,0.536451,16.886125,2.694698,0.28556,3.504087,5.795376,8.432662
min,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
25%,1.0,1.0,0.0,0.0,1.0,1.0,1.0,9.0,1.0
50%,1.0,1.0,0.0,0.0,1.0,1.0,1.0,9.727273,1.0
75%,2.0,4.0,0.0,2.0,2.0,1.0,2.0,10.0,4.0
max,152.0,2648.0,36.0,1111.0,98.0,19.0,152.0,620.0,40.0


In [41]:
# Persist the per-author features; 'utf_8_sig' is UTF-8 with a leading BOM.
author_df.to_csv(
    '作者特征.csv',
    encoding='utf_8_sig',
)

### 3.3作品特征统计分析

点赞量、浏览量、背景音乐、发布城市

In [42]:
# Start an empty per-work feature table.
item_df = pd.DataFrame()
# One row per distinct work id, used as the sorted index.
item_df['item_id'] = df['item_id'].unique()
item_df = item_df.set_index('item_id').sort_index()

In [43]:
# Compute all per-work aggregates in one pass. Each entry maps an output
# column to (source column, aggregation); the mapping order is the column
# order of item_df.
item_stats = df.groupby('item_id').agg(**{
    # Views: number of log rows for the work
    '浏览量': ('like', 'count'),
    # Likes: sum of the 0/1 like flag
    '点赞量': ('like', 'sum'),
    # Background music: first music id recorded for the work
    '背景音乐': ('music_id', 'first'),
    # Publishing city: first item_city recorded for the work
    '发布城市': ('item_city', 'first'),
})
# Assign column by column so the values align on item_df's item_id index.
for column_name in item_stats.columns:
    item_df[column_name] = item_stats[column_name]

In [44]:
# Display the per-work feature table.
item_df

Unnamed: 0_level_0,浏览量,点赞量,背景音乐,发布城市
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,24,0,220.0,24.0
1,1309,5,574.0,63.0
3,2,0,26289.0,7.0
4,613,3,162.0,146.0
7,2,0,540.0,33.0
...,...,...,...,...
4122649,1,0,298.0,70.0
4122653,1,0,8.0,182.0
4122669,1,0,33.0,137.0
4122677,1,0,10456.0,246.0


In [47]:
# Summary statistics of the per-work features.
item_df.describe()

Unnamed: 0,浏览量,点赞量,背景音乐,发布城市
count,449472.0,449472.0,449472.0,449472.0
mean,3.865229,0.037317,6113.790414,92.539524
std,17.718806,0.289749,12329.790826,81.476422
min,1.0,0.0,1.0,0.0
25%,1.0,0.0,220.0,29.0
50%,1.0,0.0,1190.0,69.0
75%,2.0,0.0,5544.0,139.0
max,1506.0,35.0,89776.0,460.0


In [48]:
# Persist the per-work features; 'utf_8_sig' is UTF-8 with a leading BOM.
item_df.to_csv(
    '作品特征.csv',
    encoding='utf_8_sig',
)