In [None]:
import requests
import json
import pandas as pd

### 数据收集

In [None]:
# Collect the Twitter archive. tweet_id is read as a string so it can be
# matched against the string IDs ('id_str') of the other sources when merging.
arc = pd.read_csv('twitter-archive-enhanced.csv', dtype={'tweet_id': str})

# Collect the neural-network image prediction file.
url = 'https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv'

# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
print('Status code:', r.status_code)
# Stop here on an HTTP error instead of saving an error page as the TSV file.
r.raise_for_status()

with open('image-predictions.tsv', 'wb') as file:
    file.write(r.content)

ima = pd.read_csv('image-predictions.tsv', sep='\t', dtype={'tweet_id': str})

# Collect the Twitter API data: the file holds one JSON object per line.
cols = ['tweetID', 'retweet_count', 'favorite_count']
tweet = []
# Explicit encoding so the result does not depend on the platform default.
with open('tweet_json.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # A blank line (e.g. a trailing newline) is not a JSON document.
        if not line.strip():
            continue
        t = json.loads(line)
        tweet.append({'tweetID': t['id_str'],
                      'retweet_count': t['retweet_count'],
                      'favorite_count': t['favorite_count']})

# Build the frame with a fixed column order (also valid when the file is empty).
twe = pd.DataFrame(tweet, columns=cols)

### 数据评估

In [None]:
# Visual assessment of the three datasets: display the Twitter archive frame.
arc 

In [None]:
# Visual assessment: display the image-prediction frame.
ima

In [None]:
# Visual assessment: display the retweet / favorite count frame.
twe

In [None]:
# Programmatic assessment of the three datasets: column dtypes and non-null counts of the archive.
arc.info()

In [None]:
# Frequency of each rating denominator in the archive.
arc.rating_denominator.value_counts()

In [None]:
# Frequency of each rating numerator in the archive.
arc.rating_numerator.value_counts()

In [None]:
# Column dtypes and non-null counts of the retweet / favorite count frame.
twe.info()

In [None]:
# Show one randomly chosen row of the retweet / favorite count frame.
twe.sample()

In [None]:
# Column dtypes and non-null counts of the image-prediction frame.
ima.info()

### 数据清洗

In [None]:
# Clean on copies so the frames loaded above stay available unchanged.
arc_clean, ima_clean, twe_clean = (frame.copy() for frame in (arc, ima, twe))

#### 缺失数据
in_reply_to_status_id、in_reply_to_user_id、retweeted_status_id、retweeted_status_timestamp、expanded_urls、doggo、floofer、pupper、puppo 列：这些列大部分为空值，或在后续分析中不会用到。

##### 定义
删除这些列

##### 代码

In [None]:
# Remove the columns that are mostly empty.
arc_clean = arc_clean.drop(
    columns=[
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'retweeted_status_id',
        'retweeted_status_timestamp',
        'expanded_urls',
    ]
)

In [None]:
# Remove the four dog-stage columns as well.
arc_clean = arc_clean.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

#### 规范数据

In [None]:
# Standardise the denominator: keep only the rows rated out of 10.
arc_clean = arc_clean.loc[arc_clean.rating_denominator.eq(10)]

##### 定义

根据项目动机，只保留原始评级

##### 代码

In [None]:
# Keep only original tweets, i.e. rows that are not retweets.
# A row is an original tweet when retweeted_status_user_id is missing.
# Filtering on isna() directly replaces the former
# `fillna('1', inplace=True)` + `== '1'` sequence: that in-place call ran on a
# column taken from a filtered frame, which pandas warns about and which has no
# effect under Copy-on-Write (leaving the subsequent filter empty).
arc_clean = arc_clean[arc_clean.retweeted_status_user_id.isna()]

# The helper column carries no information once the retweets are gone.
arc_clean = arc_clean.drop(columns=['retweeted_status_user_id'])

In [None]:
# Keep the rows whose numerator lies in the range 0 to 14 (both ends included).
arc_clean = arc_clean[arc_clean.rating_numerator.between(0, 14)]

##### 测试

In [None]:
# Check the result: frequency of each remaining rating numerator.
arc_clean.rating_numerator.value_counts()

#### 整洁度问题

##### 定义
把图像预测中识别为狗的行（含狗的种类）merge 到 twitter-archive-enhanced.csv 的数据中，并把对应的转发数（retweet_count）和喜欢数（favorite_count）也 merge 到 twitter-archive-enhanced.csv 的数据中。

##### 代码

In [None]:
# Inspect the tweet_id column of the archive to check its data type.
arc_clean.tweet_id

In [None]:
# Keep only the rows whose top prediction (p1) is a dog.
ima_clean = ima_clean.loc[ima_clean.p1_dog.eq(True)]

In [None]:
# Drop the p1 dog flag and everything about the second and third predictions.
ima_clean = ima_clean.drop(
    columns=['p1_dog', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog']
)

##### 测试

In [None]:
# Check the result: display the cleaned image-prediction frame.
ima_clean

In [None]:
# Combine the archive with the tweet counts, then with the image predictions.
# The key is named tweet_id in the archive and tweetID in the counts frame.
tmp = arc_clean.merge(twe_clean, how="inner", left_on='tweet_id', right_on='tweetID')
dogs = tmp.merge(ima_clean, on='tweet_id')

#### 为可视化做准备

In [None]:
# Rename the tweetID column to total_count.
dogs = dogs.rename(columns={'tweetID': 'total_count'})

In [None]:
# Prepare for ranking: total_count = numerator + retweets + favorites.
dogs['total_count'] = (
    dogs.rating_numerator
    .add(dogs.retweet_count)
    .add(dogs.favorite_count)
)

In [None]:
# Rank the dogs so the most popular ones (highest total_count) come first.
dogs = dogs.sort_values(by='total_count', ascending=False)

In [None]:
# The last ten rows of the ranking are the least popular dogs.
ddogs = dogs.iloc[-10:]

In [None]:
# Count how often each predicted breed (p1) occurs, to find the most uploaded ones.
total_dogs = dogs['p1'].value_counts()

In [None]:
# Save the merged data as twitter_archive_master.csv (row index not written).
# NOTE(review): this runs before the 'None' -> 'No Name' replacement in a later
# cell, so the saved file still contains 'None' names - confirm this is intended.
dogs.to_csv('twitter_archive_master.csv', index=False)

In [None]:
# Replace the placeholder 'None' used for missing names with 'No Name'.
# The result is assigned back to the column: the former chained
# `dogs.name.replace(..., inplace=True)` acts on a temporary Series, which
# pandas warns about and which leaves `dogs` unchanged under Copy-on-Write.
dogs['name'] = dogs['name'].replace({'None': 'No Name'})

In [None]:
# Display the merged and ranked frame.
dogs

In [None]:
# Keep only the first 20 rows of the ranking.
dogs = dogs.iloc[:20]

### 数据可视化

In [None]:
# Interactive visualisation with the pygal library.
import pygal
from pygal.style import LightColorizedStyle as LCS, LightenStyle as LS

my_style = LS('#4486F7', base_style=LCS)
chart1 = pygal.Bar(style=my_style, x_label_rotation=45, show_legend=False)

names = []
plot_dicts = []
for row_label, row in dogs.iterrows():
    # The dog's name becomes the x-axis label of its bar.
    names.append(row['name'])
    # Each bar carries its value, a label (the predicted breed) and a
    # clickable link (the image URL).
    plot_dicts.append({
        'value': row['total_count'],
        'label': row['p1'],
        'xlink': row['jpg_url'],
    })

chart1.title = ('Most popular dogs in WeRateDogs ')
chart1.x_labels = names
chart1.add('', plot_dicts)
chart1.render_to_file('popular_dogs.svg')

![popular_dogs](popular_dogs.svg)

结论：最受欢迎的狗狗种类从高到低前5位为拉布拉多犬、湖畔㹴、吉娃娃、史宾格犬、法国斗牛犬。

In [None]:
# Visualise the ten breeds that users uploaded most often.
chart2 = pygal.Bar(style=my_style, x_label_rotation=45, show_legend=False)

# total_dogs comes from value_counts(), which sorts by descending frequency,
# so the first ten entries are the most common breeds.
top_breeds = total_dogs.iloc[:10]

# Take the x-axis labels from the index rather than from a hand-typed list,
# so every bar is guaranteed to be labelled with the breed it counts.
kind = top_breeds.index.tolist()
dog_dicts = [{'value': int(count)} for count in top_breeds]

chart2.title = ('Most kind dogs by user uploaded')
chart2.x_labels = kind
chart2.add('', dog_dicts)
chart2.render_to_file('most_kind_dogs.svg')

![most_kind_dogs](most_kind_dogs.svg)

结论：在上传图片用户中拥有最多数量的狗狗种类前五位分别为：金毛犬、拉布拉多犬、潘布鲁克威尔士柯基犬、吉娃娃、松狮犬。

In [None]:
# Visualise the ten breeds that users uploaded least often.
chart3 = pygal.Bar(style=my_style, x_label_rotation=45, show_legend=False)

# total_dogs comes from value_counts(), which sorts by descending frequency,
# so the last ten entries are the rarest breeds.
rare_breeds = total_dogs.iloc[-10:]

# Take the x-axis labels from the index rather than from a hand-typed list:
# the order among equally rare breeds is not fixed, and the typed list
# contained a misspelt breed name.
kinds = rare_breeds.index.tolist()
ddog_dicts = [{'value': int(count)} for count in rare_breeds]

chart3.title = ('Minimal kind dogs by user uploaded')
chart3.x_labels = kinds
chart3.add('', ddog_dicts)
chart3.render_to_file('minimal_kind_dogs.svg')

![minimal_kind_dogs](minimal_kind_dogs.svg)

结论：拥有最少数量的狗狗种类如下：澳洲丝毛㹴、黑褐色猎浣熊犬、伊比莎猎犬、格罗安达犬、恩特雷布赫山地犬、威尔士史宾格犬、标准雪纳瑞、卷毛寻回猎犬、西藏㹴、阿彭策尔牧牛犬。