# Project: Wrangling and Analyze Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tweepy
import json
import matplotlib.pyplot as plt
import requests 

##### used above commands to import libraries

## Data Gathering

1. Directly downloading the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

In [2]:
df_enhanced = pd.read_csv('twitter-archive-enhanced.csv')
df_enhanced.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


2. Using Requests library to download the tweet image prediction (image_predictions.tsv)

In [3]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
   
response = requests.get(url)

with open('image_predictions.tsv', mode = 'wb') as file:
    file.write(response.content)
    

In [4]:
df_predict = pd.read_csv('image_predictions.tsv', sep = '\t')
df_predict.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


3. Using the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [5]:
# the tweet-json.txt was downloaded and uploaded to the notebook
# the file was then read with and imported with commands below
with open('tweet-json.txt') as f:
    df_api = pd.DataFrame((json.loads(line) for line in f), columns = ['id', 'favorite_count', 'retweet_count'])
    
df_api.columns = ['tweet_id','favorites','retweets']
df_api.set_index('tweet_id', inplace = True)
df_api.head()
    
    
    

Unnamed: 0_level_0,favorites,retweets
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
892420643555336193,39467,8853
892177421306343426,33819,6514
891815181378084864,25461,4328
891689557279858688,42908,8964
891327558926688256,41048,9774


## Assessing Data




In [6]:
df_enhanced.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


In [7]:
df_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

### There are some missing values

In [8]:
df_enhanced.isnull().sum().sum()

11140

### There are some wrong dog names in the dataset.

In [9]:
df_enhanced.name.value_counts()

None          745
a              55
Charlie        12
Oliver         11
Lucy           11
Cooper         11
Penny          10
Tucker         10
Lola           10
Winston         9
Bo              9
Sadie           8
the             8
an              7
Buddy           7
Bailey          7
Daisy           7
Toby            7
Koda            6
Oscar           6
Dave            6
Milo            6
Scout           6
Rusty           6
Bella           6
Jax             6
Jack            6
Leo             6
Stanley         6
George          5
             ... 
his             1
Rodney          1
Clifford        1
Ike             1
Stormy          1
Bloop           1
Tuco            1
Aubie           1
Banjo           1
Jennifur        1
Chef            1
Yukon           1
Pluto           1
Mason           1
such            1
Taz             1
officially      1
Al              1
Alejandro       1
Ronduh          1
Harnold         1
Roscoe          1
Cecil           1
Coopson         1
Cuddles   

### Timestamp has wrong datatype

In [10]:
df_enhanced['timestamp'].dtypes

dtype('O')

### Some dogs do not have any classification

In [11]:
df_dog = df_enhanced.copy()
df_dog.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

In [12]:
df_dog.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator','name'], axis = 1)

Unnamed: 0,tweet_id,doggo,floofer,pupper,puppo
0,892420643555336193,,,,
1,892177421306343426,,,,
2,891815181378084864,,,,
3,891689557279858688,,,,
4,891327558926688256,,,,
5,891087950875897856,,,,
6,890971913173991426,,,,
7,890729181411237888,,,,
8,890609185150312448,,,,
9,890240255349198849,doggo,,,


### The number of columns and rows as compared to the twitter archive dataset differs

In [13]:
df_predict.shape

(2075, 12)

### There are duplicate images

In [14]:
df_predict.jpg_url.duplicated().sum()

66

### The number of rows and columns as compared to the other two datasets differ

In [15]:
df_api.shape

(2354, 2)

### Some column names are confusing

In [16]:
df_predict.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2',
       'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'],
      dtype='object')

### There is more than one classification for a dog

In [17]:
df_dog.query('doggo == "doggo" & floofer == "floofer" & pupper == "None" & puppo == "None"')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
200,854010172552949760,,,2017-04-17 16:34:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...","At first I thought this was a shy doggo, but i...",,,,https://twitter.com/dog_rates/status/854010172...,11,10,,doggo,floofer,,


### Quality issues
1. There are duplicate images for some entries.

2. Some column names are confusing as they do not give much information about the content.

3. There is more than one classification for some dogs.

4. Some dogs have no classification.

5. The image prediction dataset has 2075 entries as compared to twitter archive's 2356 entries.

6. There are some missing values.

7. The number of data entries in the twitter API dataset differs from the other two datasets.

8. There are some wrong dog names. Eg. a dog being called 'a'

9. Timestamp has wrong datatype that is object instead of date.



### Tidiness issues
1. All the datasets should be merged together.

2. The columns that classify dogs should be added together for easier analysis

## Cleaning Data


In [18]:
# copies of original pieces of data
data = pd.merge(df_enhanced, df_predict, on = 'tweet_id', how = 'outer')
data.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,1.0,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False


In [19]:
data = pd.merge(data, df_api, on = 'tweet_id', how = 'outer')
data.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,favorites,retweets
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,0.097049,False,bagel,0.085851,False,banana,0.07611,False,39467.0,8853.0
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True,33819.0,6514.0


In [20]:
df = data.copy()
df.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,favorites,retweets
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,0.097049,False,bagel,0.085851,False,banana,0.07611,False,39467.0,8853.0
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True,33819.0,6514.0


### Issue #1:

There is more than one classification for some dogs.

#### Code

In [21]:
def find_dog_category(val):
    is_doggo = val[0]
    is_floofer = val[1]
    is_pupper = val[2]
    is_puppo = val[3]
    res = ''
    if is_doggo != 'None':
        res += 'doggo '
    if is_floofer != 'None':
        res += 'floofer '
    if is_pupper != 'None':
        res += 'pupper '
    if is_puppo != 'None':
        res += 'puppo '
    return res

df['dog_category'] = df[['doggo', 'floofer', 'pupper', 'puppo']].apply(find_dog_category, axis = 1)

In [22]:
df.drop(['puppo', 'pupper', 'doggo', 'floofer'], axis = 1, inplace=True)
df.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,favorites,retweets,dog_category
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,False,bagel,0.085851,False,banana,0.07611,False,39467.0,8853.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,True,Pekinese,0.090647,True,papillon,0.068957,True,33819.0,6514.0,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,...,True,malamute,0.078253,True,kelpie,0.031379,True,25461.0,4328.0,


### Issue #2:

Timestamp has wrong datatype that is object instead of date.

#### Code

In [23]:
df.timestamp.dtypes

dtype('O')

In [24]:
df.timestamp = pd.to_datetime(df.timestamp)

In [25]:
df.timestamp.dtypes

dtype('<M8[ns]')


### Issue #3: 

Some columns are not necessary

#### Code

In [26]:
df.shape

(2356, 27)

In [27]:
df = df[df.retweeted_status_id.isnull()]

In [28]:
len(df[df.retweeted_status_id.isnull() == False])

0

In [29]:
df.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'jpg_url', 'img_num', 'p1', 'p1_conf',
       'p1_dog', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog',
       'favorites', 'retweets', 'dog_category'],
      dtype='object')

In [30]:
df.drop(['in_reply_to_status_id','in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'], axis = 1,inplace = True)

### Issue #4:

There are duplicate images for some entries.

#### Code

In [31]:
sum(df.jpg_url.duplicated())

180

In [32]:
df = df.drop_duplicates(subset=['jpg_url'], keep='last')

In [33]:
sum(df['jpg_url'].duplicated())

0

### Issue #5:

There are some missing values.

#### Code

In [34]:
df.isnull().sum()

tweet_id              0
timestamp             0
source                0
text                  0
expanded_urls         1
rating_numerator      0
rating_denominator    0
name                  0
jpg_url               1
img_num               1
p1                    1
p1_conf               1
p1_dog                1
p2                    1
p2_conf               1
p2_dog                1
p3                    1
p3_conf               1
p3_dog                1
favorites             0
retweets              0
dog_category          0
dtype: int64

In [35]:
df.fillna(value = 'Unknown', inplace = True)

In [36]:
df.isnull().sum()

tweet_id              0
timestamp             0
source                0
text                  0
expanded_urls         0
rating_numerator      0
rating_denominator    0
name                  0
jpg_url               0
img_num               0
p1                    0
p1_conf               0
p1_dog                0
p2                    0
p2_conf               0
p2_dog                0
p3                    0
p3_conf               0
p3_dog                0
favorites             0
retweets              0
dog_category          0
dtype: int64

In [37]:
df.shape

(1995, 22)

In [38]:
df.head

<bound method NDFrame.head of                 tweet_id           timestamp  \
0     892420643555336193 2017-08-01 16:23:56   
1     892177421306343426 2017-08-01 00:17:27   
2     891815181378084864 2017-07-31 00:18:03   
3     891689557279858688 2017-07-30 15:58:51   
4     891327558926688256 2017-07-29 16:00:24   
5     891087950875897856 2017-07-29 00:08:17   
6     890971913173991426 2017-07-28 16:27:12   
7     890729181411237888 2017-07-28 00:22:40   
8     890609185150312448 2017-07-27 16:25:51   
9     890240255349198849 2017-07-26 15:59:51   
10    890006608113172480 2017-07-26 00:31:25   
11    889880896479866881 2017-07-25 16:11:53   
12    889665388333682689 2017-07-25 01:55:32   
13    889638837579907072 2017-07-25 00:10:02   
14    889531135344209921 2017-07-24 17:02:04   
15    889278841981685760 2017-07-24 00:19:32   
16    888917238123831296 2017-07-23 00:22:39   
17    888804989199671297 2017-07-22 16:56:37   
18    888554962724278272 2017-07-22 00:23:06   
20    8880

### Issue #6: 

New column needed

#### Code

In [39]:
df['rating'] = df['rating_numerator'] / df['rating_denominator']

In [40]:
df.columns

Index(['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
       'rating_numerator', 'rating_denominator', 'name', 'jpg_url', 'img_num',
       'p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf',
       'p3_dog', 'favorites', 'retweets', 'dog_category', 'rating'],
      dtype='object')

### Issue #7: 

Some column names are confusing as they do not give much information about the content.

#### Code

In [41]:
df.columns = ['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
       'rating_numerator', 'rating_denominator', 'name','image_url', 'img_number', 'first_prediction', 'first_prediction_confidence', 'first_prediction_isdog', 'second_prediction', 'second_prediction_confidence', 'second_prediction_isdog', 'third_prediction', 'third_prediction_confidence','third_prediction_isdog', 'favorites', 'retweets', 'stage', 'rating']

In [42]:
df.columns

Index(['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
       'rating_numerator', 'rating_denominator', 'name', 'image_url',
       'img_number', 'first_prediction', 'first_prediction_confidence',
       'first_prediction_isdog', 'second_prediction',
       'second_prediction_confidence', 'second_prediction_isdog',
       'third_prediction', 'third_prediction_confidence',
       'third_prediction_isdog', 'favorites', 'retweets', 'stage', 'rating'],
      dtype='object')

In [43]:
df.head(2)

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,image_url,img_number,...,second_prediction,second_prediction_confidence,second_prediction_isdog,third_prediction,third_prediction_confidence,third_prediction_isdog,favorites,retweets,stage,rating
0,892420643555336193,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1,...,bagel,0.0858511,False,banana,0.07611,False,39467.0,8853.0,,1.3
1,892177421306343426,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,...,Pekinese,0.0906465,True,papillon,0.0689569,True,33819.0,6514.0,,1.3


### Issue #8: 

Reordering column names

#### Code

In [44]:
df = df[['tweet_id', 'timestamp', 'source', 'text', 'name', 'stage','rating', 'rating_numerator', 'rating_denominator','favorites', 'img_number', 'first_prediction', 'first_prediction_confidence',
       'first_prediction_isdog', 'second_prediction',
       'second_prediction_confidence', 'second_prediction_isdog',
       'third_prediction', 'third_prediction_confidence',
       'third_prediction_isdog', 'image_url', 'retweets', 'expanded_urls' ]]

In [45]:
df.head(2)

Unnamed: 0,tweet_id,timestamp,source,text,name,stage,rating,rating_numerator,rating_denominator,favorites,...,first_prediction_isdog,second_prediction,second_prediction_confidence,second_prediction_isdog,third_prediction,third_prediction_confidence,third_prediction_isdog,image_url,retweets,expanded_urls
0,892420643555336193,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,Phineas,,1.3,13,10,39467.0,...,False,bagel,0.0858511,False,banana,0.07611,False,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,8853.0,https://twitter.com/dog_rates/status/892420643...
1,892177421306343426,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,Tilly,,1.3,13,10,33819.0,...,True,Pekinese,0.0906465,True,papillon,0.0689569,True,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,6514.0,https://twitter.com/dog_rates/status/892177421...


## Storing Data
Gathered, assessed, and cleaned master dataset saved to a CSV file named "twitter_archive_master.csv".

In [46]:
 df.to_csv('twitter.csv', index = False)

## Analyzing and Visualizing Data


### Highly rated dog by stage.

In [47]:
best_stage = df.groupby('stage')

In [48]:
best_stage.rating.mean().sort_values(ascending=False).head(4)

stage
doggo puppo     1.300000
puppo           1.200000
floofer         1.200000
doggo           1.188889
Name: rating, dtype: float64

### Highly rated dog by name

In [49]:
best_dog = df.groupby('name')

In [50]:
best_dog.rating.mean().sort_values(ascending=False).head(5)

name
Atticus    89.350000
Logan       7.500000
Sam         2.214286
Sophie      1.500000
Kuyu        1.400000
Name: rating, dtype: float64

### Distribution of dog cuteness

In [51]:
df_cuteness = df[df['rating'] > 0]

In [52]:
df.rating.describe()

count    1995.000000
mean        1.169164
std         4.066053
min         0.000000
25%         1.000000
50%         1.100000
75%         1.200000
max       177.600000
Name: rating, dtype: float64

In [53]:
cute_tag = ['Normal Dog(0, 1.875)','Nice Dog(1.875, 3.75)', 'Beautiful Dog(3.75, 5.625)', 'Adorable Dog(5.625, 7.5)' ]

In [54]:
cute_bins = pd.qcut(df_cuteness['rating'], 4, labels = cute_tag)

### Insights:
1. Distribution of dog cuteness

2. Highly rated dog by name

3. Highly rated dog by stage


### Visualization

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
cute_bins.value_counts().plot(kind = 'pie', ax = ax, label = 'Dog Stages', autopct = '%1.2f%%')
plt.title('General Distribution of Dog Stages Cuteness')
plt.legend(); 