# Project: Wrangling and Analyze Data

In [1]:
import pandas as pd
import numpy as np

import requests
import os

import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

# regular expression
import re

In [2]:
def extract_urls_from_str(value)->str:
    """
    Extract urls the given string
    Return : urls, separated by comma
    
    References : 
        - https://www.digitalocean.com/community/tutorials/pandas-dataframe-apply-examples
        - https://stackoverflow.com/questions/499345/regular-expression-to-extract-url-from-an-html-link
    """
    string = str(value)
    urls = re.findall(r'href=[\'"]?([^\'" >]+)', string)
    return ', '.join(urls) 

## Data Gathering
In the cell below, gather **all** three pieces of data for this project and load them in the notebook. **Note:** the methods required to gather each data are different.
1. Directly download the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

In [3]:
twitter_archive_df = pd.read_csv("twitter-archive-enhanced.csv")
twitter_archive_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,892420643555336193,892177421306343426,891815181378084864
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000,2017-07-31 00:18:03 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...,https://twitter.com/dog_rates/status/891815181...


2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [4]:
# Create download folder if not exist
folder_name = "downloads"
folder_path = "./{}".format(folder_name)
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [5]:
# Make the download request

# download's url provided in the project's instructions
image_url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
image_response = requests.get(image_url)

if image_response.status_code != 200:
    raise Exception("Response status code != from 200")
else:
    print("Request successful")

Request successful


In [6]:
# Save downloaded file
file_name = image_url.split("/")[-1]
file_path = os.path.join(folder_path, file_name)
with open( file_path , mode="wb" ) as file:
    file.write(image_response.content)
    print("File saved on disk")

File saved on disk


In [7]:
image_predictions_df = pd.read_csv(file_path, sep="\t")
image_predictions_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,666020888022790149,666029285002620928,666033412701032449
jpg_url,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg
img_num,1,1,1
p1,Welsh_springer_spaniel,redbone,German_shepherd
p1_conf,0.465074,0.506826,0.596461
p1_dog,True,True,True
p2,collie,miniature_pinscher,malinois
p2_conf,0.156665,0.074192,0.138584
p2_dog,True,True,True
p3,Shetland_sheepdog,Rhodesian_ridgeback,bloodhound


3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [8]:
tweet_ids = image_predictions_df["tweet_id"].values
tweet_ids

array([666020888022790149, 666029285002620928, 666033412701032449, ...,
       891815181378084864, 892177421306343426, 892420643555336193],
      dtype=int64)

In [9]:
# Signify that I have not put my own twitter api key in this notebook
have_put_my_own_api_key = False

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = "XXX"
consumer_secret = "XXX"
access_token = "XXX"
access_secret = "XXX"

tweet_json_file_name = "tweet_json.txt"
tweet_json_file_path = "./{}".format(tweet_json_file_name)

if have_put_my_own_api_key:
    """ The code below was taken from my Udacity's classroom """
    
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)

    api = tweepy.API(auth, wait_on_rate_limit=True)

    # Query Twitter's API for JSON data for each tweet ID in the Twitter archive
    count = 0
    fails_dict = {}
    start = timer()
    # Save each tweet's returned JSON as a new line in a .txt file
    with open('tweet_json.txt', 'w') as outfile:
        # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
        for tweet_id in tweet_ids:
            count += 1
            print(str(count) + ": " + str(tweet_id))
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                print("Success")
                json.dump(tweet._json, outfile)
                outfile.write('\n')
            except tweepy.TweepError as e:
                print("Fail")
                fails_dict[tweet_id] = e
                pass
    end = timer()
    print(end - start)
    print(fails_dict)

Import json file into a df

In [10]:
"""

References : 

- https://www.geeksforgeeks.org/read-json-file-using-python/
- https://www.pythontutorial.net/python-basics/python-read-text-file/
- https://www.freecodecamp.org/news/python-json-how-to-convert-a-string-to-json/

"""
df_list = []

# Open file
with open(tweet_json_file_path,"r") as file:
    #file_content = json.load(file)
    lines = file.readlines()
    #print(lines[0])
    #print(json.loads(lines[0])["created_at"])
    for line in lines:
         df_list.append(json.loads(line))

df_list[:2]

[{'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
  'id': 892420643555336193,
  'id_str': '892420643555336193',
  'full_text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU",
  'truncated': False,
  'display_text_range': [0, 85],
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [],
   'urls': [],
   'media': [{'id': 892420639486877696,
     'id_str': '892420639486877696',
     'indices': [86, 109],
     'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
     'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
     'url': 'https://t.co/MgUWQ76dJU',
     'display_url': 'pic.twitter.com/MgUWQ76dJU',
     'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
     'type': 'photo',
     'sizes': {'large': {'w': 540, 'h': 528, 'resize': 'fit'},
      'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
      'small': {'w': 540, 'h': 528, 'resize': 'fit'},

In [11]:
tweets_df = pd.DataFrame(df_list)
tweets_df.head(3).T

Unnamed: 0,0,1,2
created_at,Tue Aug 01 16:23:56 +0000 2017,Tue Aug 01 00:17:27 +0000 2017,Mon Jul 31 00:18:03 +0000 2017
id,892420643555336193,892177421306343426,891815181378084864
id_str,892420643555336193,892177421306343426,891815181378084864
full_text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...
truncated,False,False,False
display_text_range,"[0, 85]","[0, 138]","[0, 121]"
entities,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'hashtags': [], 'symbols': [], 'user_mentions...","{'hashtags': [], 'symbols': [], 'user_mentions..."
extended_entities,"{'media': [{'id': 892420639486877696, 'id_str'...","{'media': [{'id': 892177413194625024, 'id_str'...","{'media': [{'id': 891815175371796480, 'id_str'..."
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
in_reply_to_status_id,,,


In [12]:
# Keep useful columns while discarding the others, as per the project instructions
tweets_df = tweets_df[ ["id","retweet_count","favorite_count","user","retweeted"] ]
tweets_df.head(3)

Unnamed: 0,id,retweet_count,favorite_count,user,retweeted
0,892420643555336193,8853,39467,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False
1,892177421306343426,6514,33819,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False
2,891815181378084864,4328,25461,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False


## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issue**. You must use **both** visual assessment
programmatic assessement to assess the data.

**Note:** pay attention to the following key points when you access the data.

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This [unique rating system](http://knowyourmeme.com/memes/theyre-good-dogs-brent) is a big part of the popularity of WeRateDogs.
* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.



In [13]:
# Looking at the content of twitter_archive_df
twitter_archive_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,892420643555336193,892177421306343426,891815181378084864
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000,2017-07-31 00:18:03 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...,https://twitter.com/dog_rates/status/891815181...


In [14]:
# Looking at the structure of twitter_archive_df
twitter_archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

We have wrong datatype in the timestamp column, it should be of type datetime

In [15]:
# Checking null values in twitter_archive_df
twitter_archive_df.isnull().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

* There are too much null values in many columns (compared to the total number of samples we have) : `in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp`.         
* `expanded_urls` also have some null values.

* The non-null value in the `retweeted_XXX` columns are marking that we we have retweet sample (we will further investivate this)

In [16]:
# Checking NaN (not a number) values in twitter_archive_df
twitter_archive_df.isna().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

The results for NaN check are the same as the results from null check, so there is nothing suspicious to investigate.

In [17]:
# Taking a look at the retweeted samples
twitter_archive_df[ ~ twitter_archive_df["retweeted_status_id"].isnull() ].T

Unnamed: 0,19,32,36,68,73,74,78,91,95,97,...,926,937,943,949,1012,1023,1043,1242,2259,2260
tweet_id,888202515573088257,886054160059072513,885311592912609280,879130579576475649,878404777348136964,878316110768087041,877611172832227328,874434818259525634,873697596434513921,873337748698140672,...,754874841593970688,753298634498793472,752701944171524096,752309394570878976,747242308580548608,746521445350707200,743835915802583040,711998809858043904,667550904950915073,667550882905632768
in_reply_to_status_id,,,,,,,,,,,...,,,,,,,,,,
in_reply_to_user_id,,,,,,,,,,,...,,,,,,,,,,
timestamp,2017-07-21 01:02:36 +0000,2017-07-15 02:45:48 +0000,2017-07-13 01:35:06 +0000,2017-06-26 00:13:58 +0000,2017-06-24 00:09:53 +0000,2017-06-23 18:17:33 +0000,2017-06-21 19:36:23 +0000,2017-06-13 01:14:41 +0000,2017-06-11 00:25:14 +0000,2017-06-10 00:35:19 +0000,...,2016-07-18 03:06:01 +0000,2016-07-13 18:42:44 +0000,2016-07-12 03:11:42 +0000,2016-07-11 01:11:51 +0000,2016-06-27 01:37:04 +0000,2016-06-25 01:52:36 +0000,2016-06-17 16:01:16 +0000,2016-03-21 19:31:59 +0000,2015-11-20 03:51:52 +0000,2015-11-20 03:51:47 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...",...,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...","<a href=""http://twitter.com"" rel=""nofollow"">Tw..."
text,RT @dog_rates: This is Canela. She attempted s...,RT @Athletics: 12/10 #BATP https://t.co/WxwJmv...,RT @dog_rates: This is Lilly. She just paralle...,RT @dog_rates: This is Emmy. She was adopted t...,RT @dog_rates: Meet Shadow. In an attempt to r...,RT @dog_rates: Meet Terrance. He's being yelle...,RT @rachel2195: @dog_rates the boyfriend and h...,RT @dog_rates: This is Coco. At first I though...,RT @dog_rates: This is Walter. He won't start ...,RT @dog_rates: This is Sierra. She's one preci...,...,RT @dog_rates: This is Rubio. He has too much ...,RT @dog_rates: This is Carly. She's actually 2...,RT @dog_rates: HEY PUP WHAT'S THE PART OF THE ...,RT @dog_rates: Everyone needs to watch this. 1...,RT @dog_rates: This pupper killed this great w...,RT @dog_rates: This is Shaggy. He knows exactl...,RT @dog_rates: Extremely intelligent dog here....,RT @twitter: @dog_rates Awesome Tweet! 12/10. ...,RT @dogratingrating: Exceptional talent. Origi...,RT @dogratingrating: Unoriginal idea. Blatant ...
retweeted_status_id,887473957103951872.0,886053734421102592.0,830583320585068544.0,878057613040115712.0,878281511006478336.0,669000397445533696.0,876850772322988032.0,866334964761202688.0,868880397819494400.0,873213775632977920.0,...,679158373988876288.0,681523177663676416.0,683515932363329536.0,675354435921575936.0,704761120771465216.0,667866724293877760.0,667138269671505920.0,711998279773347840.0,667548695664070656.0,667548415174144000.0
retweeted_status_user_id,4196983835.0,19607400.0,4196983835.0,4196983835.0,4196983835.0,4196983835.0,512804507.0,4196983835.0,4196983835.0,4196983835.0,...,4196983835.0,4196983835.0,4196983835.0,4196983835.0,4196983835.0,4196983835.0,4196983835.0,783214.0,4296831739.0,4296831739.0
retweeted_status_timestamp,2017-07-19 00:47:34 +0000,2017-07-15 02:44:07 +0000,2017-02-12 01:04:29 +0000,2017-06-23 01:10:23 +0000,2017-06-23 16:00:04 +0000,2015-11-24 03:51:38 +0000,2017-06-19 17:14:49 +0000,2017-05-21 16:48:45 +0000,2017-05-28 17:23:24 +0000,2017-06-09 16:22:42 +0000,...,2015-12-22 04:35:49 +0000,2015-12-28 17:12:42 +0000,2016-01-03 05:11:12 +0000,2015-12-11 16:40:19 +0000,2016-03-01 20:11:59 +0000,2015-11-21 00:46:50 +0000,2015-11-19 00:32:12 +0000,2016-03-21 19:29:52 +0000,2015-11-20 03:43:06 +0000,2015-11-20 03:41:59 +0000
expanded_urls,https://twitter.com/dog_rates/status/887473957...,https://twitter.com/dog_rates/status/886053434...,https://twitter.com/dog_rates/status/830583320...,https://twitter.com/dog_rates/status/878057613...,"https://www.gofundme.com/3yd6y1c,https://twitt...",https://twitter.com/dog_rates/status/669000397...,https://twitter.com/rachel2195/status/87685077...,https://twitter.com/dog_rates/status/866334964...,https://twitter.com/dog_rates/status/868880397...,https://www.gofundme.com/help-my-baby-sierra-g...,...,https://twitter.com/dog_rates/status/679158373...,https://twitter.com/dog_rates/status/681523177...,"https://vine.co/v/ibvnzrauFuV,https://vine.co/...",https://twitter.com/dog_rates/status/675354435...,https://twitter.com/dog_rates/status/704761120...,https://twitter.com/dog_rates/status/667866724...,https://twitter.com/dog_rates/status/667138269...,https://twitter.com/twitter/status/71199827977...,https://twitter.com/dogratingrating/status/667...,https://twitter.com/dogratingrating/status/667...


Those retweeted samples should not be present in our dataset, as per the projec instruction.

In [18]:
# Taking a look at the samples which have value in expanded_urls column
twitter_archive_df[ ~ twitter_archive_df["expanded_urls"].isnull() ].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2346,2347,2348,2349,2350,2351,2352,2353,2354,2355
tweet_id,892420643555336193,892177421306343426,891815181378084864,891689557279858688,891327558926688256,891087950875897856,890971913173991426,890729181411237888,890609185150312448,890240255349198849,...,666058600524156928,666057090499244032,666055525042405380,666051853826850816,666050758794694657,666049248165822465,666044226329800704,666033412701032449,666029285002620928,666020888022790149
in_reply_to_status_id,,,,,,,,,,,...,,,,,,,,,,
in_reply_to_user_id,,,,,,,,,,,...,,,,,,,,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000,2017-07-31 00:18:03 +0000,2017-07-30 15:58:51 +0000,2017-07-29 16:00:24 +0000,2017-07-29 00:08:17 +0000,2017-07-28 16:27:12 +0000,2017-07-28 00:22:40 +0000,2017-07-27 16:25:51 +0000,2017-07-26 15:59:51 +0000,...,2015-11-16 01:01:59 +0000,2015-11-16 00:55:59 +0000,2015-11-16 00:49:46 +0000,2015-11-16 00:35:11 +0000,2015-11-16 00:30:50 +0000,2015-11-16 00:24:50 +0000,2015-11-16 00:04:52 +0000,2015-11-15 23:21:54 +0000,2015-11-15 23:05:30 +0000,2015-11-15 22:32:08 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...",...,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...,This is Darla. She commenced a snooze mid meal...,This is Franklin. He would like you to stop ca...,Here we have a majestic great white breaching ...,Meet Jax. He enjoys ice cream so much he gets ...,When you watch your owner call another dog a g...,This is Zoey. She doesn't want to be one of th...,This is Cassie. She is a college pup. Studying...,...,Here is the Rand Paul of retrievers folks! He'...,My oh my. This is a rare blond Canadian terrie...,Here is a Siberian heavily armored polar bear ...,This is an odd dog. Hard on the outside but lo...,This is a truly beautiful English Wilson Staff...,Here we have a 1949 1st generation vulpix. Enj...,This is a purebred Piers Morgan. Loves to Netf...,Here is a very happy pup. Big fan of well-main...,This is a western brown Mitsubishi terrier. Up...,Here we have a Japanese Irish Setter. Lost eye...
retweeted_status_id,,,,,,,,,,,...,,,,,,,,,,
retweeted_status_user_id,,,,,,,,,,,...,,,,,,,,,,
retweeted_status_timestamp,,,,,,,,,,,...,,,,,,,,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...,https://twitter.com/dog_rates/status/891815181...,https://twitter.com/dog_rates/status/891689557...,https://twitter.com/dog_rates/status/891327558...,https://twitter.com/dog_rates/status/891087950...,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",https://twitter.com/dog_rates/status/890729181...,https://twitter.com/dog_rates/status/890609185...,https://twitter.com/dog_rates/status/890240255...,...,https://twitter.com/dog_rates/status/666058600...,https://twitter.com/dog_rates/status/666057090...,https://twitter.com/dog_rates/status/666055525...,https://twitter.com/dog_rates/status/666051853...,https://twitter.com/dog_rates/status/666050758...,https://twitter.com/dog_rates/status/666049248...,https://twitter.com/dog_rates/status/666044226...,https://twitter.com/dog_rates/status/666033412...,https://twitter.com/dog_rates/status/666029285...,https://twitter.com/dog_rates/status/666020888...


In [19]:
# Assessing the content of twitter_archive_df["source"]
twitter_archive_df["source"].unique()

array(['<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
       '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
       '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>',
       '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>'],
      dtype=object)

Everything seem ok here.

In [20]:
# Looking at the content of tweets_df
tweets_df.head(3).T

Unnamed: 0,0,1,2
id,892420643555336193,892177421306343426,891815181378084864
retweet_count,8853,6514,4328
favorite_count,39467,33819,25461
user,"{'id': 4196983835, 'id_str': '4196983835', 'na...","{'id': 4196983835, 'id_str': '4196983835', 'na...","{'id': 4196983835, 'id_str': '4196983835', 'na..."
retweeted,False,False,False


In [21]:
# Looking at the structure of tweets_df
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              2354 non-null   int64 
 1   retweet_count   2354 non-null   int64 
 2   favorite_count  2354 non-null   int64 
 3   user            2354 non-null   object
 4   retweeted       2354 non-null   bool  
dtypes: bool(1), int64(3), object(1)
memory usage: 76.0+ KB


In [22]:
# Looking at the content of image_predictions_df
image_predictions_df.tail(3).T

Unnamed: 0,2072,2073,2074
tweet_id,891815181378084864,892177421306343426,892420643555336193
jpg_url,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg
img_num,1,1,1
p1,Chihuahua,Chihuahua,orange
p1_conf,0.716012,0.323581,0.097049
p1_dog,True,True,False
p2,malamute,Pekinese,bagel
p2_conf,0.078253,0.090647,0.085851
p2_dog,True,True,False
p3,kelpie,papillon,banana


In [23]:
# Observing the structure of image_predictions_df
image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [24]:
# sum(tweets_df.duplicated())

### Tidiness issues
1. `twitter_archive_df` and `tweets_df` should form one dataframe instead being two different dataframes.

1. `twitter_archive_df` has values as columns for the stage of the dog.

1. `twitter_archive_df` has useless columns for our purpose, such as status_id, status_user_id, status_timestamp, ...

1. `tweets_df.user` should contains the id of another dataframe named `user`.

### Quality issues 
1. `twitter_archive_df['source']` has tag `a` as value instead of the value of `href` attribute

1. Wrong datatypes in `twitter_archive_df` in the following columns : `timestamp`

1. `twitter_archive_df` has retweet samples, which are useless for our purpose (according to the project instructions)

1. Null values in the following columns of `twitter_archive_df` : `in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp, expanded_urls`.

1. Duplicate columns `id` and `id_str` in `tweets_df`

1. No value in the following columns of `tweets_df` : `geo`, `coordinates`, `contributors`

1. `tweets_df.created` has value beyond August 1st, 2017, which mean those sample will not have corresponding value in image_predictions_df

1. Some samples in `image_predictions_df` are not dog

1. In columns `p1`, `p2`, and `p3` of `image_predictions_df`, some values are capitalized while others are not

## Cleaning Data
In this section, clean **all** of the issues you documented while assessing. 

**Note:** Make a copy of the original data before cleaning. Cleaning includes merging individual pieces of data according to the rules of [tidy data](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). The result should be a high-quality and tidy master pandas DataFrame (or DataFrames, if appropriate).

Make copies of original image_predictions_df. It is useless to make copy of the others dataframe since they will be merged

In [25]:
# Make copies of original image_predictions_df

#tweets_clean = tweets_df.copy()
#twitter_archive_clean = twitter_archive_df.copy()
image_predictions_clean = image_predictions_df.copy()


### Issue #1 (tidiness): `twitter_archive_df` and `tweets_df` should form one dataframe instead being two different dataframes.

In [26]:
twitter_archive_df.head(2).T

Unnamed: 0,0,1
tweet_id,892420643555336193,892177421306343426
in_reply_to_status_id,,
in_reply_to_user_id,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...


In [27]:
twitter_archive_df.shape

(2356, 17)

In [28]:
tweets_df.head()

Unnamed: 0,id,retweet_count,favorite_count,user,retweeted
0,892420643555336193,8853,39467,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False
1,892177421306343426,6514,33819,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False
2,891815181378084864,4328,25461,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False
3,891689557279858688,8964,42908,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False
4,891327558926688256,9774,41048,"{'id': 4196983835, 'id_str': '4196983835', 'na...",False


In [29]:
tweets_df.shape

(2354, 5)

#### Define
Merge `twitter_archive_df` and `tweets_df` in one dataframe

#### Code

In [30]:
# All columns in the two datasets
# all_columns = pd.Series(list(twitter_archive_clean) + list(tweets_clean) )
# all_columns

In [31]:
# Find duplicated columns
# duplicated_columns = all_columns[all_columns.duplicated()]
# duplicated_columns

In [32]:
# Drop duplicated columns from tweets_clean
# tweets_clean.drop(duplicated_columns, axis=1, inplace=True)
# tweets_clean.head(2).T

In [33]:
# tweets_clean.shape

In [34]:
# Merging the two dataset

# https://stackoverflow.com/questions/43297589/merge-two-data-frames-based-on-common-column-values-in-pandas
twitter_df = twitter_archive_df.merge(tweets_df, how="inner",left_on="tweet_id", right_on="id")
twitter_df.shape

(2354, 22)

#### Test

In [35]:
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,892420643555336193,892177421306343426
in_reply_to_status_id,,
in_reply_to_user_id,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...


In [36]:
# Making sure the merge worked fine
twitter_df.shape[1] == twitter_archive_df.shape[1] + tweets_df.shape[1]

True

In [37]:
# Dropping redundant id column
twitter_df.drop(["id"],axis=1, inplace=True)
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,892420643555336193,892177421306343426
in_reply_to_status_id,,
in_reply_to_user_id,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...


### Issue #2 (tidiness): `twitter_archive_df` has values as columns for the stage of the dog.          
Those columns are `doggo, floofer, pupper, puppo`

In [38]:
twitter_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,892420643555336193,892177421306343426,891815181378084864
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000,2017-07-31 00:18:03 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...,https://twitter.com/dog_rates/status/891815181...


#### Define
Melt the dog stage columns into a column which name will be `stage` and which value will be the stage of the dog or NaN depending on weither the stage is known or not.

#### Code

In [39]:
id_vars = twitter_df.columns.tolist()
id_vars.remove("doggo")
id_vars.remove("floofer")
id_vars.remove("pupper")
id_vars.remove("puppo")
id_vars

['tweet_id',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'timestamp',
 'source',
 'text',
 'retweeted_status_id',
 'retweeted_status_user_id',
 'retweeted_status_timestamp',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'retweet_count',
 'favorite_count',
 'user',
 'retweeted']

In [40]:
melted = pd.melt(twitter_df, id_vars=id_vars, var_name="stage", value_name="stage_value")
melted.head(3).T

Unnamed: 0,0,1,2
tweet_id,892420643555336193,892177421306343426,891815181378084864
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000,2017-07-31 00:18:03 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...,https://twitter.com/dog_rates/status/891815181...


Looking at our dataframe, we don't need the `stage_value` column, so will drop it in a while after performing some tidiness cleaning.

In [41]:
melted.shape

(9416, 19)

In [42]:
# twitter_df.shape

**Analyzing the content of `melted` based on weither the stage of the dog is known or not**

In [43]:
# Visual assessement of the tweet which id is 892420643555336193 (this tweet dog stage is unknow)
melted[ melted["tweet_id"]==892420643555336193 ].T

Unnamed: 0,0,2354,4708,7062
tweet_id,892420643555336193,892420643555336193,892420643555336193,892420643555336193
in_reply_to_status_id,,,,
in_reply_to_user_id,,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 16:23:56 +0000,2017-08-01 16:23:56 +0000,2017-08-01 16:23:56 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Phineas. He's a mystical boy. Only eve...,This is Phineas. He's a mystical boy. Only eve...,This is Phineas. He's a mystical boy. Only eve...
retweeted_status_id,,,,
retweeted_status_user_id,,,,
retweeted_status_timestamp,,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892420643...


We have 04 entries for the same tweet_id, each corresponding to one of the 04 melted columns. 

Let's separate `melted` in two different dataframes for a better observation 

In [44]:
# Retrieve the tweets with known dog stage
tweets_with_stage_df = melted[ melted["stage_value"] !="None" ]
tweets_with_stage_df.head(3).T

Unnamed: 0,9,42,98
tweet_id,890240255349198849,884162670584377345,872967104147763200
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2017-07-26 15:59:51 +0000,2017-07-09 21:29:42 +0000,2017-06-09 00:02:31 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Cassie. She is a college pup. Studying...,Meet Yogi. He doesn't have any important dog m...,Here's a very large dog. He has a date later. ...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/890240255...,https://twitter.com/dog_rates/status/884162670...,https://twitter.com/dog_rates/status/872967104...


In [45]:
# Visual assessement of a tweet with known dog stage
tweets_with_stage_df[ tweets_with_stage_df["tweet_id"]==890240255349198849 ].T

Unnamed: 0,9
tweet_id,890240255349198849
in_reply_to_status_id,
in_reply_to_user_id,
timestamp,2017-07-26 15:59:51 +0000
source,"<a href=""http://twitter.com/download/iphone"" r..."
text,This is Cassie. She is a college pup. Studying...
retweeted_status_id,
retweeted_status_user_id,
retweeted_status_timestamp,
expanded_urls,https://twitter.com/dog_rates/status/890240255...


We can confirm that there is only one entry for the tweets when dog stage is known. 

In [46]:
# Retrieve the tweets with unknown dog stage
tweets_without_stage_df = melted[ melted["stage_value"] =="None" ]
tweets_without_stage_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,892420643555336193,892177421306343426,891815181378084864
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000,2017-07-31 00:18:03 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....,This is Archie. He is a rare Norwegian Pouncin...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...,https://twitter.com/dog_rates/status/891815181...


In [47]:
# Visual assessement of a tweet with unknown dog stage
tweets_without_stage_df[ tweets_without_stage_df["tweet_id"]==892420643555336193 ].T

Unnamed: 0,0,2354,4708,7062
tweet_id,892420643555336193,892420643555336193,892420643555336193,892420643555336193
in_reply_to_status_id,,,,
in_reply_to_user_id,,,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 16:23:56 +0000,2017-08-01 16:23:56 +0000,2017-08-01 16:23:56 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Phineas. He's a mystical boy. Only eve...,This is Phineas. He's a mystical boy. Only eve...,This is Phineas. He's a mystical boy. Only eve...
retweeted_status_id,,,,
retweeted_status_user_id,,,,
retweeted_status_timestamp,,,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892420643...


We can observe that there is 04 entries (same number as the number of columns melted) for the tweets when dog stage is unknown.

Now we will :
* replace the values in `tweets_without_stage_df["stage"]` by "others"
* concatenate `tweets_without_stage_df` to `tweets_with_stage_df`

In [48]:
# Giving NaN values to stage column in tweets_without_stage_df
tweets_without_stage_df["stage"] = "others"
tweets_without_stage_df.head(2).T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_without_stage_df["stage"] = "others"


Unnamed: 0,0,1
tweet_id,892420643555336193,892177421306343426
in_reply_to_status_id,,
in_reply_to_user_id,,
timestamp,2017-08-01 16:23:56 +0000,2017-08-01 00:17:27 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Phineas. He's a mystical boy. Only eve...,This is Tilly. She's just checking pup on you....
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
expanded_urls,https://twitter.com/dog_rates/status/892420643...,https://twitter.com/dog_rates/status/892177421...


In [49]:
tweets_without_stage_df.shape

(9023, 19)

In [50]:
# Drop duplicate value by tweet_id in tweets_without_stage_df
tweets_without_stage_df.drop_duplicates(subset="tweet_id",inplace=True)
tweets_without_stage_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_without_stage_df.drop_duplicates(subset="tweet_id",inplace=True)


(2354, 19)

In [51]:
# Concat tweets_without_stage_df and tweets_with_stage_df, in order to form a whole again
twitter_clean = pd.concat([tweets_with_stage_df,tweets_without_stage_df], axis=0, ignore_index=True)

# Shuffling the dataframe in order to eliminate an eventual bias
"""Reference : https://datagy.io/pandas-shuffle-dataframe/ """
random_state_seed = 5 # set seed for reproducibility
twitter_clean = twitter_clean.sample(frac=1, random_state=random_state_seed).reset_index()


twitter_clean.head(3).T

Unnamed: 0,0,1,2
index,1435,1067,1340
tweet_id,728046963732717569,781661882474196992,745057283344719872
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000,2016-06-21 00:54:33 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...,This is Oliver. He's downright gorgeous as hel...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,


Looking at our dataframe, we don't need the `stage_value` column anymore, so will drop it. We will also drop `index` column, because it is the original index of the samples before the shuffled.

In [52]:
twitter_df = twitter_clean.drop(["stage_value","index"],axis=1)
twitter_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,728046963732717569,781661882474196992,745057283344719872
in_reply_to_status_id,,,
in_reply_to_user_id,,,
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000,2016-06-21 00:54:33 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...,This is Oliver. He's downright gorgeous as hel...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
expanded_urls,https://twitter.com/dog_rates/status/728046963...,https://twitter.com/dog_rates/status/781661882...,https://twitter.com/dog_rates/status/745057283...


#### Test

In [53]:
# Checking the occurences in twitter_df["stage"]
twitter_df["stage"].unique()

array(['others', 'pupper', 'doggo', 'puppo', 'floofer'], dtype=object)

In [54]:
# Checking the repartition of the dog per their stage
twitter_df["stage"].value_counts()

others     2354
pupper      256
doggo        97
puppo        30
floofer      10
Name: stage, dtype: int64

In [55]:
# Checking for duplicate 

# the user column is (temporarly) removed from the duplicate checking in order to avoid "unhashable type: 'dict'" error
#   when using dataframe.duplicated()
twitter_df.drop(["user"],axis=1).duplicated().sum()

0


<!-- Now we will check the number of column in each of our twitter dataset (the original, and the new) -->

### Issue #3 (tidiness): `twitter_archive_df` has useless columns for our purpose          
Those columns are `status_id, status_user_id, status_timestamp`.   

Our cleaning is to be performed in `twitter_df`

In [56]:
# Recalling the content of our df
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,728046963732717569,781661882474196992
in_reply_to_status_id,,
in_reply_to_user_id,,
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
expanded_urls,https://twitter.com/dog_rates/status/728046963...,https://twitter.com/dog_rates/status/781661882...


#### Define

Remove columns deemed useless for our purpose from `twitter_df` : `in_reply_to_status_id, in_reply_to_user_id, expanded_urls`.

#### Code

In [57]:
useless_columns = ["in_reply_to_status_id","in_reply_to_user_id","expanded_urls"]
twitter_df.drop(useless_columns, axis=1, inplace=True)

#### Test

In [58]:
# Making sure the dropped columns no longer exist
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,728046963732717569,781661882474196992
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
rating_numerator,11,5
rating_denominator,10,10
name,Raymond,


### Issue #4 (tidiness): `tweets_df.user` should contains the id of another dataframe named `user`

The cleaning it to be performed in `twitter_df`

In [59]:
twitter_df.head(3).T

Unnamed: 0,0,1,2
tweet_id,728046963732717569,781661882474196992,745057283344719872
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000,2016-06-21 00:54:33 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...,This is Oliver. He's downright gorgeous as hel...
retweeted_status_id,,,
retweeted_status_user_id,,,
retweeted_status_timestamp,,,
rating_numerator,11,5,12
rating_denominator,10,10,10
name,Raymond,,Oliver


#### Define
* Extract values from `twitter_df["user"]`, and use them to populate a new `user_df` dataframe
* Replace each value from `twitter_df["user"]` by the user_id
* Remove duplicated samples from `user_df`

#### Code

In [60]:
# Retrieve users data from twitter_df["user"]
user_list = twitter_df["user"].values.tolist()
user_list[1]

{'id': 4196983835,
 'id_str': '4196983835',
 'name': 'WeRateDogs™ (author)',
 'screen_name': 'dog_rates',
 'location': 'DM YOUR DOGS, WE WILL RATE',
 'description': '#1 Source for Professional Dog Ratings | STORE: @ShopWeRateDogs | IG, FB & SC: WeRateDogs MOBILE APP: @GoodDogsGame | Business: dogratingtwitter@gmail.com',
 'url': 'https://t.co/N7sNNHAEXS',
 'entities': {'url': {'urls': [{'url': 'https://t.co/N7sNNHAEXS',
     'expanded_url': 'http://weratedogs.com',
     'display_url': 'weratedogs.com',
     'indices': [0, 23]}]},
  'description': {'urls': []}},
 'protected': False,
 'followers_count': 3200901,
 'friends_count': 104,
 'listed_count': 2789,
 'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
 'favourites_count': 114031,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': True,
 'statuses_count': 5288,
 'lang': 'en',
 'contributors_enabled': False,
 'is_translator': False,
 'is_translation_enabled': False,
 'profile_background_color': '000000',
 'prof

In [61]:
# Create user dataframe
user_df = pd.DataFrame(user_list)
user_df.head(2).T

Unnamed: 0,0,1
id,4196983835,4196983835
id_str,4196983835,4196983835
name,WeRateDogs™ (author),WeRateDogs™ (author)
screen_name,dog_rates,dog_rates
location,"DM YOUR DOGS, WE WILL RATE","DM YOUR DOGS, WE WILL RATE"
description,#1 Source for Professional Dog Ratings | STORE...,#1 Source for Professional Dog Ratings | STORE...
url,https://t.co/N7sNNHAEXS,https://t.co/N7sNNHAEXS
entities,{'url': {'urls': [{'url': 'https://t.co/N7sNNH...,{'url': {'urls': [{'url': 'https://t.co/N7sNNH...
protected,False,False
followers_count,3200944,3200901


In [62]:
user_df.drop(["entities"],axis=1,inplace=True)

In [63]:
user_df.shape

(2747, 41)

In [64]:
user_df.drop_duplicates(inplace=True)

In [65]:
user_df.shape

(79, 41)

In [66]:
user_df["id"].unique()

array([4196983835], dtype=int64)

We are noticing that all entries in `user_df` are about one user. This should be related to the fact it is additionnal data describing the state, and not the attributes, of the users which were stored in `twitter_df["user"]`; this can be explained by how NoSQL databases are usually designed.      
Given the situation, `user_df` is of no more use to use, so we will discard it.                
Given the situation `twitter_df["user"]` should contain data related to only one user_id. We cannot help but wondering if it is useful to us as a column, and if user mean a person or something like a terminal/source ip.

In [67]:
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,728046963732717569,781661882474196992
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
rating_numerator,11,5
rating_denominator,10,10
name,Raymond,


In [68]:
# Forming a new column containing the user_id
twitter_df["user_id"] = twitter_df["user"].apply(lambda x: x["id"])
twitter_df.sample(10).T

Unnamed: 0,1941,2113,2186,127,572,2182,1184,1726,378,217
tweet_id,861383897657036800,683849932751646720,739238157791694849,703425003149250560,675135153782571009,761599872357261312,820749716845686786,676949632774234114,821149554670182400,667902449697558528
timestamp,2017-05-08 00:54:59 +0000,2016-01-04 03:18:23 +0000,2016-06-04 23:31:25 +0000,2016-02-27 03:42:44 +0000,2015-12-11 02:08:58 +0000,2016-08-05 16:28:54 +0000,2017-01-15 21:49:15 +0000,2015-12-16 02:19:04 +0000,2017-01-17 00:18:04 +0000,2015-11-21 03:08:47 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Hobbes. He's never seen bubbles before...,This is Jiminy. He's not the brightest dog. Ne...,Here's a doggo blowing bubbles. It's downright...,Really guys? Again? I know this is a rare Alba...,This is Steven. He got locked outside. Damn it...,"This is Sephie. According to this picture, she...",Meet Sunny. He can take down a polar bear in o...,This is Tyrus. He's a Speckled Centennial Tico...,This is Luca. He got caught howling. H*ckin em...,This is Cleopatricia. She is a northern Paperb...
retweeted_status_id,,,,,,,,,,
retweeted_status_user_id,,,,,,,,,,
retweeted_status_timestamp,,,,,,,,,,
rating_numerator,13,5,13,9,5,11,13,8,12,9
rating_denominator,10,10,10,10,10,10,10,10,10,10
name,Hobbes,Jiminy,,,Steven,Sephie,Sunny,Tyrus,Luca,Cleopatricia


In [69]:
# Confirming that we have the same user_id in our dataframe
twitter_df["user_id"].unique()

array([4196983835], dtype=int64)

Since we have the same user_id in `twitter_df["user_id"]`, this column is no more useful in our analysis. We will drop it.

In [70]:
# def extract_keysvalue_from_json(json_data:dict, dict_key:str):
#     """
#         Retrieve the value of a given key from a json data
        
#         @param json_data:str : the json from which to extract the given key's value
#         @param dict_key:str : the key which value we wish to extract from the dictionnary 
#     """
#     return json_data[dict_key] #json.loads(json_data)[dict_key]

In [71]:
# Drop user column, since it is no more useful
twitter_df.drop(["user","user_id"],axis=1,inplace=True)

#### Test

In [72]:
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,728046963732717569,781661882474196992
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
rating_numerator,11,5
rating_denominator,10,10
name,Raymond,


### Issue #1 (quality issue) : `twitter_archive_df['source']` has tag `a` as value instead of the value of `href` attribute

The cleaning is to be performed in `twitter_df`

#### Define
Extract href value from `twitter_df["source"]` occurences, and use it to replace tag `a`

#### Code

In [74]:
# Recalling the content of our dataset
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,728046963732717569,781661882474196992
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000
source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
rating_numerator,11,5
rating_denominator,10,10
name,Raymond,


In [76]:
twitter_df["source"][0]

'<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>'

In [77]:
twitter_df.loc[1,"source"]

'<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>'

In [78]:
twitter_df["source"].unique()

array(['<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
       '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>',
       '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
       '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>'],
      dtype=object)

In [79]:
# Clean source column
twitter_df["source"] = twitter_df["source"].apply(extract_urls_from_str)
twitter_df.head(2).T

Unnamed: 0,0,1
tweet_id,728046963732717569,781661882474196992
timestamp,2016-05-05 02:21:37 +0000,2016-09-30 01:08:10 +0000
source,http://twitter.com/download/iphone,http://twitter.com/download/iphone
text,This is Raymond. He controls fountains with hi...,Who keeps sending in pictures without dogs in ...
retweeted_status_id,,
retweeted_status_user_id,,
retweeted_status_timestamp,,
rating_numerator,11,5
rating_denominator,10,10
name,Raymond,


#### Test

In [80]:
# Checking that the values of twitter_archive_clean["source"] are links (and no more tags)
twitter_df["source"].unique()

array(['http://twitter.com/download/iphone', 'http://vine.co',
       'http://twitter.com',
       'https://about.twitter.com/products/tweetdeck'], dtype=object)

### Issue #2 (quality issue): Wrong datatypes in `twitter_archive_df["timestamp"]`            
The datatypes in `twitter_archive_df["timestamp"]` should be datetime, not object(string).

This cleaning is to be performed in `twitter_df`.

In [81]:
# Recalling the structure of twitter_df
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2747 entries, 0 to 2746
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2747 non-null   int64  
 1   timestamp                   2747 non-null   object 
 2   source                      2747 non-null   object 
 3   text                        2747 non-null   object 
 4   retweeted_status_id         216 non-null    float64
 5   retweeted_status_user_id    216 non-null    float64
 6   retweeted_status_timestamp  216 non-null    object 
 7   rating_numerator            2747 non-null   int64  
 8   rating_denominator          2747 non-null   int64  
 9   name                        2747 non-null   object 
 10  retweet_count               2747 non-null   int64  
 11  favorite_count              2747 non-null   int64  
 12  retweeted                   2747 non-null   bool   
 13  stage                       2747 

#### Define    
Convert `twitter_df["timestamp"]` to datetime.

#### Code

In [82]:
twitter_df["timestamp"] = pd.to_datetime(twitter_df["timestamp"])

#### Test

In [84]:
# Confirming that timestamp column was converted to datetime
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2747 entries, 0 to 2746
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2747 non-null   int64              
 1   timestamp                   2747 non-null   datetime64[ns, UTC]
 2   source                      2747 non-null   object             
 3   text                        2747 non-null   object             
 4   retweeted_status_id         216 non-null    float64            
 5   retweeted_status_user_id    216 non-null    float64            
 6   retweeted_status_timestamp  216 non-null    object             
 7   rating_numerator            2747 non-null   int64              
 8   rating_denominator          2747 non-null   int64              
 9   name                        2747 non-null   object             
 10  retweet_count               2747 non-null   int64           

In [None]:
ccc

### Issue #3: Null values in many columns in `twitter_archive_df`    

Null values in the following columns of `twitter_archive_df` : `in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp, expanded_urls`.

In [None]:
# Recalling the structure of twitter_archive_clean
twitter_archive_clean.info()

#### Define
* Check if the null value in each column can be retrieved from the others dataframe. If yes, do so. If no, drop either the column or the samples with null values, depending of the usage and relevance of the said column.

#### Code

#### Test

## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

### Insights:
1.

2.

3.

### Visualization