## Gather

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import tweepy
import json

In [2]:
# Gathering the WeRateDogs Twitter archive from the local CSV file
# (the path is relative to the notebook's working directory)
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [3]:
# Gathering the tweet image predictions file (image-predictions.tsv):
# download it and store it inside `folder_name` for the later cells.
folder_name = 'image_predictions'
# exist_ok=True creates the folder only when it is missing, without a separate
# exists() check that could race with another process
os.makedirs(folder_name, exist_ok=True)

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# The timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the TSV file
response.raise_for_status()

# The local file name is the last path segment of the URL
with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)

In [4]:
# Gathering additional data (retweet count and favorite count) from the Twitter API.
# This cell builds the authenticated API object from the app's keys and tokens.
# Each credential is read from an environment variable so that real secrets do not
# have to be saved in the notebook; the placeholder string is the fallback when the
# variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'MY_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'MY_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'MY_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'MY_ACCESS_SECRET')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit makes the client sleep until the rate-limit window resets;
# wait_on_rate_limit_notify prints a message when that happens.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [5]:
api_counts = {}

# Query the API once for every tweet id in the archive and store that tweet's
# retweet and favorite counts in api_counts, keyed by the integer tweet id.
# Iterating over the column directly avoids depending on the frame's index labels.
for i, tweet_id in enumerate(twitter_archive.tweet_id):
    try:
        # Only the API call is guarded, so unrelated errors are not swallowed
        tweet = api.get_status(tweet_id, tweet_mode='extended')
    except tweepy.TweepError:
        # The id is reported and skipped; it gets no entry in api_counts
        print(f"Tweet with ID '{tweet_id}' does not exist.")
    else:
        api_counts[int(tweet_id)] = {
            'retweet_count': tweet._json['retweet_count'],
            'favorite_count': tweet._json['favorite_count']
        }
    # Progress is reported outside the try block so that a failed lookup on a
    # multiple of 100 does not suppress the message
    if i % 100 == 0:
        print(f"{i} tweets processed.")

0 tweets processed.
Tweet with ID '888202515573088257' does not exist.
Tweet with ID '873697596434513921' does not exist.
100 tweets processed.
Tweet with ID '872668790621863937' does not exist.
Tweet with ID '872261713294495745' does not exist.
Tweet with ID '869988702071779329' does not exist.
Tweet with ID '866816280283807744' does not exist.
Tweet with ID '861769973181624320' does not exist.
Tweet with ID '856602993587888130' does not exist.
200 tweets processed.
Tweet with ID '851953902622658560' does not exist.
Tweet with ID '845459076796616705' does not exist.
Tweet with ID '844704788403113984' does not exist.
Tweet with ID '842892208864923648' does not exist.
Tweet with ID '837366284874571778' does not exist.
Tweet with ID '837012587749474308' does not exist.
300 tweets processed.
Tweet with ID '829374341691346946' does not exist.
Tweet with ID '827228250799742977' does not exist.
400 tweets processed.
500 tweets processed.
Tweet with ID '812747805718642688' does not exist.
Twe

Rate limit reached. Sleeping for: 624


900 tweets processed.
Tweet with ID '754011816964026368' does not exist.
1000 tweets processed.
1100 tweets processed.
1200 tweets processed.
1300 tweets processed.
1400 tweets processed.
1500 tweets processed.
1600 tweets processed.
1700 tweets processed.
Tweet with ID '680055455951884288' does not exist.


Rate limit reached. Sleeping for: 624


1800 tweets processed.
1900 tweets processed.
2000 tweets processed.
2100 tweets processed.
2200 tweets processed.
2300 tweets processed.


In [6]:
# Persist api_counts to tweet_json.txt as one JSON object per line,
# each of the form {tweet id: {retweet_count, favorite_count}}
with open("tweet_json.txt", "w") as outfile:
    for status_id, status_counts in api_counts.items():
        outfile.write(json.dumps({status_id: status_counts}))
        outfile.write('\n')

In [7]:
# Reading the tweet_json.txt file line by line into a dict keyed by line number
counts = {}
# The with-block closes the file handle as soon as the loop is done
with open('tweet_json.txt', 'r') as json_file:
    for i, line in enumerate(json_file):
        line_data = json.loads(line)
        # Each line holds a single {tweet_id: {...counts...}} pair
        key = list(line_data.keys())[0]
        value = line_data[key]
        # JSON object keys are strings, so tweet_id is stored as a str here
        value['tweet_id'] = key
        counts[i] = value

In [8]:
# Inspect the parsed records: line number -> retweet_count, favorite_count, tweet_id
counts

{0: {'retweet_count': 7664,
  'favorite_count': 36001,
  'tweet_id': '892420643555336193'},
 1: {'retweet_count': 5663,
  'favorite_count': 31071,
  'tweet_id': '892177421306343426'},
 2: {'retweet_count': 3758,
  'favorite_count': 23379,
  'tweet_id': '891815181378084864'},
 3: {'retweet_count': 7835,
  'favorite_count': 39299,
  'tweet_id': '891689557279858688'},
 4: {'retweet_count': 8437,
  'favorite_count': 37525,
  'tweet_id': '891327558926688256'},
 5: {'retweet_count': 2829,
  'favorite_count': 18906,
  'tweet_id': '891087950875897856'},
 6: {'retweet_count': 1842,
  'favorite_count': 10990,
  'tweet_id': '890971913173991426'},
 7: {'retweet_count': 17111,
  'favorite_count': 60681,
  'tweet_id': '890729181411237888'},
 8: {'retweet_count': 3892,
  'favorite_count': 26020,
  'tweet_id': '890609185150312448'},
 9: {'retweet_count': 6656,
  'favorite_count': 29725,
  'tweet_id': '890240255349198849'},
 10: {'retweet_count': 6643,
  'favorite_count': 28604,
  'tweet_id': '89000660

In [9]:
# Converting the above created dict into a data frame.
# The outer keys (line numbers) become the columns, so this frame has one column
# per tweet and one row per field (retweet_count, favorite_count, tweet_id).
counts_df = pd.DataFrame(counts)
counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2321,2322,2323,2324,2325,2326,2327,2328,2329,2330
retweet_count,7664,5663,3758,7835,8437,2829,1842,17111,3892,6656,...,53,124,220,772,51,40,130,41,42,460
favorite_count,36001,31071,23379,39299,37525,18906,10990,60681,26020,29725,...,105,268,407,1121,124,96,271,112,121,2396
tweet_id,892420643555336193,892177421306343426,891815181378084864,891689557279858688,891327558926688256,891087950875897856,890971913173991426,890729181411237888,890609185150312448,890240255349198849,...,666058600524156928,666057090499244032,666055525042405380,666051853826850816,666050758794694657,666049248165822465,666044226329800704,666033412701032449,666029285002620928,666020888022790149


In [10]:
# Transpose so that each row is one tweet and each column one field.
# The frame is rebuilt from `counts` rather than from counts_df itself, so running
# this cell more than once always gives the same orientation.
counts_df = pd.DataFrame(counts).T

# Assess

    Assessing the twitter_archive data frame: 

In [11]:
# Visual assessment: look at the first and last rows of the archive
twitter_archive

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,


In [13]:
# Column dtypes and non-null counts of the archive
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [15]:
# Frequency of each distinct rating numerator, to spot unusual values
twitter_archive.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [18]:
# Summary statistics of the rating numerators
twitter_archive.rating_numerator.describe()

count    2356.000000
mean       13.126486
std        45.876648
min         0.000000
25%        10.000000
50%        11.000000
75%        12.000000
max      1776.000000
Name: rating_numerator, dtype: float64

In [16]:
# Frequency of each distinct rating denominator
twitter_archive.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [20]:
# Rows whose tweet_id already appeared in an earlier row;
# an empty frame means there are no duplicate ids
twitter_archive[twitter_archive.tweet_id.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [27]:
# Non-null count per column, restricted to rows whose name is the literal string 'None'
twitter_archive[twitter_archive.name == 'None'].count()

tweet_id                      745
in_reply_to_status_id          77
in_reply_to_user_id            77
timestamp                     745
source                        745
text                          745
retweeted_status_id            65
retweeted_status_user_id       65
retweeted_status_timestamp     65
expanded_urls                 686
rating_numerator              745
rating_denominator            745
name                          745
doggo                         745
floofer                       745
pupper                        745
puppo                         745
dtype: int64

In [52]:
# Collect every entry of the name column that is exactly the string 'None'
names_None = [
    entry
    for entry in twitter_archive.name
    if type(entry) is str and entry == 'None'
]

In [56]:
# Number of name entries equal to the string 'None'
len(names_None)

745

In [48]:
# Collect every entry of the name column that is a one-character string
names = [
    entry
    for entry in twitter_archive.name
    if type(entry) is str and len(entry) == 1
]

In [49]:
# Show the one-character names that were collected
names

['a',
 'a',
 'O',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a']

In [66]:
# Sorted timestamps, to see the period the archive covers.
# The column is still text (dtype object), so this is a string sort.
twitter_archive.timestamp.sort_values()

2355    2015-11-15 22:32:08 +0000
2354    2015-11-15 23:05:30 +0000
2353    2015-11-15 23:21:54 +0000
2352    2015-11-16 00:04:52 +0000
2351    2015-11-16 00:24:50 +0000
                  ...            
4       2017-07-29 16:00:24 +0000
3       2017-07-30 15:58:51 +0000
2       2017-07-31 00:18:03 +0000
1       2017-08-01 00:17:27 +0000
0       2017-08-01 16:23:56 +0000
Name: timestamp, Length: 2356, dtype: object

    Assessing the counts data frame:

In [12]:
# Visual assessment: look at the first and last rows of the API counts
counts_df

Unnamed: 0,retweet_count,favorite_count,tweet_id
0,7664,36001,892420643555336193
1,5663,31071,892177421306343426
2,3758,23379,891815181378084864
3,7835,39299,891689557279858688
4,8437,37525,891327558926688256
...,...,...,...
2326,40,96,666049248165822465
2327,130,271,666044226329800704
2328,41,112,666033412701032449
2329,42,121,666029285002620928


In [14]:
# Column dtypes and non-null counts of the API counts
counts_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2331 entries, 0 to 2330
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   retweet_count   2331 non-null   object
 1   favorite_count  2331 non-null   object
 2   tweet_id        2331 non-null   object
dtypes: object(3)
memory usage: 152.8+ KB


In [62]:
# Summary of each column (count, unique values, most frequent value and its frequency)
counts_df.describe()

Unnamed: 0,retweet_count,favorite_count,tweet_id
count,2331,2331,2331
unique,1671,1983,2331
top,224,0,739238157791694849
freq,7,163,1


In [21]:
# Rows whose tweet_id already appeared in an earlier row;
# an empty frame means there are no duplicate ids
counts_df[counts_df.tweet_id.duplicated()]

Unnamed: 0,retweet_count,favorite_count,tweet_id


In [23]:
# Number of missing retweet counts
sum(counts_df.retweet_count.isnull())

0

In [24]:
# Number of missing favorite counts
sum(counts_df.favorite_count.isnull())

0

    Checking the content of the above-created image predictions folder

In [67]:
# List the files in the download folder to confirm the TSV file is there
os.listdir(folder_name)

['image-predictions.tsv']

## Quality issues

### `twitter_archive` data frame:
1. Missing values for the following columns:
        - in_reply_to_status_id
        - in_reply_to_user_id
        - retweeted_status_id
        - retweeted_status_user_id
        - retweeted_status_timestamp
        - expanded_urls
2. Erroneous datatypes for the following columns:
        - timestamp
        - doggo
        - floofer
        - pupper
        - puppo
3. Missing and erroneous values for the column:
        - name
4. Erroneous values for the rating columns:
        - numerator values that deviate strongly from the commonly used ratings, i.e. values that occur only once or twice
        - denominator values other than 10
5. The number of tweet_ids in the archive (2356) does not match the number of tweets collected from the API (2331)


### `counts` data frame:
8. Erroneous datatypes for the following columns:
        - retweet_count
        - favorite_count
        - tweet_id