In [None]:
import re
import json
import nltk

In [None]:
# Load the raw tweet dump. JSON text is UTF-8 and tweets are full of non-ASCII
# characters, so the encoding is pinned instead of using the platform default.
with open("fake-news-Dec-8th.json", "r", encoding="utf-8") as source:
    alltweets = json.load(source)

# Distinct tweet texts: identical texts collapse into a single entry.
unique_tweets = {tweet['full_text'] for tweet in alltweets}

print(f"Total tweets: {len(alltweets)}")
print(f"Total unique (text): {len(unique_tweets)}")

# Tweet ids, largest first.
# NOTE(review): the range below assumes ids grow over time, i.e. the smallest
# id is the oldest tweet and the largest the newest -- confirm for this dataset.
ids = sorted((tweet['id'] for tweet in alltweets), reverse=True)

print("\nTweets collected between:")
if ids:
    first = ids[0]    # largest id
    last = ids[-1]    # smallest id
    # Look the two timestamps up by id so that each one is printed exactly
    # once (duplicate ids in the dump used to print repeatedly) and always in
    # the same order, regardless of how the file happens to be ordered.
    created_by_id = {tweet['id']: tweet['created_at'] for tweet in alltweets}
    print(created_by_id[last])
    print(created_by_id[first])
else:
    # Empty dump: nothing to report, but keep the names defined.
    first = last = None
print("---")

In [None]:
from typing import List

def tweet_tokenize(tweet: dict,*,casefold=False) -> List[str]:
    """Split a tweet's ``full_text`` into tokens with a tweet-aware regex.

    Args:
        tweet: Tweet dict; only ``tweet['full_text']`` is read.
        casefold: When true, the text is casefolded before tokenizing.

    Returns:
        The tokens in order of appearance, as returned by
        ``nltk.regexp_tokenize``. Text that matches none of the alternatives
        is dropped.
    """
    # Raw string: the pattern is full of regex escapes (\w, \., \#, ...) that
    # are not valid *string* escapes and make Python emit invalid-escape
    # warnings when written in a normal literal. The regex itself is unchanged.
    # Alternatives are tried left to right, so their order matters.
    pattern = r"""(?x)                 # VERBOSE
        (?:[A-Za-z]\.)+                # abbreviations
        | \w+['’]\w\w?                 # contractions
        | \#\w+                        # hashtags
        | \@\w+                        # mentions
        | https?://\w+\.\w+\.?(?:/\w+) # links
        | \w+(?:-\w+)*                 # hyphenated words/names
        | \$?\d+(?:\.\d+)?%?           # currency or percentages
        | &amp;                        # &
        """
    text = tweet['full_text'].casefold() if casefold else tweet['full_text']
    return nltk.regexp_tokenize(text, pattern)

def ngrams(symbols: list, n=3):
    """Yield every contiguous run of ``n`` items of ``symbols`` as a tuple.

    Args:
        symbols: Sequence to slide the window over.
        n: Window size.

    Yields:
        One tuple per window position, left to right. Nothing is yielded
        when ``symbols`` holds fewer than ``n`` items.
    """
    # The range is empty whenever len(symbols) < n, so no separate length
    # guard is needed.
    for start in range(len(symbols) - n + 1):
        yield tuple(symbols[start:start + n])
        
def has_hash(tweets: List[dict]) -> List[dict]:
    """Lazily yield each tweet whose ``entities.hashtags`` list is non-empty."""
    yield from (entry for entry in tweets if entry['entities']['hashtags'])
            
def has_url(tweets: List[dict]) -> List[dict]:
    """Lazily yield each tweet whose ``entities.urls`` list is non-empty.

    Urls show up for quoted tweets as well as for shared links/media.
    """
    yield from (entry for entry in tweets if entry['entities']['urls'])

def has_quote(tweets: List[dict]) -> List[dict]:
    """Lazily yield each tweet whose ``is_quote_status`` flag is truthy."""
    yield from (entry for entry in tweets if entry['is_quote_status'])
            
def yield_quoted(tweets: List[dict]) -> dict:
    """Yield the embedded ``quoted_status`` of every tweet flagged as a quote.

    Tweets are taken from ``has_quote(tweets)``. A tweet that is flagged as
    quoting but has no ``quoted_status`` key is reported with a printed
    message (including its id) and skipped.
    """
    for tweet in has_quote(tweets):
        # Keep the try block limited to the lookup: with the yield inside it,
        # a KeyError thrown into the generator by the consumer would be
        # swallowed and reported as a missing quoted tweet.
        try:
            quoted = tweet['quoted_status']
        except KeyError:
            print(f"id: {tweet['id']} has no quoted tweet.")
            continue
        yield quoted

def get_quoted(tweet):
    """Return the tweet embedded under ``tweet['quoted_status']``.

    Raises KeyError when the tweet carries no ``quoted_status`` entry.
    """
    embedded = tweet['quoted_status']
    return embedded

def tweet_hashtags(tweet: dict,*,casefold=True) -> str:
    """Yield the text of each hashtag listed in ``tweet['entities']['hashtags']``.

    The text is casefolded unless ``casefold`` is false.
    """
    yield from (
        entry['text'].casefold() if casefold else entry['text']
        for entry in tweet['entities']['hashtags']
    )

def tweet_urls(tweet: dict,*,casefold=False) -> str:
    """Yield the expanded, non-status URLs embedded in a tweet.

    Args:
        tweet: Tweet dict; ``tweet['entities']['urls']`` is read and each
            entry's ``expanded_url`` is inspected.
        casefold: When true, yield casefolded URLs. Off by default because
            some urls are case-sensitive.

    Yields:
        Each ``expanded_url`` that does not look like a link to a tweet
        (``twitter.com/<name>/status/``).
    """
    # Raw string so \w is a regex escape rather than an invalid string escape;
    # the dot is escaped so only a literal "twitter.com" is filtered out.
    status_link = re.compile(r'twitter\.com/\w+?/status/')
    for url_dict in tweet["entities"]["urls"]:
        expanded = url_dict['expanded_url']
        if status_link.search(expanded):
            continue
        yield expanded.casefold() if casefold else expanded
    
def get_screenname(tweet: dict,*,casefold=True) -> str:
    """Return the author's ``screen_name``, casefolded unless ``casefold`` is false."""
    handle = tweet['user']['screen_name']
    return handle.casefold() if casefold else handle

def search_keys(target_query: str,target_dict: dict) -> str:
    """Yield every key of ``target_dict`` that contains ``target_query``.

    Handy for finding the keys under which tweets are filed by hashtag,
    url or quoted user.
    """
    yield from (name for name in target_dict.keys() if target_query in name)

In [None]:
# Casefold and tokenize all tweets, and get rid of url tokens
tokenized_tweets = [
    [token for token in tweet_tokenize(tweet,casefold=True)
     if not token.startswith('http')]
    for tweet in alltweets
]

# Calculate avg length of tweets, in tokens.
# Reported as 0 for an empty dataset instead of dividing by zero.
all_lens = sum(len(tokens) for tokens in tokenized_tweets)
avg_len = all_lens/len(tokenized_tweets) if tokenized_tweets else 0.0
print(f"Then avergage tweet length (without urls) is *{round(avg_len,2)}* tokens.")

# Compile all short tweets (at most 10 tokens) that begin or end with 'fake news'
shortnsweet = [
    tokens for tokens in tokenized_tweets
    if len(tokens) <= 10
    and (tokens[0:2]==['fake','news'] or tokens[-2:]==['fake','news'])
]
# The limit above counts tokens, so the message reports tokens (not chars).
print(f"*{len(shortnsweet)}* tweets are <= 10 tokens and begin or end with 'fake news'.")

# Find tweets in which at least 30% of the tokens are ALL CAPS
yelling = []
for tweet in alltweets:
    tokens = tweet_tokenize(tweet,casefold=False)
    # A tweet whose text matches none of the tokenizer's alternatives yields
    # no tokens; skip it rather than divide by zero.
    if not tokens:
        continue
    yells = sum(1 for token in tokens if token.isupper())
    if yells/len(tokens) >= 0.3: yelling.append(tweet)

print(f"*{len(yelling)}* involve some yelling.")
print('---')

In [None]:
# Collect all tweets with hashtags
has_tags = list(has_hash(alltweets))
print(f"*{len(has_tags)}* tweets contain hashtags.")

# Collect all tweets with urls
# Urls indicate quoted tweets *and* links
has_urls = list(has_url(alltweets))
print(f"*{len(has_urls)}* tweets contain urls.")

# Collect only the tweets marked as quoting another user
are_quoting = list(has_quote(alltweets))
print(f">> *{len(are_quoting)}* tweets are marked as quoting another user.")

# Collect all other tweets that shared a link.
# are_quoting holds exactly the tweets with a truthy 'is_quote_status', so the
# flag is checked directly; searching the are_quoting list for every tweet
# compared whole tweet dicts and was quadratic in the number of tweets.
not_quoting = [tweet for tweet in has_urls if not tweet['is_quote_status']]
print(f">> *{len(not_quoting)}* tweets shared at least one link.")

# Collect the embedded quoted tweets
is_quoted_tweet = list(yield_quoted(alltweets))
print(f">> *{len(is_quoted_tweet)}* quoted tweets are accesible via the API.")
print('---')

In [None]:
# Clean up tweets that are quote-marked w/o an embedded tweet.
# `bad` holds the ids of tweets flagged as quoting that carry no
# 'quoted_status' entry. It is derived from the data: left as an empty
# literal, nothing was ever cleaned and any later lookup of
# tweet['quoted_status'] on such a tweet raises KeyError.
bad = {tweet['id'] for tweet in are_quoting if 'quoted_status' not in tweet}
for tweet in are_quoting:
    if tweet['id'] in bad:
        # Mutates the tweet dict shared with alltweets, so the re-scan
        # below no longer picks it up.
        tweet['is_quote_status'] = False

are_quoting = list(has_quote(alltweets))
print(f">> *{len(are_quoting)}* tweets are now marked as quoting another user.")
print('---')

In [None]:
# File every quoting tweet under the screen name of the account it quotes,
# and keep a separate set of those screen names as a cross-check.
quoted_users = set()
by_quoted_user = nltk.defaultdict(list)
for quoting in are_quoting:
    handle = get_screenname(get_quoted(quoting))
    quoted_users.add(handle)
    by_quoted_user[handle].append(quoting)

# Both collections must agree on the number of quoted users
assert len(quoted_users) == len(by_quoted_user)

# (screen name, number of quoting tweets), most quoted first
tweets_per_quoted = sorted(
    ((handle, len(quoting_tweets))
     for handle, quoting_tweets in by_quoted_user.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
top_20_quoted = tweets_per_quoted[:20]

print("Top 20 quoted users:")
for ent, num in top_20_quoted:
    print('---')
    print(f"{num} tweets quoted @{ent}.")
print('---')

In [None]:
# Map hashtags associated with the quoted entities:
# quoted screen name -> set of hashtags used by the tweets quoting it
tags_with_quoted = nltk.defaultdict(set)
for key in by_quoted_user.keys():
    for tweet in has_hash(by_quoted_user[key]):
        tags_with_quoted[key].update(tweet_hashtags(tweet))

print("Tags associated with @civmilair:")
# sorted(): the iteration order of a set of strings changes between kernel
# sessions, so sorting keeps the printed output reproducible.
# .get(): a plain [] lookup on the defaultdict would insert an empty entry
# for a user that was never quoted and print a lone "#".
civmilair_tags = sorted(tags_with_quoted.get('civmilair', ()))
if civmilair_tags:
    print("#"+"\n#".join(civmilair_tags))

In [None]:
# Tally how many tweets each (casefolded) screen name authored
most_active = nltk.FreqDist(get_screenname(tweet) for tweet in alltweets)

# (screen name, count) pairs, busiest accounts first
allusers_sorted = most_active.most_common()

# These are mostly bots
for ent, num in allusers_sorted[:20]:
    print(f"@{ent} tweeted {num} times.")
    print('---')

In [None]:
# In one pass over the hashtagged tweets, count how often each tag appears
# and file every tweet under each of the tags it contains.
tagfreq = nltk.FreqDist()
tweets_by_tag = nltk.defaultdict(list)
for tweet in has_hash(alltweets):
    tags = list(tweet_hashtags(tweet))
    tagfreq.update(tags)
    for tag in tags:
        tweets_by_tag[tag].append(tweet)

# (tag, count) pairs, most frequent first
alltags_sorted = tagfreq.most_common()

# Every counted tag must also have tweets filed under it
assert len(alltags_sorted) == len(tweets_by_tag)

for tag, num in alltags_sorted[:20]:
    print(f"#{tag} was tweeted {num} times.")
    print('---')

In [None]:
### Strip urls as much as possible to map tweets by their news source
all_links = [tweeturl for tweet in not_quoting for tweeturl in tweet_urls(tweet)]

# Raw string: \. and \w are regex escapes, not valid string escapes, and a
# normal literal makes Python emit invalid-escape warnings. The regex itself
# is unchanged.
url_stripped=r"""(?x)
            (?:https?://)?          # http or https protocol (non-capturing)
            (                       # begin capture match.groups()[0]
            (www\.\w+(-?\.?\w+)?)?  # www + host name (optional)
            (\w+-)?(\w+)?           # cont'd host name with hyphen
            (\.\w+(-?\.?\w+)*)?     # cont'd host names w/ hypen or dot
            )                       # end capture match.groups()[0]
            """
# Compile once instead of re-parsing the pattern for every url
url_stripped_re = re.compile(url_stripped)

sources = dict()
stripped_urls = set()

for url in all_links:
    # Every part of the pattern is optional, so search() always returns a
    # match object (possibly an empty match); it is run once per url and the
    # result reused for both collections.
    stripped = url_stripped_re.search(url).groups()[0]
    # Map stripped urls by the full url in the tweet
    sources[url] = stripped
    # Also collect stripped urls for comparison
    stripped_urls.add(stripped)

## Map tweets by stripped url
tweets_by_source=nltk.defaultdict(list)
for tweet in not_quoting:
    for url in tweet_urls(tweet):
        tweets_by_source[sources[url]].append(tweet)

assert len(stripped_urls) == len(tweets_by_source)

In [None]:
# Calculate the number of tweets containing each source (e.g. 'www.cnn.com')
tweets_per_source = [(key, len(tweets)) for key, tweets in tweets_by_source.items()]
tweets_per_source.sort(key=lambda x: x[1], reverse=True)
top_20_sources = tweets_per_source[0:20]

# Map tags associated with each source:
# stripped url -> set of hashtags used by the tweets sharing it
associated_tags = nltk.defaultdict(set)
for key in tweets_by_source.keys():
    for tweet in has_hash(tweets_by_source[key]):
        associated_tags[key].update(tweet_hashtags(tweet))

for source,num in top_20_sources:
    print('---')
    print(f"{num} shared from {source}")
print('---')
print('---')
print('GatewayPundit associated tags: ')
# sorted(): the iteration order of a set of strings changes between kernel
# sessions, so sorting keeps the printed output reproducible.
# .get(): a plain [] lookup on the defaultdict would insert an empty entry
# for a source that never appeared and print a lone "#".
gatewaypundit_tags = sorted(associated_tags.get('www.thegatewaypundit.com', ()))
if gatewaypundit_tags:
    print('#'+', #'.join(gatewaypundit_tags))
print('---')

In [None]:
# Length buckets; a tweet's length is its token count once urls, mentions
# and hashtags have been removed.
to5, to10, to15, to20 = [], [], [], []
to30, to40, to50, more50 = [], [], [], []
divided_by_size = [to5, to10, to15, to20, to30, to40, to50, more50]

# Inclusive upper bound of each bucket except the open-ended last one
upper_bounds = (5, 10, 15, 20, 30, 40, 50)

tokenized = (tweet_tokenize(tweet,casefold=False) for tweet in alltweets)
for tweet in tokenized:
    tokens = [token for token in tweet
              if not token.startswith(('http', '@', '#'))]
    # The first bucket whose bound fits takes the tweet ...
    for bound, bucket in zip(upper_bounds, divided_by_size):
        if len(tokens) <= bound:
            bucket.append(tokens)
            break
    else:
        # ... and anything longer than the largest bound lands in the last one
        more50.append(tokens)

def count_titles(tokens):
    """Return how many of ``tokens`` are title-cased according to ``str.istitle``."""
    return sum(1 for token in tokens if token.istitle())

# Per-tweet share of title-cased tokens (whole percent), grouped by bucket
individual_avgs = []
for bucket in divided_by_size:
    t_avg = []
    for tokens in bucket:
        if tokens:
            t_avg.append(round(count_titles(tokens)/len(tokens)*100))
        else:
            # A tweet made up only of urls, mentions and hashtags has no
            # tokens left; count it as 0% instead of dividing by zero.
            t_avg.append(0)
    individual_avgs.append(t_avg)

# Mean percentage per bucket; an empty bucket averages 0 instead of raising
ovall_avgs = [sum(t_avg)/len(t_avg) if t_avg else 0.0 for t_avg in individual_avgs]

# Split tweets into those more than 1.5x above their bucket's (rounded)
# average and the rest. Iterating the buckets directly avoids hard-coding
# how many there are.
plusavg_titlecase = []
avg_titlecase = []
for bucket, percents, bucket_avg in zip(divided_by_size, individual_avgs, ovall_avgs):
    threshold = round(bucket_avg,1)*1.5
    for tokens, percent in zip(bucket, percents):
        if percent > threshold:
            plusavg_titlecase.append(tokens)
        else:
            avg_titlecase.append(tokens)

# The first bucket holds tweets of up to and including 5 tokens, so it is
# labelled 0-5 rather than "under 5".
print(f"{len(to5)} tweets w/ 0-5 tokens avg {round(ovall_avgs[0],1)}% title case.")
print(f"{len(to10)} tweets w/ 6-10 tokens avg {round(ovall_avgs[1],1)}% title case.")
print(f"{len(to15)} tweets w/ 11-15 tokens avg {round(ovall_avgs[2],1)}% title case.")
print(f"{len(to20)} tweets w/ 16-20 tokens avg {round(ovall_avgs[3],1)}% title case.")
print(f"{len(to30)} tweets w/ 21-30 tokens avg {round(ovall_avgs[4],1)}% title case.")
print(f"{len(to40)} tweets w/ 31-40 tokens avg {round(ovall_avgs[5],1)}% title case.")
print(f"{len(to50)} tweets w/ 41-50 tokens avg {round(ovall_avgs[6],1)}% title case.")
print(f"{len(more50)} tweets over 50 tokens avg {round(ovall_avgs[7],1)}% title case.")
print('---')
print(f"{len(plusavg_titlecase)} tweets are above average for their group.")
print(f"{len(avg_titlecase)} tweets are about at or below average.")
print('---')

In [None]:
### Frequency of title cased tweets
# These are mostly article titles and bot-generated tweets

# Glue each token list back into a single space-separated string
reconstructed_titles = [' '.join(tokens) for tokens in plusavg_titlecase]

# (text, count) pairs, most repeated first
titles_sorted = nltk.FreqDist(reconstructed_titles).most_common()

# And the winners are...
for title,num in titles_sorted[:20]:
    print(f"{num} tweets of:\n\"{title}\"")
    print('---')

In [None]:
### Frequency of ngrams in normalish title-cased tweets

# First, casefold every token of every tweet
tokens_folded = [[token.casefold() for token in tokens] for tokens in avg_titlecase]

# Still have the same amount of tweets. An assert is used because a bare
# comparison in the middle of a cell is evaluated and silently discarded.
assert len(tokens_folded)==len(avg_titlecase)

# Frequency of Trigrams
folded_gramfreq=nltk.FreqDist()
for tokens in tokens_folded:
    folded_gramfreq.update(ngrams(tokens,3))

# ANYthing other than fake news: drop trigrams containing 'fake' or 'news'
common_nonFN = [
    (gram,freq) for gram, freq in folded_gramfreq.most_common()
    if "fake" not in gram and 'news' not in gram
]

# Trump's "ENEMY OF THE PEOPLE" tweet was a v hot topic
for gram,num in common_nonFN[0:25]:
    print(gram,num)

print("\n*** The End ***")