### Import packages / setup

In [108]:
import datetime
import tweepy
from tweepy import OAuthHandler
import json
import pandas as pd
import csv
import re
import string
import os
import time
import random
import numpy as np
from nltk.corpus import stopwords
sw = stopwords.words('english')
from string import punctuation
from collections import Counter, defaultdict
from pprint import pprint
from operator import itemgetter

# I've put my API keys in a .py file called API_keys.py
from my_api_keys import api_key, api_key_secret, access_token, access_token_secret

In [110]:
# Authenticate the Tweepy API
auth = tweepy.OAuthHandler(api_key,api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)

In [128]:
# Modifications to punctuation and sw lists

punctuation = set(punctuation)
punctuation.add("’")

sw2 = set(sw)
addl = ("|","-","/","•","&")
sw2.update(addl)

### Function for scraping tweets related to COP26
#### The function scrapes 15k tweets per day and stores in a CSV file

In [74]:
def scrapetweets(the_api, search_words, numtweets, numruns):
    
    # Define a for-loop to generate tweets at regular intervals
    # We cannot make large API call in one go. Hence, let's try numruns times

    # Define a pandas dataframe to store the date:
    db_tweets = pd.DataFrame(columns = ['user_id','screen_name','description','location','friends_count',
           'followers_count','totaltweets','date_created', 'tweet_id', 'retweetcount','full_text'])
    
    program_start = time.time()
    for i in range(0, numruns):
        # We will time how long it takes to scrape tweets for each run:
        start_run = time.time()
        
        # Collect tweets using the Cursor object
        # .Cursor() returns an object that you can iterate or loop over to access the data collected.
        # Each item in the iterator has various attributes that you can access to get information about each tweet
        tweets = tweepy.Cursor(the_api.search_tweets, 
                               q=search_words, 
                               lang="en", 
                               tweet_mode='extended'
                              ).items(numtweets)
        
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        
        # Begin scraping the tweets individually:
        noTweets = 0
    
        for tweet in tweet_list:
            userid = tweet.user.id
            username = tweet.user.screen_name
            description = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            date_created = tweet.created_at
            tweet_id = tweet.id
            retweetcount = tweet.retweet_count
            full_text = tweet.full_text

            # Add the 11 variables to the empty list - ith_tweet:
            ith_tweet = [userid, username, description, location, following, followers, totaltweets,
                         date_created, tweet_id, retweetcount, full_text]

            # Append to dataframe - db_tweets
            db_tweets.loc[len(db_tweets)] = ith_tweet

            # increase counter - noTweets  
            noTweets += 1
                       
            
            
        # Run ended:
        end_run = time.time()
        duration_run = round((end_run-start_run)/60, 2)

        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        print('time take for {} run to complete is {} mins'.format(i+1, duration_run))

        time.sleep(920) #15 minute sleep time between runs

    # Once all runs have completed, save them to a single csv file:
    
    # Obtain timestamp in a readable format
    to_csv_timestamp = datetime.date.today().strftime('%Y%m%d_%H%M%S')
    
    # Define working path and filename
    path = os.getcwd()
    filename = path + '/data/' + to_csv_timestamp + '_cop_tweets.csv'
    
    # Store dataframe in csv with creation date timestamp
    db_tweets.to_csv(filename, index = False)
    
    program_end = time.time()
    print("\n")
    print(f'Scraping for {startdate} to {enddate} has completed!')
    print('Total time taken to scrape is {} minutes.'.format(round(program_end - program_start)/60, 2))

In [75]:
# COP26 was Oct 31 to Nov 12. Pull the conference, plus three days on either end. 
# So need Oct 28 to Nov 15. 

startdate = "2021-10-30"
enddate = "2021-10-31"

search_words = f'#cop26 OR #COPglasgow since:{startdate} until:{enddate} -filter:retweets'
numtweets=2500
numruns=6

scrapetweets(api, search_words, numtweets, numruns)

no. of tweets scraped for run 1 is 1
time take for 1 run to complete is 0.01 mins


Scraping for 2021-10-29 to 2021-10-30 has completed!
Total time taken to scrape is 0.0 minutes.


###  Issues to address

deal with carriage returns

tab-separator

deal with hashtags separators

####  Read daily tweets CSVs into pandas dataframe

In [64]:
db = pd.DataFrame(columns = ['user_id','screen_name','description','location','friends_count',
           'followers_count','totaltweets','date_created', 'tweet_id', 'retweetcount','full_text'])

file_location = "/Users/natebender/Desktop/Repo/text-mining/datashare_AA_COP_tweets/data/"
files = sorted(os.listdir(file_location))
for idx, file in enumerate(files):
    
    data = "".join([file_location,file])
    datafile = pd.read_csv(data)

    db = db.append(datafile,ignore_index=True)

In [230]:
# Check desc stats on overall descriptions before splitting into groups
def get_patterns(all_descriptions) :

    all_desc = all_descriptions.dropna()
    all_str = " ".join(all_desc)    
    clean = [w for w in all_str.split() if w.lower() not in sw2]
    
    # Calculate your statistics here
    total_tokens = len(clean)
    unique_tokens = len(set(clean))
    clean_tok_len = [len(w) for w in clean]
    avg_token_len = np.mean(clean_tok_len)
    lex_diversity = len(set(clean))/len(clean)
    top_10 = Counter(clean).most_common(10)
    
    
    # Now we'll fill out the dictionary. 
    results = {'tokens':total_tokens,
               'unique_tokens':unique_tokens,
               'avg_token_length':round(avg_token_len,2),
               'lexical_diversity':round(lex_diversity,2),
               'top_10':top_10}

    return(results)

### Desc stats on database

In [237]:
# Drop NA values from description column
all_desc = db.description.dropna()
print(f'Database: {"{:,}".format(len(db.tweet_id))} tweets')
print(f'With NAs removed: {"{:,}".format(len(all_desc))} descriptions')
print(f'Descriptive stats are:')
get_patterns(db.description)

Database: 30,000 tweets
With NAs removed: 27,108 descriptions
Descriptive stats are:


{'tokens': 331578,
 'unique_tokens': 20173,
 'avg_token_length': 7.42,
 'lexical_diversity': 0.06,
 'top_10': [('climate', 1512),
  ('🏴\U000e0067\U000e0062\U000e0073\U000e0063\U000e0074\U000e007f', 1428),
  ('#COP26', 1380),
  ('Climate', 1254),
  ('Tweeting', 1194),
  ('🤖', 1128),
  ('#UK', 1104),
  ('aviation', 1104),
  ('#COP26Glasgow', 1104),
  ('activity', 1098)]}

### Lexicon expansion function to analyze tweets by group

In [133]:
class Lexicon :
    def __init__(self, corpus, search_condition, num_words, ratio_cutoff):
        self.corpus = list(corpus)
        self.search_condition = search_condition
        self.num_words = num_words
        self.ratio_cutoff = ratio_cutoff
        
    def parameters(self) :
        print(f"The corpus is {len(self.corpus)} total tweets")
        print(f"The search string is {self.search_condition}")
        print(f"Words must appear {self.ratio_cutoff} times in both corpora to be included in analysis")
        print(f"Descriptive stats and comparisons for {self.num_words} words are returned")
        
    def lex_expansion(self) :

        # Set starting marker for keeping track of how long the function runs
        start_time = datetime.datetime.now()

        group_1 = []

        for string in self.corpus :
            if self.search_condition.search(string) :
                group_1.append(string)

        group_1 = list(set(group_1))
        group_2 = [string for string in self.corpus if string not in group_1]

        group_1_tweets = ("{:,}".format(len(group_1)))
        group_2_tweets = ("{:,}".format(len(group_2)))    

        # Tokenize the two groups by first turning them each into single large strings
        g1_str = " ".join(group_1)    
        g2_str = " ".join(group_2)

        # Then splitting each into lists of strings. Once we've created groups, we no longer need to know
        # which words came from which specific tweets, as long as we keep track of which group the 
        # words come from.
        # We also tokenize and normalize at this stage. We want to remove stopwords, but retain numeric as numbers
        # could be used in interesting words, hashtags, or accounts in a corpus of tweets. 
        g1_uclean = [w for w in g1_str.split()]
        g2_uclean = [w for w in g2_str.split()]

        g1_clean = [w for w in g1_str.split() if w.lower() not in sw2]
        g2_clean = [w for w in g2_str.split()if w.lower() not in sw2]

        g1_len = len(g1_clean)
        g2_len = len(g2_clean)

        # SECT2: CREATE "CUTOFF_LIST" LIST OF WORDS THAT MEET CUTOFF RATIO
        # Create Counter dictionary of each corpus, used for determining words that meet cutoff ratio
        wcount_one = Counter(g1_clean)
        wcount_two = Counter(g2_clean)

        # Create list of words that meet the ratio cutoff in BOTH corpora, print result
        cutoff_list = list()
        candidate_words = list(wcount_one.keys()) + list(wcount_two.keys())
        candidate_words = set(candidate_words)
        for word in candidate_words :
            if wcount_one[word] >= self.ratio_cutoff and wcount_two[word] >= self.ratio_cutoff :
                cutoff_list.append(word)

        cutoff_statement = f'There are {"{:,}".format(len(cutoff_list))} words that meet the usage cutoff of {self.ratio_cutoff} appearances in both Group 1 and Group 2'

        # SECT3: CALCULATE METRICS ON WORDS IN CUTOFF_LIST
        # Create "metrics", an intermediate dict to hold data that will be passed 
        # to different dictionaries in the final "results" dict of dicts output
        metrics = defaultdict(list)

        for word in cutoff_list:        
            metrics[word].append(len([w for w in g1_clean if w==word]))  # word count in corpus_1
            metrics[word].append(len([w for w in g2_clean if w==word]))  # word count in corpus_2
            metrics[word].append(len([w for w in g1_clean if w==word])/g1_len)  # ratio of word count to corpus_1 length
            metrics[word].append(len([w for w in g2_clean if w==word])/g2_len)  # ratio of word count to corpus_2 length 

        # Loop through through the defaultdict and append word ratio for the each word of the corpus
        for word, nums in metrics.items() :
            # Make sure any zero ratios are excluded
            if (nums[2] * nums[3] > 0) :
                metrics[word].append(nums[2]/nums[3])  # appends Corpus_1/Corpus_2 index
                metrics[word].append(nums[3]/nums[2])  # appends Corpus_2/Corpus_1 index
            else :
                metrics[word].append(None)
                metrics[word].append(None)


        # SECT4: APPEND RESPECTIVE INDEXES (CORP1/CORP2 RATIO AND CORP2/CORP1 RATIO) TO "METRICS" DICT
        one_v_two = defaultdict(list)
        two_v_one = defaultdict(list)

        # Append word/index pair in "one_v_two" dict
        for word, nums in metrics.items() :
            one_v_two[word].append(nums[4])

        # Sort "one_v_two" down to just the "num_words" key:value pairs,
        # sorted in descending order of index ratio.
        one_v_two_sort = dict(sorted(one_v_two.items(), key = itemgetter(1), reverse = True)[:self.num_words])

        # Lastly, round those values to two decimal points for readability    
        for dict_value in one_v_two_sort :
            for k, v in one_v_two_sort.items() :
                one_v_two_sort[k] = [round(v,2) for v in one_v_two_sort[k]]

        # Same process for "two_v_one"
        # Append word/index pair in "two_v_one" dict
        for word, nums in metrics.items() :
            two_v_one[word].append(nums[5])

        # Sort "two_v_one" down to just the "num_words" key:value pairs, 
        # sorted in descending order of index ratio.
        two_v_one_sort = dict(sorted(two_v_one.items(), key = itemgetter(1), reverse = True)[:self.num_words])         

        # Lastly, round those values to two decimal points for readability    
        for dict_value in two_v_one_sort :
            for k, v in two_v_one_sort.items() :
                two_v_one_sort[k] = [round(v,2) for v in two_v_one_sort[k]]      


        # SECT5: CALCULATE METRICS FOR "CORP1" AND "CORP2" KEYS IN "RESULTS" FINAL DICT
        # Descriptive stats calcs for corpus_1
        total_tokens_1 = len(g1_clean)
        unique_tokens_1 = len(set(g1_clean))
        word_len_1 = [len(w) for w in g1_clean]
        avg_token_len_1 = np.mean(word_len_1)
        lex_diversity_1 = len(set(g1_clean))/len(g1_clean)
        top_n_1 = Counter(g1_clean).most_common(self.num_words)

        # Descriptive stats calcs for corpus_2
        total_tokens_2 = len(g2_clean)
        unique_tokens_2 = len(set(g2_clean))
        word_len_2 = [len(w) for w in g2_clean]
        avg_token_len_2 = np.mean(word_len_2)
        lex_diversity_2 = len(set(g2_clean))/len(g2_clean)
        top_n_2 = Counter(g2_clean).most_common(self.num_words)


        # SECT6: BRING EVERYTHING TOGETHER IN "RESULTS" FINAL DICT
        results1 = {'Group_1': {'Number_of_tweets':group_1_tweets,
                                'Number_of_words':total_tokens_1,
                              'Unique_words':unique_tokens_1,
                              'Avg _length':round(avg_token_len_1, 2),
                              'Lexical_diversity':round(lex_diversity_1, 2),
                              'Top_'+str(self.num_words):top_n_1},
                   'Group_2': {'Number_of_tweets':group_2_tweets,
                               'Number_of_words':total_tokens_2,
                              'Unique_words':unique_tokens_2,
                              'Avg_word_length':round(avg_token_len_2, 2),
                              'Lexical_diversity':round(lex_diversity_2, 2),
                              'Top_'+str(self.num_words):top_n_2},
                   'Group1_vs_Group2': one_v_two_sort,
                   'Group2_vs_Group1': two_v_one_sort,
                   'cutoff_statement': cutoff_statement}

        # Print elapsed time after function has run
        end_time = datetime.datetime.now()
        elapsed_time = end_time - start_time
        print(elapsed_time)
        return(results1)

In [216]:
search_1 = re.compile(r"\bfuture\b")
r1 = Lexicon(all_desc.apply(str), search_1, 10, 3)

In [217]:
r1.parameters()

The corpus is 27108 total tweets
The search string is re.compile('\\bfuture\\b')
Words must appear 3 times in both corpora to be included in analysis
Descriptive stats and comparisons for 10 words are returned


In [218]:
round1 = r1.lex_expansion()

0:00:01.883474


In [219]:
print(f'Group 1 tweets:')
print(round1['Group_1']['Number_of_tweets'])
print(f'Group 2 tweets:')
print(round1['Group_2']["Number_of_tweets"])
print(round1["cutoff_statement"])

Group 1 tweets:
71
Group 2 tweets:
26,442
There are 25 words that meet the usage cutoff of 3 appearances in both Group 1 and Group 2


In [220]:
print(f'Round 1: Group 1 v Group 2:')
pprint(sorted(round1['Group1_vs_Group2'].items(), key = itemgetter(1), reverse = True))

Round 1: Group 1 v Group 2:
[('generations', [170.72]),
 ('Scotland’s', [56.91]),
 ('we’re', [56.91]),
 ('current', [56.91]),
 ('wild', [34.14]),
 ('build', [26.56]),
 ('clean', [18.97]),
 ('better', [18.29]),
 ('healthy', [17.07]),
 ('Promoting', [17.07])]


In [221]:
print(f'Round 1: Group 2 v Group 1:')
pprint(sorted(round1['Group2_vs_Group1'].items(), key = itemgetter(1), reverse = True))

Round 1: Group 2 v Group 1:
[('climate', [0.86]),
 ('people', [0.27]),
 ('health', [0.26]),
 ('global', [0.25]),
 ('world', [0.21]),
 ('together', [0.17]),
 ('planet', [0.16]),
 ('energy', [0.12]),
 ('you.', [0.11]),
 ('environment,', [0.1])]


something SOMETHING
<hr style="border:2px solid gray"> </hr> 

In [None]:
search_1 = re.compile(r"\bclimate\b")
r1 = Lexicon(db.description.apply(str), search_1, 10, 5)

In [None]:
round1 = r1.lex_expansion()

In [11]:
print(f'Group 1 tweets:')
print(x2['Group_1']['Number_of_tweets'])
print(f'Group 2 tweets:')
print(x2['Group_2']["Number_of_tweets"])
print(x2["cutoff_statement"])

Group 1 tweets:
1,539
Group 2 tweets:
336,432
There are 1,129 words that meet the usage cutoff


In [None]:
print(f'Round 2: Group 1 v Group 2:')
pprint(sorted(x2['Group1_vs_Group2'].items(), key = itemgetter(1), reverse = True))

In [13]:
print(f'Round 2: Group 2 v Group 1:')
pprint(sorted(x2['Group2_vs_Group1'].items(), key = itemgetter(1), reverse = True))

Round 3: Group 2 v Group 1:
[('#vapefam', [11.27]),
 ('#vape', [7.98]),
 ('pod', [6.98]),
 ('#vapelife', [6.39]),
 ('tank', [5.92]),
 ('addicted', [5.4]),
 ('#eliquid', [5.39]),
 ('#vaping', [5.02]),
 ('#vapelyfe', [4.89]),
 ('vapes', [4.78])]
