In [1]:
#tweepy package to collect tweets
import tweepy
import twitter_keys

# Build the tweepy auth handler from the consumer and access credentials
# exposed by the twitter_keys module (kept out of the notebook itself).
auth = tweepy.OAuthHandler(
    twitter_keys.CONSUMER_KEY,
    twitter_keys.CONSUMER_SECRET,
)
auth.set_access_token(
    twitter_keys.ACCESS_KEY,
    twitter_keys.ACCESS_SECRET,
)

# API client used by the cells below. wait_on_rate_limit=True is requested so
# that tweepy presumably pauses instead of failing when the rate limit is hit.
api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE(review): no request is visibly made in this cell, so this message
# presumably only confirms that the client object was constructed.
print("logged in")

logged in


In [None]:
import os
import pandas as pd
import numpy as np
import datetime
from textblob import TextBlob
import re
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer

# --- Query configuration ---------------------------------------------------
# Search term sent to Twitter; it is also embedded in the output path.
search = "semester"

# Upper bound on the number of tweets taken from the cursor.
max_tweets = 3000

# --- Accumulators for tweets that are NOT retweets --------------------------
# One list per attribute; position i in every list describes the same tweet.
timestamps = []
tweets = []
tweets_retweets = []
tweets_likes = []
tweets_length = []
tweets_source = []
tweets_sentiment = []

# --- Accumulators for ALL tweets (retweets included) ------------------------
all_tweets_timestamps = []
all_tweets_tweets = []
all_tweets_retweets = []
all_tweets_likes = []
all_tweets_length = []
all_tweets_source = []

# Pattern removing @mentions, every character that is not an ASCII letter,
# digit, space or tab, and URLs before the text goes to TextBlob.
# Written as a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes, and compiled once instead of once per tweet.
tweet_cleanup_re = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

#cursor to scrape data from twitter
# result_type was previously misspelled as "result_typetweets", so the
# "recent" ordering was never actually requested.
for tweet in tweepy.Cursor(api.search,
                            q=search,
                            count=100,
                            result_type="recent",
                            include_entities=True,
                            lang="en",
                            tweet_mode="extended").items(max_tweets):

    # every tweet (retweet or not) goes into the "all tweets" arrays
    all_tweets_timestamps.append(tweet.created_at)
    all_tweets_tweets.append(tweet.full_text)
    all_tweets_retweets.append(tweet.retweet_count)
    all_tweets_likes.append(tweet.favorite_count)
    all_tweets_length.append(len(tweet.full_text))
    all_tweets_source.append(tweet.source)

    # A tweet counts as a retweet when the API attached the original status
    # to it, or when it uses the manual "RT @user" prefix. Checking only the
    # first two characters wrongly discarded original tweets that merely
    # begin with the letters "RT".
    is_retweet = (hasattr(tweet, "retweeted_status")
                  or tweet.full_text.startswith("RT @"))
    if is_retweet:
        continue

    #analyze the sentiment of the cleaned, whitespace-normalised text
    temp = ' '.join(tweet_cleanup_re.sub(" ", tweet.full_text).split())
    analysis = TextBlob(temp)
    polarity = analysis.sentiment.polarity

    # collapse the polarity score to its sign: -1, 0 or 1
    if polarity < 0:
        num = -1
    elif polarity > 0:
        num = 1
    else:
        num = 0

    #add sentiment to array
    tweets_sentiment.append(num)

    #add all attributes of tweet to the non-retweet arrays
    timestamps.append(tweet.created_at)
    tweets.append(tweet.full_text)
    tweets_retweets.append(tweet.retweet_count)
    tweets_likes.append(tweet.favorite_count)
    tweets_length.append(len(tweet.full_text))
    tweets_source.append(tweet.source)

#message when done
print("done querying.")

# Date and time of this run; both become part of the output location.
now = datetime.datetime.now()
datestr = now.strftime('%Y-%m-%d')
# NOTE: the name `time` is kept unchanged in case other cells rely on it; be
# aware it would shadow the stdlib `time` module if that were imported.
time = now.strftime('%H-%M')

# Output folder: data/<date>/<search term>/
# makedirs creates the intermediate data/<date>/ level as well, and
# exist_ok=True makes a re-run on the same day a no-op instead of needing
# separate existence checks.
directory = "data/" + datestr + "/" + search + "/"
os.makedirs(directory, exist_ok=True)

#create file name for excel file and create writer to write to excel
xls_file = directory + search + "_" + time + "_data.xlsx"
writer = pd.ExcelWriter(xls_file)

#use pandas to export the collected data to sheets of the excel workbook
#prepare dataframe holding only the non-retweet tweets
df = pd.DataFrame({'timestamp': timestamps,
                   'likes': tweets_likes,
                   'retweets': tweets_retweets,
                   "source": tweets_source,
                   "length": tweets_length,
                   "sentiment": tweets_sentiment,
                   'tweet': tweets})

#write to excel
# The former encoding='utf-8' argument is dropped: to_excel no longer accepts
# it in pandas 2.0+.
df.to_excel(writer, sheet_name='non-rt')

#prepare a dataframe of all tweets (retweets included); it has no sentiment
#column because sentiment is only computed for non-retweets
df = pd.DataFrame({'timestamp': all_tweets_timestamps,
                   'likes': all_tweets_likes,
                   'retweets': all_tweets_retweets,
                   "source": all_tweets_source,
                   "length": all_tweets_length,
                   'tweet': all_tweets_tweets})

#write to excel; note that the sheet named 'rt' contains ALL tweets
df.to_excel(writer, sheet_name='rt')


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when ``values`` is empty.

    np.mean on an empty list also yields NaN but emits a RuntimeWarning;
    this keeps the result and avoids the warning.
    """
    return np.mean(values) if len(values) else np.nan


# Share of collected tweets that are not retweets. Guarded so a search that
# returns no tweets yields NaN instead of raising ZeroDivisionError.
if all_tweets_retweets:
    tweet_rt_ratio = len(tweets_retweets) / len(all_tweets_retweets)
else:
    tweet_rt_ratio = np.nan

#create metadata (one row of averages) and output to the excel workbook
meta_df = pd.DataFrame({"all_retweets": [_mean_or_nan(all_tweets_retweets)],
                        "all_likes": [_mean_or_nan(all_tweets_likes)],
                        "all_length": [_mean_or_nan(all_tweets_length)],
                        "retweets": [_mean_or_nan(tweets_retweets)],
                        "likes": [_mean_or_nan(tweets_likes)],
                        "length": [_mean_or_nan(tweets_length)],
                        "sentiment": [_mean_or_nan(tweets_sentiment)],
                        "tweet-rt-ratio": [tweet_rt_ratio]})

#write to excel
# The former encoding='utf-8' argument is dropped: to_excel no longer accepts
# it in pandas 2.0+.
meta_df.to_excel(writer, sheet_name='metadata')

def _source_breakdown(source_names):
    """Tabulate how often each tweet source occurs.

    Parameters
    ----------
    source_names : array-like of str
        One source label per tweet.

    Returns
    -------
    pandas.DataFrame
        Columns "source", "count" and "percentage" (count as a share of all
        entries, times 100), sorted by "count" in descending order.
    """
    labels, occurrences = np.unique(source_names, return_counts=True)
    breakdown = pd.DataFrame({"source": labels,
                              "count": occurrences,
                              "percentage": (occurrences / occurrences.sum()) * 100})
    return breakdown.sort_values(by=['count'], ascending=False)


# The former encoding='utf-8' argument of to_excel is dropped below: it is no
# longer accepted in pandas 2.0+.

#create a list of tweet sources from all tweets and write to excel
all_sources = np.array(all_tweets_source)
all_source_df = _source_breakdown(all_sources)
all_source_df.to_excel(writer, sheet_name='source all tweets')

#create a list of tweet sources from non rt tweets and write to excel
sources = np.array(tweets_source)
source_df = _source_breakdown(sources)
source_df.to_excel(writer, sheet_name='source nonRT tweets')

### CREATE A BAG OF WORDS FOR THE NON-RETWEET TWEETS ####

#instance to tokenize words and remove punctuation
tokenizer = RegexpTokenizer(r'\w+')

#create a set of english stopwords
#and add tokens that carry no information on twitter (URL fragments and the
#"amp" left over from "&amp;")
stop_words = set(stopwords.words('english'))
stop_words.update(("http", "https", "co", "amp"))

#every lower-cased token of every non-retweet tweet that is not a stopword
filtered_word_list = [
    word
    for tweet_text in tweets
    for word in tokenizer.tokenize(tweet_text.lower())
    if word not in stop_words
]

#count the occurrences of each word, most frequent first
unique, counts = np.unique(filtered_word_list, return_counts=True)
bag_of_words = pd.DataFrame({"words": unique, "count": counts})
bag_of_words = bag_of_words.sort_values(by=['count'], ascending=False)

# The former encoding='utf-8' argument is dropped: to_excel no longer accepts
# it in pandas 2.0+.
bag_of_words.to_excel(writer, sheet_name='most common words')

# Write the workbook to disk and release the file handle. close() is used
# instead of save() because ExcelWriter.save() was removed in pandas 2.0,
# while close() is available in older versions as well.
writer.close()

#output to user
print("full file created: " , xls_file)