In [71]:
import pymongo
import requests
from bs4 import BeautifulSoup as bs
import GetOldTweets3 as got
import time

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
import re, string
from nltk.tokenize import word_tokenize

In [72]:
# used to normalize raw tweets
def clean_data(tweet_tokens, stop_words = ()):
    """Normalize a tokenized tweet into lowercase, lemmatized word tokens.

    Each token is POS-tagged, stripped of URLs, @mentions, digit-bearing
    words and a few typographic symbols, lemmatized, and lower-cased.
    Tokens that end up empty, consist solely of punctuation characters, or
    appear in ``stop_words`` are dropped.

    Args:
        tweet_tokens: Sequence of string tokens (e.g. from ``word_tokenize``).
        stop_words: Collection of lower-case words to exclude. Defaults to
            an empty tuple (nothing excluded).

    Returns:
        list: The cleaned tokens, in their original order.
    """
    # Applied in this order. Raw strings keep regex escapes such as \w, \d
    # and \( from being read as (invalid) Python string escapes.
    noise_patterns = (
        r"http[s]?://+(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        r".(.co/o).",
        r"(@[A-Za-z0-9_]+)",
        r"\w*\d\w*",
        r"[‘’“”…#–/]",
    )
    punctuation_chars = set(string.punctuation)

    # The lemmatizer does not depend on the token, so build it once.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # removing unwanted symbols and patterns from the token
        for pattern in noise_patterns:
            token = re.sub(pattern, "", token)

        # nothing left after stripping: skip the lemmatizer entirely
        if not token:
            continue

        # assigning new pos tags for WordNetLemmatizer.lemmatize()
        if tag.startswith("NN"):
            pos = "n"
        elif tag.startswith("VB"):
            pos = "v"
        else:
            pos = "a"

        # lemmatizing tokens (running=run)
        token = lemmatizer.lemmatize(token, pos)

        # Dropping punctuation: test every character, because a substring
        # test against string.punctuation lets multi-character tokens such
        # as "..." or "``" slip through.
        if not token or all(char in punctuation_chars for char in token):
            continue

        # dropping stop words
        lowered = token.lower()
        if lowered not in stop_words:
            cleaned_tokens.append(lowered)

    return cleaned_tokens

### Connecting to MongoDB

In [90]:
# Open a client against the local MongoDB server and select the database
# that holds the scraped tweets.
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)
db = client["twitter_db"]

### Scraping Tweets

In [86]:
def get_tweets(search_term, year, max_tweets=1000, delay_seconds=3):
    """Scrape tweets matching ``search_term`` for every month of ``year``.

    One query is issued per calendar month. A progress line with the number
    of tweets collected is printed after each query.

    Args:
        search_term: Query string passed to ``setQuerySearch``.
        year: Four-digit year, as a string or an int (e.g. ``"2019"``).
        max_tweets: Maximum number of tweets requested per month.
        delay_seconds: Pause, in seconds, between consecutive queries.

    Returns:
        list: Twelve entries in calendar order, each being whatever
        ``TweetManager.getTweets`` returned for that month.
    """
    year_number = int(year)
    all_tweets = []

    for month in range(1, 13):
        # The window ends on the first day of the following month so the
        # whole month is covered (GetOldTweets3 documents the upper bound as
        # not included); a fixed "-28" bound skipped the last days of every
        # month. Dates are zero-padded to the documented yyyy-mm-dd format.
        since = f"{year_number}-{month:02d}-01"
        if month == 12:
            until = f"{year_number + 1}-01-01"
        else:
            until = f"{year_number}-{month + 1:02d}-01"

        # setting parameters
        tweetCriteria = got.manager.TweetCriteria() \
            .setQuerySearch(search_term)\
            .setSince(since)\
            .setUntil(until)\
            .setMaxTweets(max_tweets)

        # scraping tweets
        tweets = got.manager.TweetManager.getTweets(tweetCriteria)

        # keeping each month's tweets as its own bundle
        all_tweets.append(tweets)

        print("-"*15)
        print(f"Number of Tweets collected in {month}/{year}: {len(tweets)}")
        print("-"*15)

        # delay between calls; nothing to wait for after the last month
        if delay_seconds > 0 and month < 12:
            time.sleep(delay_seconds)

    return all_tweets

In [87]:
# Collect one year of tweets mentioning the restaurant; the result holds one
# bundle of tweets per month.
restaurant = get_tweets(search_term="cheesecake factory", year="2019")

---------------
Number of Tweets collected in 1/2019: 1000
---------------
---------------
Number of Tweets collected in 2/2019: 1000
---------------
---------------
Number of Tweets collected in 3/2019: 1000
---------------
---------------
Number of Tweets collected in 4/2019: 1000
---------------
---------------
Number of Tweets collected in 5/2019: 1000
---------------
---------------
Number of Tweets collected in 6/2019: 1000
---------------
---------------
Number of Tweets collected in 7/2019: 1000
---------------
---------------
Number of Tweets collected in 8/2019: 1000
---------------
---------------
Number of Tweets collected in 9/2019: 1000
---------------
---------------
Number of Tweets collected in 10/2019: 1000
---------------
---------------
Number of Tweets collected in 11/2019: 1000
---------------
---------------
Number of Tweets collected in 12/2019: 1000
---------------


### Adding to MongoDB

In [91]:
# Search term stored with every document. It is defined here because the
# notebook has no top-level `search_term` (the name only exists as a
# parameter of get_tweets), so this cell would otherwise raise NameError on
# a fresh kernel. Keep it in sync with the query passed to get_tweets.
search_term = "cheesecake factory"

for tweet_bundle in restaurant:
    # one document per tweet in this monthly bundle
    posts = []

    for tweet in tweet_bundle:
        # extracting information
        tweet_text = tweet.text

        # cleaning tweet text
        clean_tokenized_tweet = clean_data(word_tokenize(tweet_text))

        # creating dictionary object
        posts.append({
            "search_term": search_term,
            "date_posted": tweet.date,
            "original_tweet": tweet_text,
            "clean_tokenized_tweet": clean_tokenized_tweet,
            "num_of_favorites": tweet.favorites,
            "num_of_replies": tweet.replies,
            "num_retweets": tweet.retweets,
            "username": tweet.username,
            "tweet_id": tweet.id
        })

    # inserting the whole bundle in one round trip; insert_many rejects an
    # empty list, so bundles without tweets are skipped
    if posts:
        db.restaurants.insert_many(posts)