# TikTok Political Analysis
## ~Objectives
### Problems & Questions
__How can we better develop educational materials to meet kids where they are?__
- is it worth it to spend money to advertise to youth for political campaigns - are they engaging with current events?
- what are kids talking about & why? What does our education system tell them and not tell them

### Goals
- understanding how age/youth impacts political indoctrination
- understanding social impacts of political events
- to understand colloquial knowledge of political concepts

## ~Scope
- daily batch updates
- parsed news events triggers TikTok & twitter queries 
- topic counts 3 days before event cumulatively added to event day & 3 days following event
- see trend lines of engagement on Twitter & TikTok

### Overview:
- Use NewsAPI to find top news by day
- Parse news story title & article into individual words/phrases
- Count most important individual words & phrases
- Use top 3 most important words & phrases to create rules for searching the Twitter API
- Count number of tweets mentioning words & phrases filtered by rules
- Use top 3 words & phrases to find similar tags on TikTok API
- Count number of TikTok challenges/tags/captions with top words & phrases

## ~Extras
- age inference of users
- sentiment analysis (TextBlob)

---

# 1. Install Dependencies & Import Modules
- Newsapi-python: pip install newsapi-python
- Tweepy (install without virtual environment): pip install tweepy
- playwright: pip install playwright
                playwright install
- TikTokApi (install without virtual environment): pip install TikTokApi --upgrade


In [530]:
import pandas as pd
import json
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import numpy as np
from sqlalchemy import create_engine
import logging
import configparser
from timer import Timer
from numpy import datetime64
from datetime import date, datetime, timedelta
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from operator import itemgetter
import math
import tweepy  # python package for accessing Tweet streaming API
from tweepy import API
from tweepy import Stream
import urllib.parse
from TikTokApi import TikTokApi
from selenium import webdriver
import psycopg2 # alts: SQLalchemy - warning: not as simple
from psycopg2 import Error
import re


Configure using config.ini file

In [531]:
c = configparser.ConfigParser()
c.read('config.ini')

# config credentials
host = c['database']['host']
username = c['database']['user']
password = c['database']['password']
db = c['database']['database']

news_api_key = c['newsAuth']['api_key']
tiktok_id = c['tiktokAuth']['s_v_web_id']
# twitter_api_key = c['twitterAuth']['api_key']

access_token = c['twitterAuth']['access_token']
access_token_secret = c['twitterAuth']['access_token_secret']
consumer_key = c['twitterAuth']['consumer_key']
consumer_secret = c['twitterAuth']['consumer_secret']


create Database class

In [532]:
class DataBase():
    def __init__(self, host_name, user_name, user_password):
        self.host_name = host_name
        self.user_name = user_name
        self.user_password = user_password

    def create_server_connection(self):
        self.connection = None
        try:
            self.connection = psycopg2.connect(
                host=self.host_name,
                user=self.user_name,
                password=self.user_password
            )
            logging.info("Database connection successful")
        except Error as err:
            logging.error(f"Error: '{err}'")

        return self.connection


    def create_database(self, connection, query):
            self.connection = connection
            cursor = connection.cursor()
            try:
                cursor.execute(query)
                logging.info("Database created successfully")
            except Error as err:
                logging.error(f"Error: '{err}'")


    def create_db_connection(self, db_name):
            self.db_name = db_name
            self.connection = None
            try:
                self.connection = psycopg2.connect(
                    host=self.host_name,
                    user=self.user_name,
                    password=self.user_password,
                    database=self.db_name
                )
                # cursor = connection.cursor()
                logging.info("PostgreSQL Database connection successful")
            except Error as err:
                logging.error(f"Error: '{err}'")

            return self.connection

    # @Timer(name='Query Execution') *TODO fix __enter__ attribute error
    def execute_query(self, connection, query):
            self.connection = connection
            cursor = connection.cursor()
            try:
                cursor.execute(query)
                self.connection.commit()
                logging.info("Query successful")
            except Error as err:
                print(f"Error: '{err}'")
    
    def read_query(self, connection, query):
        self.connection = connection
        cursor = self.connection.cursor()
        result = None
        try:
            cursor.execute(query)
            result = cursor.fetchall()
            return result
        except Error as err:
            logging.error(f"Error: '{err}'")
    

    @Timer(name='Mogrify')
    def execute_mogrify(self, conn, df, table):
        """
        Using cursor.mogrify() to build the bulk insert query
        then cursor.execute() to execute the query
        """
        self.connection = conn
        # Create a list of tupples from the dataframe values
        tuples = [tuple(x) for x in df.to_numpy()]
    
        # Comma-separated dataframe columns
        cols = ','.join(list(df.columns))
    
        # SQL query to execute
        cursor = conn.cursor()
        values = [cursor.mogrify("(%s,%s,%s,%s)", tup).decode('utf8')
                for tup in tuples]
        # if not publishedAt, delete record
        query = "INSERT INTO %s(%s) VALUES" % (table, cols) + ",".join(values)

        try:
            cursor.execute(query, tuples)
            conn.commit()
        except (Exception, psycopg2.DatabaseError) as error:
            logging.error("Error: %s" % error)
            print("Error: %s" % error)
            conn.rollback()
            cursor.close()
            conn.close()
            return 1
        logging.info("execute_mogrify() done")
        cursor.close()
        conn.close()


Variables for SQL queries

In [533]:
# DDL queries
create_database_query = """
        CREATE DATABASE IF NOT EXISTS sm_news;
    """
    # create necessary tables
    # keyWords VARCHAR
create_article_table = """
    CREATE TABLE IF NOT EXISTS articles (
        publishedAt DATE,
        title VARCHAR PRIMARY KEY,
        author VARCHAR,
        url TEXT
        );
    """
create_article_table_index = """
    CREATE INDEX index
        ON articles(publishedAt,
            title
        );
    """
create_article_text_table = """
    CREATE TABLE IF NOT EXISTS article_text (
        title VARCHAR PRIMARY KEY,
        article_text TEXT
        );
    """
    #  REFERENCES articles(title)
# create_article_text_table_index = """
#     CREATE INDEX index
#         ON article_text(publishedAt, title
#         );
#     """
    # CREATE INDEX index ON articles(publishedAt);
create_political_event_table = """
    CREATE TABLE IF NOT EXISTS event (
        eventID ID PRIMARY KEY,
        startDate DATE,
        name VARCHAR NOT NULL,
        description VARCHAR NOT NULL,
        keyWords VARCHAR
        );
 """
create_tweets_table = """
    CREATE TABLE IF NOT EXISTS tweets (
        tweet_id INT PRIMARY KEY,
        publishedAt DATE NOT NULL,
        userID VARCHAR NOT NULL,
        tweet VARCHAR NOT NULL,
        location VARCHAR NOT NULL, 
        tags VARCHAR NOT NULL
        );
    """
create_tiktoks_table = """
    CREATE TABLE IF NOT EXISTS tiktoks (
        postID INT PRIMARY KEY,
        createTime DATE NOT NULL,
        userID INT NOT NULL,
        description VARCHAR NOT NULL,
        musicID INT NOT NULL,
        soundID INT NOT NULL,
        tags VARCHAR NOT NULL
        );
    """
create_tiktok_sounds_table = """
    CREATE TABLE IF NOT EXISTS tiktok_sounds (
        soundID INT PRIMARY KEY,
        soundTitle VARCHAR,
        isOriginal BOOLEAN
        );
    """
create_tiktok_music_table = """
    CREATE TABLE IF NOT EXISTS tiktok_music (
        songID INT PRIMARY KEY,
        songTitle VARCHAR NOT NULL
        );
    """

create_tiktok_stats_table = """
    CREATE TABLE IF NOT EXISTS tiktok_stats (
        postID INT PRIMARY KEY,
        shareCount INT,
        commentCount INT,
        playCount INT,
        diggCount INT
        );
    """

create_tiktok_tags_table = """
    CREATE TABLE IF NOT EXISTS tiktok_tags (
        tagID INT PRIMARY KEY,
        tag_name VARCHAR NOT NULL 
        );
    """
create_users_table = """
    CREATE TABLE IF NOT EXISTS users (
        userID INT PRIMARY KEY,
        username VARCHAR NOT NULL,
        user_bio VARCHAR NOT NULL
        );
    """
delete_bad_data = """
    DELETE FROM articles
        WHERE publishedAt IS NULL;
    """


Create Database

In [534]:
postgres_db = DataBase(host, username, password)

# connect to server
postgres_server = postgres_db.create_server_connection()

# connect to social media news db
connection = postgres_db.create_db_connection(db)

# execute defined queries to create db tables if needed


try:
    postgres_db.execute_query(connection, create_article_table)
    postgres_db.execute_query(connection, create_article_text_table)
    postgres_db.execute_query(connection, create_tweets_table)
    postgres_db.execute_query(connection, create_political_event_table)
    postgres_db.execute_query(connection, create_users_table)
    postgres_db.execute_query(connection, create_tiktok_sounds_table)
    postgres_db.execute_query(connection, create_tiktok_music_table)
    postgres_db.execute_query(connection, create_tiktok_stats_table)
    postgres_db.execute_query(connection, create_tiktok_tags_table)
    postgres_db.execute_query(connection, create_tiktoks_table)
except (ConnectionError) as e:
    logging.error({e}, 'Check SQL create queries')


In [535]:
# add foreign keys
alter_tiktoks_table = """
    ALTER TABLE tiktoks
    ADD FOREIGN KEY(musicID) REFERENCES tiktok_music(songID),
    ADD FOREIGN KEY(soundID) REFERENCES tiktok_sounds(soundID),
    ADD FOREIGN KEY(userID) REFERENCES users(userID)
    ON DELETE SET NULL;
"""
alter_tiktok_stats_table = """
    ALTER TABLE tiktok_stats
    ADD FOREIGN KEY(postID) REFERENCES tiktoks(postID)
    ON DELETE SET NULL;
"""
try:
    postgres_db.execute_query(connection, alter_tiktoks_table)
    postgres_db.execute_query(connection, alter_tiktok_stats_table)
except (ConnectionError) as e:
    logging.error({e}, 'Check SQL alteration queries')


# 2. Find Top News by Day

Create News class

In [536]:
# make sure you've done: git clone https://github.com/tamimibrahim17/List-of-user-agents.git at this point so the txt files are in your directory
def get_random_ua(browser):
    random_ua = ''
    ua_file = f'{browser}.txt'.title()

    try:
        with open(ua_file) as f:
            lines = f.readlines()
        if len(lines) > 0:
            prng = np.random.RandomState()
            index = prng.permutation(len(lines) - 1)
            idx = np.asarray(index, dtype=np.integer)[0]
            random_proxy = lines[int(idx)]
    except Exception as ex:
        logging('Exception in random_ua')
        print(str(ex))
    finally:
        return random_ua


In [537]:
class News():
    """Extract keywords from  news articles to use as search values for TikTok & Twitter posts relating to the political event of interest. """

    def __init__(self, api_key, logger=logging):
        self.api_key = api_key
        self.logger = logging.basicConfig(filename='news.log', filemode='w',
                    format=f'%(asctime)s - %(levelname)s - %(message)s')

    def request_pop_news(self, params={
        'q': ['politics' or 'political' or 'law' or 'legal' or 'policy'],
        'from': {date.today() - timedelta(days=3)},
        'to': {date.today},
        'language': 'en',
        'sort_by': 'popularity'
    }):
        pop_news = []
        self.params = params

        headers = {
            'X-Api-Key': self.api_key,
            # get_random_ua for Chrome
            'user-agent': get_random_ua('Chrome')
        }

        url = 'https://newsapi.org/v2/everything'

        # response as JSON dict
        self.response = requests.get(url, params=self.params, headers=headers).json()

        with open('pop_news.json', 'w') as f:
            # write results to JSON file
            json.dump(self.response, f)

        with open('pop_news.json', 'r') as file:
            # create Python list object from JSON
            pop_news_json = file.read().split("\n")

            for story in pop_news_json:
                pop_obj = json.loads(story)

                if 'title' in pop_obj:
                    pop_obj['title'] = pop_obj['articles']['title']
                if 'author' in pop_obj:
                    pop_obj['author'] = pop_obj['articles']['author']
                if 'url' in pop_obj:
                    pop_obj['url'] = pop_obj['articles']['url']
                if 'publishedAt' in pop_obj:
                    pop_obj['publishedAt'] = pop_obj['articles']['publishedAt']

                # add info to pop_news dict
                pop_news.append(pop_obj)
        
        # load returned results into Pandas dataframe
        # flatten data to dataframe
        # self.pop_news = pd.json_normalize(self.pop_news, record_path=['articles'])
        # pop_news_df = pd.DataFrame(
        #     self.pop_news, columns=['title', 'author', 'url', 'publishedAt', "text", "keyword1", "keyword2", "keyword3"])
        # self.pop_news_df.dropna(axis=0, how='any')

        return pop_news

    def get_top_headlines(self, params={
        "language": "en",
        "country": "us"
    }):

        top_headlines = []
        self.params = params

        headers = {
            "X-Api-Key": news_api_key,
            "user-agent": get_random_ua('Chrome')
        }
        url = "https://newsapi.org/v2/top-headlines"

        self.response = requests.get(
            url, params=self.params, headers=headers).json()  # response JSON dict

        with open("top_headlines.json", "w") as f:
            # write results to JSON file
            json.dump(self.response, f)

        with open("top_headlines.json", "r") as file:
            # create Python object from JSON
            top_headlines_json = file.read().split("\n")

            for story in top_headlines_json:
                story_obj = json.loads(story)

                if 'title' in story_obj:
                    story_obj["title"] = story_obj["articles"]["title"]
                if 'author' in story_obj:
                    story_obj["author"] = story_obj["articles"]["author"]
                if 'url' in story_obj:
                    story_obj["url"] = story_obj["articles"]["url"]
                if 'publishedAt' in story_obj:
                    story_obj["publishedAt"] = story_obj["articles"]["publishedAt"]

                # add info to top_headlines list/dict
                top_headlines.append(story_obj)
            
            # flatten data to dataframe
        # self.top_headlines = pd.json_normalize(self.top_headlines, record_path=['articles'])
        # top_headlines_df = pd.DataFrame(
        #         self.top_headlines, columns=["title", "author", "url", "publishedAt", "text", "keyword1", "keyword2", "keyword3"])
        # self.top_headlines_df = self.top_headlines_df.dropna(axis=0, how='any')

        return top_headlines

    # put all news together
    def get_all_news(self):
        """Combines top headlines and popular news into one Pandas DataFrame."""
        top_headlines = self.get_top_headlines()
        pop_news = self.request_pop_news()

        pop_news = pd.json_normalize(pop_news, record_path=['articles'])
        top_headlines = pd.json_normalize(top_headlines, record_path=['articles'])
        all_news = top_headlines.append(pop_news)
        self.all_news_df = pd.DataFrame(
            all_news, columns=['title', 'author', 'url', 'publishedAt', "text", "keyword1", "keyword2", "keyword3"])
        self.all_news_df.drop_duplicates()

        # convert to datetime
        self.all_news_df['publishedAt'] = self.all_news_df['publishedAt'].map(
            lambda row: datetime.strptime(str(row), "%Y-%m-%dT%H:%M:%SZ") if pd.notnull(row) else row)

        self.all_news_df.set_index('publishedAt', inplace=True)
        
        self.all_news_df["text"] = self.all_news_df["url"].apply(self.get_article_text)
        
        
        return self.all_news_df

    
    def get_article_text(self, url): 
        """Get and clean news article text"""
        
        contractions_dict = {"'s": " is", "n't": " not", "'m": " am", "'ll": " will",
                     "'d": " would", "'ve": " have", "'re": " are"}
        symbols_list = ['&', '+', '-', '/', '|', '$', '%', ':']
        
        # request
        r = requests.get(url)
        html = r.text
        soup = BeautifulSoup(html)
        a_text = soup.get_text()

        # remove newline characters
        a_text = a_text.strip()
        # remove mentions
        a_text = re.sub("@\S+", " ", a_text)
        # remove URLs
        a_text = re.sub("https*\S+", " ", a_text)
        # remove hashtags
        a_text = re.sub("#\S+", " ", a_text)
        # remove unicode characters
        a_text = a_text.encode('ascii', 'ignore').decode()
        # replace contractions
        for key, value in contractions_dict.items():
            if key in a_text:
                a_text = a_text.replace(key, value)
        
        for i in symbols_list:
            if i in a_text:
                a_text = a_text.replace(i, '')

        # make lowercase
        a_text = a_text.lower()
        a_text = re.sub(r'\w*\d+\w*', '', a_text)

        return a_text



    def keyword_extraction(self, text):
        """Determine weight of important words in articles and add to articles_text table
        using TF-IDF ranking"""

        # make sure text is in string format for parsing
        text = str(text)
        stop_words = set(stopwords.words('english'))

        # find total words in document for calculating Term Frequency (TF)
        total_words = text.split()
        total_word_length = len(total_words)

        # find total number of sentences for calculating Inverse Document Frequency
        total_sentences = tokenize.sent_tokenize(text)
        total_sent_len = len(total_sentences)

        # calculate TF for each word
        tf_score = {}
        for each_word in total_words:
            each_word = each_word.replace('.', '')
            if each_word not in stop_words:
                if each_word in tf_score:
                    tf_score[each_word] += 1
                else:
                    tf_score[each_word] = 1

        # Divide by total_word_length for each dictionary element
        tf_score.update((x, y/int(total_word_length))
                        for x, y in tf_score.items())  # TODO test - ZeroError

        #calculate IDF for each word
        idf_score = {}
        for each_word in total_words:
            each_word = each_word.replace('.', '')
            if each_word not in stop_words:
                if each_word in idf_score:
                    idf_score[each_word] = self.check_sent(each_word, total_sentences)
                else:
                    idf_score[each_word] = 1

        # Performing a log and divide
        idf_score.update((x, math.log(int(total_sent_len)/y))
                        for x, y in idf_score.items())

        # Calculate IDF * TF for each word
        tf_idf_score = {key: tf_score[key] *
                        idf_score.get(key, 0) for key in tf_score.keys()}

        return tf_idf_score

    def check_sent(self, word, sentences):
        """Check if word is present in sentence list for calculating IDF (Inverse Document Frequency)"""
        final = [all([w in x for w in word]) for x in sentences]
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
    
        return int(len(sent_len))

    def get_top_n(self, dict_elem, n):
        """Calculate most important keywords in text of interest"""
        result = dict(sorted(dict_elem.items(),
                     key=itemgetter(1), reverse=True)[:n])
        # result = sorted(dict_elem,
        #                      key=itemgetter(1), reverse=True)[:n]
        # sorted_dict = {k:v for k,v in result}
        return result



# 3. Parse Titles & Articles

In [538]:
# instantiate News class
news = News(news_api_key)
# get all news - takes about 30 seconds
news.get_all_news()




Unnamed: 0_level_0,title,author,url,text,keyword1,keyword2,keyword3
publishedAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-08-26 18:12:14,Sturgis motorcycle rally linked to more than 1...,"Brittany Shammas, Hannah Knowles, Dan Keating",https://www.washingtonpost.com/health/2021/08/...,sturgis motorcycle rally linked to more than ...,,,
2021-08-26 18:11:23,Capitol Police Officers Sue Trump and Allies O...,Alan Feuer,https://www.nytimes.com/2021/08/26/us/politics...,capitol police officers sue trump and allies o...,,,
2021-08-26 18:04:46,"2 killed, 1 injured in Illinois courthouse sho...",Associated Press,https://www.foxnews.com/us/2-killed-injured-il...,"killed, injured in illinois courthouse shoot...",,,
2021-08-26 17:21:00,Biden braces for fallout as his dire warnings ...,"Kevin Liptak, CNN",https://www.cnn.com/2021/08/26/politics/biden-...,attack on kabul: biden braces for fallout as h...,,,
2021-08-26 16:57:11,"Nikki Haley, other Republicans call for Biden'...",Sam Dorman,https://www.foxnews.com/politics/biden-resigna...,"haley, blackburn, other republicans call for b...",,,
2021-08-26 16:54:31,Taliban spokesman says 'no proof' bin Laden wa...,Rachel Pannett,https://www.washingtonpost.com/world/2021/08/2...,taliban spokesman says no proof bin laden was ...,,,
2021-08-26 16:49:38,NFL forces Patriots and Rams to revise illegal...,,https://www.cbssports.com/nfl/news/nfl-forces-...,nfl forces patriots and rams to revise illegal...,,,
2021-08-26 16:47:32,Delta Air Lines could be 1st of many to hike p...,Brian Sozzi,https://finance.yahoo.com/news/delta-could-be-...,delta could be of many to hike premiums for u...,,,
2021-08-26 16:45:00,Tropical system could rapidly intensify into a...,Kathryn Prociv,https://www.nbcnews.com/news/weather/tropical-...,tropical system could rapidly intensify into a...,,,
2021-08-26 16:33:06,Netflix starts testing in-app games for users ...,Jay Peters,https://www.theverge.com/2021/8/26/22642946/ne...,netflix starts testing android games for users...,,,


# 4. Get Important Words & Phrases

In [539]:
# get keywords from article text
news.all_news_df["keywords"] = news.all_news_df['text'].apply(news.keyword_extraction)

# keywords = keywords.to_dict()
news.all_news_df["keyword1"] = news.all_news_df["keywords"].apply(news.get_top_n, n=1)
news.all_news_df["keyword2"] = news.all_news_df["keywords"].apply(
    news.get_top_n, n=2)
news.all_news_df["keyword3"] = news.all_news_df["keywords"].apply(
    news.get_top_n, n=3)
# get top 3 words of significance
#top_keywords = news.get_top_n(keywords, 3)
print(news.all_news_df)
# news.all_news_df['keyword1'] = top_keywords[1]
# news.all_news_df['keyword2'] = top_keywords[2]
# news.all_news_df['keyword3'] = top_keywords[3]


                                                                 title  \
publishedAt                                                              
2021-08-26 18:12:14  Sturgis motorcycle rally linked to more than 1...   
2021-08-26 18:11:23  Capitol Police Officers Sue Trump and Allies O...   
2021-08-26 18:04:46  2 killed, 1 injured in Illinois courthouse sho...   
2021-08-26 16:57:11  Nikki Haley, other Republicans call for Biden'...   
2021-08-26 16:54:31  Taliban spokesman says 'no proof' bin Laden wa...   
2021-08-26 16:49:38  NFL forces Patriots and Rams to revise illegal...   
2021-08-26 16:47:32  Delta Air Lines could be 1st of many to hike p...   
2021-08-26 16:45:00  Tropical system could rapidly intensify into a...   
2021-08-26 16:33:06  Netflix starts testing in-app games for users ...   
2021-08-26 16:30:17  The First Footage Of Kristen Stewart As Prince...   
2021-08-26 16:12:00  Monica Lewinsky Hired A Therapist To Watch “Im...   
2021-08-26 16:09:00  Female hummingbir

# 5. Search Twitter API
## Using Important Words & Phrases

Create Tweets class

In [540]:
class Tweets():
    
    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, logger=logging):
        self.logger = logging.basicConfig(filename='tweets.log', filemode='w',
                                         format=f'%(asctime)s - %(levelname)s - %(message)s')
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        #self.logger = logging.getLogger(__name__)

    def tweepy_auth(self):

        self.auth = tweepy.OAuthHandler(self.consumer_key, self.consumer_secret)
        self.auth.set_access_token(self.access_token, self.access_token_secret)

        # create API object
        self.api = API(self.auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

        try:
            self.api.verify_credentials()
        except Exception as e:
            self.logger.error("Error during Tweepy authentication")
            raise e
        
        self.logger.info("Tweepy API Authenticated")
    
    def tweet_search(self, query):
        """Search for tweets within previous 7 days.
            Inputs: 
                https-encoded query
                language
                'until' date
                geocode (latitude/longitude)
            Returns: 
                Tweet object
        """
        self.tweet_search_list = []
        query = urllib.parse.urlencode(query)
        # latitude & longitude of Colombus, OH, USA
        latitude = '39.9828671'
        longitude = '-83.1309131'
        # radius of united states
        radius = '3881mi'

        query_result = tweepy.Cursor(self.api, q=query, lang='en', until={
                                     date.today()}, geocode=[latitude, longitude, radius])

        for status in tweepy.Cursor(query_result).items():
            self.tweet_search_list.append(status)
            return self.tweet_search_list

        # TODO append tweets to dataframe & return it
        self.tweet_search_df = pd.DataFrame(self.tweet_search_list)
        return self.tweet_search_df
        
    def tweet_trends(self):
            # returns JSON
        # 1 refers to USA WOEID 
        self.tweet_trends_list = []
        result = tweepy.Cursor(self.api.trends_place(1))

        for trend in tweepy.Cursor(result).items():
            self.tweet_trends_list.append(trend)
            return self.tweet_trends_list
        
        #TODO append to dataframe
        self.tweet_trends_df = pd.DataFrame(self.tweet_trends_list)
        return self.tweet_trends_df    
    
    def clean_tweets(self):
        # use slang.txt
        # https://www.geeksforgeeks.org/python-efficient-text-data-cleaning/
        pass

# define stream listener class
class TweetStreamListener(tweepy.StreamListener):
    def __init__(self, api=None):
        super(TwitterStreamListener).__init__()
        self.num_tweets = 0
        # self.file = open('tweets.txt', 'w')
        # self.db = ''
        self.tweet_list = []
        # self.file = open("tweets.json", "w")

    def on_status(self, status):
        tweet = status._json

        with open("tweets.json", "w") as f:
            # write tweets to json file
            json.dump(tweet, f)
        
        with open("tweets.json", "r") as file:
            # create python object from json
            tweets_json = file.read().split("\n")

            for tweet in tweets_json:
                tweet_obj = json.loads(tweet)

                #flatten nested fields
                if 'quoted_status' in tweet_obj:
                    tweet_obj['quote_tweet'] =tweet_obj['quoted_status']['extended_tweet']['full_text']
                if 'user' in tweet_obj:
                    tweet_obj['location'] = tweet_obj['user']['location']
                # if 'created_at' in tweet_obj:
                #     tweet_obj['created_at'] = pd.to_datetime(tweet)
                

                self.tweet_list.append(status)
                self.num_tweets += 1

                # flatten data to dataframe
                tweets = pd.json_normalize(self.tweet_list, record_path=['articles'])
                self.tweets_df = pd.DataFrame(self.tweet_list, columns=["tweet_id", "publishedAt", "userID", "text", "location"])

                return self.tweets_df
            
        if self.num_tweets < 450:  # whatever the max stream rate is for the twitter API Client
            return True
        else:
            return False





In [541]:
# instantiate Tweet Stream Listener
listener = TweetStreamListener()
# instantiate authentication
tweets = Tweets(consumer_key, consumer_secret,
                access_token, access_token_secret)
auth = tweets.tweepy_auth()
# authenticate stream
tweet_stream = tweepy.Stream(auth, listener)

# filter tweet stream?
filtered_stream = tweet_stream.filter(track=[keywords])
# mogrify stream
postgres_db.execute_mogrify(connection, filtered_stream, 'stream_tweets')

# tweet search
search = tweets.tweet_search(query={
    'tweet.fields': 'attachments,author_id,created_at,geo,id,public_metrics,source,text',
    'expansions': 'geo.place_id,attachments.media_keys', 'place.fields': 'country,geo,id,name', 'user.fields': 'created_at,description,id,location,name,username,verified'})
# change to datetime
tweets.tweet_search_df['created_at'].apply(
    lambda row: datetime.strptime(
        row, '%Y-%m-%d %H:%M:%S'),  # TODO check formatting
    axis=0)
# count keywords in tweet search df
batch_tweets = tweets.tweet_search_df['text'].str.contains(
    keywords, case=False)

# tweet trends
tweet_trends = tweets.tweet_trends()


NameError: name 'TwitterStreamListener' is not defined

# 6. Search TikTok
## Using Important Words & Phrases

In [None]:
"""Search TikTok for videos related to keywords parsed from news articles"""

fp = c['tiktokAuth']['s_v_web_id']

# tik_toks = TikTokApi.get_instance(custom_verifyFp=fp, use_test_endpoints=True, use_selenium=True)

class TikToks(TikTokApi):
    def __init__(self):
        super(TikTokApi, self).__init__()
        self.tiktok_list = []

    @Timer("Tiktok Download")
    def get_tiktok_trends(self):
        # returns tiktok dictionary/JSON
        self.api = TikTokApi()
        self.api.get_instance(custom_verifyFp=fp, use_test_endpoints=True, use_selenium=True)
        trends = self.api.by_hashtag(keywords)

        with open("tiktoks.json", "w") as f:
            json.dump(trends, f)

        with open("tiktoks.json", "r") as file:
            toks_json = file.read().split("\n")

            for tok in toks_json:
                tok_obj = json.loads(tok)
                
                if 'id' in tok:
                    tok_obj['userID'] = tok_obj['author']['id']
                    tok_obj['postID'] = tok_obj['id']
                if 'signature' in tok:
                    tok_obj['user_bio'] = tok_obj['author']['signature']
                if 'challenges' in tok:
                    # iterate over multiples
                    tok_obj['tagID'] = tok_obj['challenges']['id']
                    tok_obj['tag_name'] = tok_obj['challenges']['title']
                if 'createTime' in tok:
                    tok_obj['createTime'] = tok_obj['createTime']
                if 'desc' in tok:
                    tok_obj['description'] = tok_obj['desc']
                if 'stats' in tok:
                    tok_obj['comment_count'] = tok_obj['stats']['commentCount']
                    tok_obj['digg_count'] = tok_obj['stats']['diggCount']
                    tok_obj['play_count'] = tok_obj['stats']['playCount']
                    tok_obj['share_count'] = tok_obj['stats']['shareCount']
                if 'video' in tok:
                    tok_obj['videoID'] = tok_obj['itemList']['video']['id']
                if 'sound' in tok:
                    tok_obj['soundID'] = tok_obj['sound']['id']
                    tok_obj['soundTitle'] = tok_obj['sound']['title']
                    tok_obj['isOriginal'] = tok_obj['sound']['original']
                if 'music' in tok:
                    tok_obj['songID'] = tok_obj['music']['id']
                    tok_obj['songTitle'] = tok_obj['music']['title']
                
                self.tiktok_list.append(tok_obj)

            self.toks_df = pd.DataFrame(self.tiktok_list)

            # split df by columns corresponding to tables

        return self.toks_df



# 7. Add Late-Arriving Dimensions/Data
### *data corresponding to 3 days before news hit

In [None]:
from timer import Timer
from database import *
from get_news import *
import configparser

# configure ConfigParser
c = configparser.ConfigParser()
c.read('config.ini')

# references .config credentials
host = c['database']['host']
username = c['database']['user']
password = c['database']['password']
db = c['database']['database']

news_api_key = c['newsAuth']['api_key']
tiktok_id = c['tiktokAuth']['s_v_web_id']
twitter_api_key = c['twitterAuth']['api_key']

# instantiate DataBase class using .config files


# news.request_pop_news()
# news.get_top_headlines()

# apply get_text function using urls from all_news df
url_text = news.all_news_df['url'].apply(
        lambda row: news.article_text(news.all_news_df['url']),
        axis=1)
# put url_text into df
news.article_text_df['text'] = url_text

# get keywords from article text

# article_text_df['keys'] = keyword_extraction(article_text)

# TODO test get_news & find order of key:value pairs
news.article_text_df['keyword1'] = keywords[1]
news.article_text_df['keyword2'] = keywords[2]
news.article_text_df['keyword3'] = keywords[3]


# execute mogrify - insert news into database
postgres_db.execute_mogrify(connection, news.all_news_df, 'articles')

# append text and keys to database
postgres_db.execute_mogrify(connection, news.article_text_df, 'article_text')


# 8. Tally Up
### Partition counts by day

Add to database

In [None]:
# mogrify stream
postgres_db.execute_mogrify(connection, filtered_stream, 'stream_tweets')
# mogrify batch tweets
postgres_db.execute_mogrify(connection, batch_tweets, 'batch_tweets')
# mogrify trends
postgres_db.execute_mogrify(connection, tweet_trends, 'tweet_trends')
# execute mogrify - insert news into database
postgres_db.execute_mogrify(connection, news.all_news_df, 'articles')
# append text and keys to database
postgres_db.execute_mogrify(connection, news.article_text_df, 'article_text')


# 9. Plot & Analyze
- On which platform (Twitter or TikTok) do folks engage with politics the most?
- Where in the US is engagement the highest?
- Which political events seem to cause the most reaction among youth?