# TikTok Political Analysis
## ~Objectives
### Problems & Questions
__How can we better develop educational materials to meet kids where they are?__
- Is it worth spending money on political campaign advertising aimed at youth - are they engaging with current events?
- What are kids talking about & why? What does our education system tell them, and what does it leave out?

### Goals
- understanding how age/youth impacts political indoctrination
- understanding social impacts of political events
- to understand colloquial knowledge of political concepts

## ~Scope
- daily batch updates
- parsed news events trigger TikTok & Twitter queries
- topic counts cover the 3 days before the event, the event day, and the 3 days following the event (added cumulatively)
- see trend lines of engagement on Twitter & TikTok

### Overview:
- Use NewsAPI to find top news by day
- Parse news story title & article into individual words/phrases
- Count most important individual words & phrases
- Use top 3 most important words & phrases to create rules for searching the Twitter API
- Count number of tweets mentioning words & phrases filtered by rules
- Use top 3 words & phrases to find similar tags on TikTok API
- Count number of TikTok challenges/tags/captions with top words & phrases

## ~Extras
- age inference of users
- sentiment analysis (TextBlob)

---

# 1. Install Dependencies & Import Modules
- Newsapi-python: pip install newsapi-python
- Tweepy: pip install tweepy

In [None]:
import pandas as pd
import json
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import zlib
import TikTokApi

In [None]:
import numpy as np
from sqlalchemy import create_engine
import logging
import psycopg2
from psycopg2 import Error
import configparser
from timer import Timer

# Load the project configuration once; it holds the database and API credentials.
c = configparser.ConfigParser()
# ConfigParser.read() silently skips missing files and returns the list of
# files it actually parsed, so fail loudly here instead of with a confusing
# KeyError on the first section lookup below.
if not c.read('config.ini'):
    raise FileNotFoundError("config.ini not found or unreadable")

# Database credentials from the [database] section.
host = c['database']['host']
username = c['database']['user']
password = c['database']['password']
db = c['database']['database']


class DataBase():
    """Small psycopg2 helper for connecting to PostgreSQL and running queries.

    Connection failures and query errors are logged rather than raised, so
    callers must check the returned connection / result for ``None``.
    """

    def __init__(self, host_name, user_name, user_password):
        self.host_name = host_name
        self.user_name = user_name
        self.user_password = user_password

    def create_server_connection(self):
        """Connect to the server without selecting a database.

        Returns:
            The psycopg2 connection, or None when the connection failed.
        """
        self.connection = None
        try:
            self.connection = psycopg2.connect(
                host=self.host_name,
                user=self.user_name,
                password=self.user_password
            )
            logging.info("Database connection successful")
        except Error as err:
            logging.error(f"Error: '{err}'")

        return self.connection

    def create_database(self, connection, query):
        """Run a CREATE DATABASE statement on ``connection``.

        PostgreSQL refuses CREATE DATABASE inside a transaction block and
        psycopg2 opens a transaction implicitly, so the statement is run with
        autocommit temporarily enabled.
        """
        self.connection = connection
        cursor = connection.cursor()
        previous_autocommit = connection.autocommit
        try:
            connection.autocommit = True
            try:
                cursor.execute(query)
            finally:
                # No transaction is open in autocommit mode, so restoring
                # the previous setting is always allowed here.
                connection.autocommit = previous_autocommit
            logging.info("Database created successfully")
        except Error as err:
            logging.error(f"Error: '{err}'")
        finally:
            cursor.close()

    def create_db_connection(self, db_name):
        """Connect to the database ``db_name``.

        Returns:
            The psycopg2 connection, or None when the connection failed.
        """
        self.db_name = db_name
        self.connection = None
        try:
            # Store the connection on self: the value returned below used to
            # be bound to a local name, so callers always received None.
            self.connection = psycopg2.connect(
                host=self.host_name,
                user=self.user_name,
                password=self.user_password,
                database=self.db_name
            )
            logging.info("PostgreSQL Database connection successful")
        except Error as err:
            logging.error(f"Error: '{err}'")

        return self.connection

    @Timer(name='Query Execution')
    def execute_query(self, connection, query):
        """Execute a single statement and commit it.

        Errors are logged and the transaction is rolled back, so the
        connection stays usable for the next statement.
        """
        self.connection = connection
        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            self.connection.commit()
            logging.info("Query successful")
        except Error as err:
            logging.error(f"Error: '{err}'")
            # A failed statement leaves the transaction aborted; without a
            # rollback every later statement on this connection fails too.
            self.connection.rollback()
        finally:
            cursor.close()

    def read_query(self, connection, query):
        """Execute a SELECT-style query.

        Returns:
            The list of rows from ``fetchall()``, or None when the query failed.
        """
        self.connection = connection
        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            return cursor.fetchall()
        except Error as err:
            logging.error(f"Error: '{err}'")
            self.connection.rollback()
            return None
        finally:
            cursor.close()

    @Timer(name='Mogrify')
    def execute_mogrify(self, conn, df, table):
        """Bulk-insert every row of ``df`` into ``table`` with one INSERT.

        Row values are escaped with cursor.mogrify(); the column list is
        taken from ``df.columns``. ``table`` and the column names are
        interpolated as-is, so they must come from trusted code, never from
        user input.

        The connection is closed before returning, on success and on failure
        alike; callers need a fresh connection for any further work.

        Returns:
            None on success (including an empty ``df``, which inserts
            nothing), 1 when the insert failed and was rolled back.
        """
        self.connection = conn
        # Create a list of tuples from the dataframe values
        rows = [tuple(row) for row in df.to_numpy()]

        # Comma-separated dataframe columns
        cols = ','.join(df.columns)

        cursor = conn.cursor()
        try:
            if rows:
                # One %s per column instead of a hard-coded four.
                placeholder = "(" + ",".join(["%s"] * len(df.columns)) + ")"
                values = ",".join(
                    cursor.mogrify(placeholder, row).decode('utf8')
                    for row in rows
                )
                query = "INSERT INTO %s(%s) VALUES %s" % (table, cols, values)
                # The query is already fully rendered: passing parameters
                # again would make psycopg2 re-interpret '%' in the data.
                cursor.execute(query)
                conn.commit()
        except Exception as error:
            logging.error("Error: %s" % error)
            print("Error: %s" % error)
            conn.rollback()
            return 1
        finally:
            cursor.close()
            conn.close()
        logging.info("execute_mogrify() done")

# DDL queries
# PostgreSQL has no "CREATE DATABASE IF NOT EXISTS"; an existing database
# simply makes the statement fail with an "already exists" error.
create_database_query = """
    CREATE DATABASE sm_news;
    """
# create necessary tables
create_article_table = """
    CREATE TABLE IF NOT EXISTS articles (
        publishedAt DATE,
        title VARCHAR PRIMARY KEY,
        author VARCHAR,
        url TEXT
        );
    """
# Index names share one namespace per schema, so each index gets its own name.
create_article_table_index = """
    CREATE INDEX IF NOT EXISTS articles_published_title_idx
        ON articles(publishedAt,
            title
        );
    """
create_article_text_table = """
    CREATE TABLE IF NOT EXISTS article_text (
        title VARCHAR PRIMARY KEY REFERENCES articles (title),
        text TEXT
        );
    """
# article_text has no publishedAt column, so only title can be indexed here.
create_article_text_table_index = """
    CREATE INDEX IF NOT EXISTS article_text_title_idx
        ON article_text(title
        );
    """
create_political_event_table = """
    CREATE TABLE IF NOT EXISTS event (
        eventID VARCHAR PRIMARY KEY,
        startDate DATE,
        name VARCHAR NOT NULL,
        description VARCHAR NOT NULL,
        keyWords VARCHAR
        );
    """
# Twitter tweet and user ids are 64-bit values and overflow a 32-bit INT.
create_tweets_table = """
    CREATE TABLE IF NOT EXISTS tweets (
        tweet_id BIGINT PRIMARY KEY,
        publishedAt DATE NOT NULL,
        userID BIGINT NOT NULL,
        tweet VARCHAR NOT NULL,
        location VARCHAR NOT NULL,
        tags VARCHAR NOT NULL
        );
    """
# A FOREIGN KEY constraint needs its column declared in the same table.
# Requires tiktok_music, tiktok_sounds and users to exist first.
create_tiktoks_table = """
    CREATE TABLE IF NOT EXISTS tiktoks (
        postID INT PRIMARY KEY,
        createTime DATE NOT NULL,
        description VARCHAR NOT NULL,
        musicID VARCHAR NOT NULL,
        tags VARCHAR NOT NULL,
        songID INT,
        soundID INT,
        userID INT,
        FOREIGN KEY(songID) REFERENCES tiktok_music(songID),
        FOREIGN KEY(soundID) REFERENCES tiktok_sounds(soundID),
        FOREIGN KEY(userID) REFERENCES users(userID)
        );
    """
create_tiktok_sounds_table = """
    CREATE TABLE IF NOT EXISTS tiktok_sounds (
        soundID INT PRIMARY KEY,
        soundTitle VARCHAR,
        isOriginal BOOLEAN
        );
    """
create_tiktok_music_table = """
    CREATE TABLE IF NOT EXISTS tiktok_music (
        songID INT PRIMARY KEY,
        songTitle VARCHAR NOT NULL
        );
    """
# Requires tiktoks to exist first.
create_tiktok_stats_table = """
    CREATE TABLE IF NOT EXISTS tiktok_stats (
        postID INT PRIMARY KEY REFERENCES tiktoks(postID),
        shareCount INT,
        commentCount INT,
        playCount INT,
        diggCount INT
        );
    """

create_tiktok_tags_table = """
    CREATE TABLE IF NOT EXISTS tiktok_tags (
        tagID INT PRIMARY KEY,
        tag_name VARCHAR NOT NULL
        );
    """
create_users_table = """
    CREATE TABLE IF NOT EXISTS users (
        userID INT PRIMARY KEY,
        username VARCHAR NOT NULL,
        user_bio VARCHAR NOT NULL
        );
    """
# Remove article rows that arrived without a publication date.
delete_bad_data = """
    DELETE FROM articles
        WHERE publishedAt IS NULL;
    """


# 2. Find Top News by Day

In [None]:
"""Extract keywords from news articles to use as search values for TikTok & Twitter posts relating to the political event of interest."""

from numpy import datetime64
from database import *
import logging
from datetime import date, datetime, timedelta
import pandas as pd
import json
import requests
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from operator import itemgetter
import math
from bs4 import BeautifulSoup
from django.db import DatabaseError

# best_words = []
# word_df = {}

# API credentials, looked up in the already-parsed config object `c`.
news_api_key, tiktok_id, twitter_api_key = (
    c['newsAuth']['api_key'],
    c['tiktokAuth']['s_v_web_id'],
    c['twitterAuth']['api_key'],
)

class News():
    """Fetch articles from NewsAPI and rank the keywords of their text.

    Attributes populated by the methods:
        pop_news / top_headlines: list holding the raw JSON response.
        pop_news_df / top_headlines_df / all_news_df: article DataFrames
            with the columns title, author, url, publishedAt.
        article_text_df: empty frame for scraped text and keywords.
        tf_score / idf_score / tf_idf_score: per-word scores of the last
            keyword_extraction() call.
    """

    # Columns kept from every NewsAPI article record.
    _ARTICLE_COLUMNS = ['title', 'author', 'url', 'publishedAt']
    # Columns of the frame that collects scraped text and keywords.
    _TEXT_COLUMNS = ['index', 'title', 'text',
                     'keyword1', 'keyword2', 'keyword3']
    # Seconds to wait for an HTTP response before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self, api_key, logger=logging):
        self.api_key = api_key
        # basicConfig() configures the root logger and returns None, so its
        # result must not be stored as the logger.
        logging.basicConfig(filename='news.log', filemode='w',
                            format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logger

    def _fetch_articles(self, url, params, dump_path):
        """Query a NewsAPI endpoint, save the raw JSON, return (response, frame)."""
        headers = {'X-Api-Key': self.api_key}
        response = requests.get(url, params=params, headers=headers,
                                timeout=self._REQUEST_TIMEOUT).json()

        # keep a copy of the raw response on disk
        with open(dump_path, 'w') as f:
            json.dump(response, f)

        articles = response.get('articles') if isinstance(response, dict) else None
        if not articles:
            # Error responses carry no 'articles' key; return an empty frame
            # rather than failing while flattening.
            self.logger.error("No articles returned by %s: %s", url, response)
            return response, pd.DataFrame(columns=self._ARTICLE_COLUMNS)

        frame = pd.json_normalize(articles).reindex(columns=self._ARTICLE_COLUMNS)
        # Author is optional, but a row without title, url or date is
        # unusable downstream (title is the primary key of `articles`).
        frame = frame.dropna(subset=['title', 'url', 'publishedAt'])
        return response, frame.reset_index(drop=True)

    def request_pop_news(self, params=None):
        """Return the most popular political articles as a DataFrame.

        Args:
            params: query parameters for the NewsAPI /v2/everything
                endpoint. When omitted, searches English political/legal
                news of the last three days sorted by popularity; the dates
                are computed at call time.
        """
        if params is None:
            today = date.today()
            params = {
                'q': 'politics OR political OR law OR legal OR policy',
                'from': (today - timedelta(days=3)).isoformat(),
                'to': today.isoformat(),
                'language': 'en',
                'sortBy': 'popularity',
            }
        self.params = params

        response, self.pop_news_df = self._fetch_articles(
            'https://newsapi.org/v2/everything', params, 'pop_news.json')
        self.pop_news = [response]

        return self.pop_news_df

    def get_top_headlines(self, params=None):
        """Return the current top headlines as a DataFrame.

        Args:
            params: query parameters for the NewsAPI /v2/top-headlines
                endpoint. Defaults to English-language US headlines.
        """
        if params is None:
            params = {"language": "en", "country": "us"}
        self.params = params

        response, self.top_headlines_df = self._fetch_articles(
            'https://newsapi.org/v2/top-headlines', params, 'top_headlines.json')
        self.top_headlines = [response]

        return self.top_headlines_df

    # put all news together
    def all_news(self):
        """Combine top headlines and popular news into one DataFrame."""
        top_headlines = self.get_top_headlines()
        pop_news = self.request_pop_news()

        combined = pd.concat([top_headlines, pop_news], ignore_index=True)
        # The same story can appear in both result sets.
        combined = combined.drop_duplicates(subset='title').reset_index(drop=True)

        # NewsAPI timestamps are ISO 8601 (e.g. 2021-03-05T12:00:00Z);
        # unparseable values become NaT instead of raising.
        combined['publishedAt'] = pd.to_datetime(
            combined['publishedAt'], utc=True, errors='coerce')

        self.all_news_df = combined
        return self.all_news_df

    def article_text(self, url):
        """Get news article text using Requests and BeautifulSoup.

        Also creates the empty ``article_text_df`` frame on first use.

        Returns:
            The visible text of the page at ``url``.
        """
        if not hasattr(self, 'article_text_df'):
            # A dict of scalar values is not a valid DataFrame constructor
            # argument; declare the columns instead.
            self.article_text_df = pd.DataFrame(columns=self._TEXT_COLUMNS)

        r = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.text, 'html.parser')

        return soup.get_text()

    def keyword_extraction(self, text):
        """Score every non-stop word of ``text`` with TF-IDF.

        Sentences play the role of documents: TF is the word count divided
        by the total number of whitespace-separated words, IDF is
        log(sentence count / sentences containing the word).

        Returns:
            Dict mapping word -> TF-IDF score (empty for empty text).
        """
        # make sure text is in string format for parsing
        text = str(text)
        stop_words = set(stopwords.words('english'))

        total_words = text.split()
        total_word_length = len(total_words)

        total_sentences = tokenize.sent_tokenize(text)
        total_sent_len = len(total_sentences)

        # raw counts; stop words are matched case-insensitively
        counts = {}
        for raw_word in total_words:
            word = raw_word.replace('.', '')
            if word and word.lower() not in stop_words:
                counts[word] = counts.get(word, 0) + 1

        # `counts` is empty whenever total_word_length is 0, so this never
        # divides by zero.
        self.tf_score = {word: count / total_word_length
                         for word, count in counts.items()}

        # Tokenise each sentence once, then count the sentences holding
        # each word. Every scored word gets a real sentence frequency.
        sentence_tokens = [self._tokens(sentence) for sentence in total_sentences]
        self.idf_score = {}
        for word in self.tf_score:
            containing = sum(word in tokens for tokens in sentence_tokens)
            self.idf_score[word] = (
                math.log(total_sent_len / containing) if containing else 0.0)

        self.tf_idf_score = {word: self.tf_score[word] * self.idf_score[word]
                             for word in self.tf_score}

        return self.tf_idf_score

    @staticmethod
    def _tokens(sentence):
        """Return the set of words in ``sentence`` with periods removed."""
        return {token.replace('.', '') for token in sentence.split()}

    def check_sent(self, word, sentences):
        """Return how many of ``sentences`` contain ``word`` as a whole word.

        Words are compared after removing periods, matching the
        normalisation used by keyword_extraction().
        """
        return sum(word in self._tokens(sentence) for sentence in sentences)

    def get_top_n(self, dict_elem, n):
        """Return the ``n`` highest-scoring items of ``dict_elem``, best first."""
        result = dict(sorted(dict_elem.items(),
                             key=itemgetter(1), reverse=True)[:n])
        return result


# 3. Parse Titles & Articles

# 4. Count Important Words & Phrases

# 5. Search Twitter API
## Using Important Words & Phrases

In [None]:
import get_news

import tweepy  # python package for accessing Tweet streaming API
from tweepy import API
from tweepy import Stream
import json
import logging
import pandas as pd
import configparser
import requests
from datetime import date, timedelta
import urllib.parse

# Parse config.ini again and pull the four OAuth values from [twitterAuth].
twitter_config = configparser.ConfigParser()
twitter_config.read('config.ini')

_twitter_auth = twitter_config['twitterAuth']
access_token = _twitter_auth['access_token']
access_token_secret = _twitter_auth['access_token_secret']
consumer_key = _twitter_auth['consumer_key']
consumer_secret = _twitter_auth['consumer_secret']


class Tweets():
    """Authenticated access to the Twitter search and trends endpoints via Tweepy."""

    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, logger=logging):
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        # The default argument is the logging module itself; swap it for a
        # named logger, but honour any logger the caller passes in.
        if logger is None or logger is logging:
            logger = logging.getLogger(__name__)
        self.logger = logger

    def tweepy_auth(self):
        """Create ``self.api`` and verify the credentials.

        Raises:
            Exception: whatever Tweepy raises when verification fails.
        """
        self.auth = tweepy.OAuthHandler(self.consumer_key, self.consumer_secret)
        self.auth.set_access_token(self.access_token, self.access_token_secret)

        # create API object
        self.api = API(self.auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

        try:
            self.api.verify_credentials()
        except Exception:
            self.logger.error("Error during Tweepy authentication")
            raise

        self.logger.info("Tweepy API Authenticated")

    def tweet_search(self, query, limit=None):
        """Search English tweets from the previous 7 days around the US.

        Requires tweepy_auth() to have been called first.

        Args:
            query: plain search string; Tweepy URL-encodes it itself.
            limit: maximum number of tweets to fetch; None fetches every
                result the API returns.

        Returns:
            DataFrame with one row per tweet (the raw tweet JSON fields).
            The Status objects are kept in ``self.tweet_search_list``.
        """
        # latitude & longitude of Columbus, OH, USA
        latitude = '39.9828671'
        longitude = '-83.1309131'
        # radius intended to cover the United States
        radius = '3881mi'

        # Cursor takes the API *method* to page through; geocode is one
        # "lat,long,radius" string and `until` a YYYY-MM-DD date string.
        cursor = tweepy.Cursor(
            self.api.search,
            q=query,
            lang='en',
            until=date.today().isoformat(),
            geocode=f"{latitude},{longitude},{radius}",
        )
        statuses = cursor.items(limit) if limit else cursor.items()

        self.tweet_search_list = list(statuses)
        self.tweet_search_df = pd.DataFrame(
            [status._json for status in self.tweet_search_list])
        return self.tweet_search_df

    def tweet_trends(self, woeid=23424977):
        """Return the current trending topics for a location.

        Requires tweepy_auth() to have been called first.

        Args:
            woeid: Yahoo "Where On Earth ID" of the location. Defaults to
                the United States (WOEID 1 would be worldwide).

        Returns:
            DataFrame with one row per trend. The trend dicts are kept in
            ``self.tweet_trends_list``.
        """
        # trends_place() is not paginated: it returns a one-element list
        # whose 'trends' entry holds the trend dicts.
        response = self.api.trends_place(woeid)
        trends = response[0]['trends'] if response else []

        self.tweet_trends_list = list(trends)
        self.tweet_trends_df = pd.DataFrame(self.tweet_trends_list)
        return self.tweet_trends_df

# define stream listener class
class TwitterStreamListener(tweepy.StreamListener):
    """Collect streamed tweets into a list and a DataFrame.

    Attributes:
        num_tweets: number of statuses received so far.
        tweet_list: one flattened dict per received tweet.
        tweets_df: DataFrame of the collected tweets with the columns
            tweet_id, publishedAt, userID, text, location.
    """

    _COLUMNS = ["tweet_id", "publishedAt", "userID", "text", "location"]

    def __init__(self, api=None, max_tweets=450):
        """
        Args:
            api: Tweepy API object forwarded to the base listener.
            max_tweets: stop the stream once this many tweets have arrived.
        """
        super(TwitterStreamListener, self).__init__(api)
        self.num_tweets = 0
        self.max_tweets = max_tweets
        self.tweet_list = []
        self.tweets_df = pd.DataFrame(columns=self._COLUMNS)

    def on_status(self, status):
        """Record one incoming status.

        Returns:
            True to keep the stream open, False once ``max_tweets`` tweets
            have been collected.
        """
        tweet = status._json

        with open("tweets.json", "w") as f:
            # keep the most recent raw tweet on disk
            json.dump(tweet, f)

        # flatten nested fields
        user = tweet.get('user') or {}
        extended = tweet.get('extended_tweet') or {}
        record = {
            'tweet_id': tweet.get('id'),
            'publishedAt': tweet.get('created_at'),
            'userID': user.get('id'),
            # long tweets keep their complete text under extended_tweet
            'text': extended.get('full_text', tweet.get('text')),
            'location': user.get('location'),
        }
        quoted = tweet.get('quoted_status')
        if quoted:
            # extended_tweet is only present on long quoted tweets
            quoted_extended = quoted.get('extended_tweet') or {}
            record['quote_tweet'] = quoted_extended.get(
                'full_text', quoted.get('text'))

        self.tweet_list.append(record)
        self.num_tweets += 1
        self.tweets_df = pd.DataFrame(self.tweet_list, columns=self._COLUMNS)

        return self.num_tweets < self.max_tweets





# 6. Search TikTok
## Using Important Words & Phrases

# 7. Add Late-Arriving Dimensions/Data
### *data corresponding to 3 days before news hit

In [None]:
from timer import Timer
from database import *
from get_news import *
import configparser

# parse config.ini
c = configparser.ConfigParser()
c.read('config.ini')

# database credentials, all taken from the [database] section
host, username, password, db = (
    c['database'][option]
    for option in ('host', 'user', 'password', 'database')
)

# API credentials
news_api_key, tiktok_id, twitter_api_key = (
    c['newsAuth']['api_key'],
    c['tiktokAuth']['s_v_web_id'],
    c['twitterAuth']['api_key'],
)

# instantiate DataBase class using .config files
postgres_db = DataBase(host, username, password)

# instantiate News class
news = News(news_api_key)

# connect to server
postgres_server = postgres_db.create_server_connection()

# connect to social media news db
connection = postgres_db.create_db_connection(db)
if connection is None:
    # The connection helpers log and return None on failure; stop here with
    # a clear error instead of an AttributeError on the first query.
    raise ConnectionError(f"Could not connect to database '{db}'")

# execute defined queries to create db tables if needed.
# Tables are created parents-first so that every FOREIGN KEY target exists
# before the table that references it.
table_queries = [
    create_article_table,
    create_article_text_table,
    create_tweets_table,
    create_political_event_table,
    create_users_table,
    create_tiktok_sounds_table,
    create_tiktok_music_table,
    create_tiktok_tags_table,
    create_tiktoks_table,
    create_tiktok_stats_table,
]

try:
    for table_query in table_queries:
        postgres_db.execute_query(connection, table_query)
except (psycopg2.Error, ConnectionError) as e:
    logging.error("%s - Check SQL queries", e)

# fetch top headlines and popular news; this also sets news.all_news_df
all_news_df = news.all_news()

# scrape each article and keep its three highest-ranked keywords
article_rows = []
for title, url in zip(all_news_df['title'], all_news_df['url']):
    try:
        text = news.article_text(url)
    except requests.RequestException as err:
        # one unreachable article must not stop the whole batch
        logging.error("Could not fetch %s: %s", url, err)
        continue

    scores = news.keyword_extraction(text)
    top_words = list(news.get_top_n(scores, 3))
    # short articles may yield fewer than three keywords
    top_words += [None] * (3 - len(top_words))

    article_rows.append({
        'title': title,
        'text': text,
        'keyword1': top_words[0],
        'keyword2': top_words[1],
        'keyword3': top_words[2],
    })

news.article_text_df = pd.DataFrame(
    article_rows,
    columns=['title', 'text', 'keyword1', 'keyword2', 'keyword3'])

# execute mogrify - insert news into database
postgres_db.execute_mogrify(connection, all_news_df, 'articles')

# execute_mogrify() closes the connection it is given, so the second insert
# needs a fresh one.
connection = postgres_db.create_db_connection(db)

# append text to database; the article_text table only has the columns
# title and text, the keywords stay in news.article_text_df
postgres_db.execute_mogrify(
    connection, news.article_text_df[['title', 'text']], 'article_text')


# 8. Tally Up
### Partition counts by day

# 9. Plot & Analyze
- On which platform (Twitter or TikTok) do folks engage with politics the most?
- Where in the US is engagement the highest?
- Which political events seem to cause the most reaction among youth?