## Functions

Contains all functions used in this thesis.

### 0. Import Packages

In [None]:
import twint
import nest_asyncio
import pandas as pd
import os
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import math
import warnings
import requests
import datetime
from datetime import timedelta
import time
from io import StringIO
import statsmodels.api as sm
from spellchecker import SpellChecker
from bs4 import BeautifulSoup

nltk.download('punkt')
nltk.download('stopwords')

warnings.filterwarnings('ignore')

### 1. Twint

In [None]:
def profile_search(startdate, enddate, topic, account, save):
    """
    Scrapes tweets of a profile during a given time frame with the .Profile-function of the package TWINT.
    Writes three CSV files into `save`: all parsed tweets (<account>_<startdate>_<enddate>.csv),
    the retweets (retweets_<account>.csv) and all remaining tweets/replies (select_<account>.csv).
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic (not used inside this function)
            account:          STRING, name of account
            save:             STRING, path of directory (including trailing separator)
    """
    base = save + account + '_' + startdate + '_' + enddate
    raw_file = base + '.txt'

    # scrape using TWINT-package
    c = twint.Config()
    c.Until = enddate
    c.Since = startdate
    c.Username = account
    c.Retweets = "True"
    c.Hide_output = True
    c.Output = raw_file
    twint.run.Profile(c)

    # no output file means the scraping was not successful
    if not os.path.exists(raw_file):
        print('No tweets scraped. Select other account or timeframe')
        return

    data = pd.read_fwf(raw_file, header=None, encoding="UTF-8")
    # columns 0-4 are accessed below, so at least five columns are required
    if len(data.columns) < 5:
        print('Issue when reading tweets. Probably mentioned many accounts. Account @' + account + ' skipped.')
        return

    # prepare data frame for all scraped data
    data = data.dropna(subset=[0, 1, 2, 3, 4]).reset_index(drop=True)
    rows = []
    for _, cells in data.iterrows():
        # re-join the fixed-width cells into one line, skipping empty (NaN) cells
        line = ' '.join(str(cell) for cell in cells if not pd.isna(cell))
        tokens = line.split(' ')
        # the account is enclosed in the first '<...>', everything after it is the tweet;
        # splitting only once keeps '<' and '>' characters inside the tweet text intact
        _, opened, rest = line.partition('<')
        acc, closed, tweet = rest.partition('>')
        if not opened or not closed:
            # line does not contain an account marker, skip it
            continue
        rows.append({'id': tokens[0], 'date': tokens[1], 'time': tokens[2],
                     'acc': acc, 'tweet': tweet[1:]})
    data_clean = pd.DataFrame(rows, columns=['id', 'date', 'time', 'acc', 'tweet'])
    data_clean = data_clean.dropna().reset_index(drop=True)

    # split into retweets and non-retweets
    is_retweet = data_clean['tweet'].str.contains('RT @', regex=False).astype(bool)

    # prepare data frame for retweets; the retweeted account is the second word ('RT @acc: ...')
    retweets = data_clean.loc[is_retweet].reset_index(drop=True)
    retweets['rt_acc'] = [tweet.split(' ')[1].rstrip(':') for tweet in retweets['tweet']]

    # prepare data frame for non-retweets
    noretweets = data_clean.loc[~is_retweet].reset_index(drop=True)
    print('Profile searched @' + account + ' - tweets/replies found: ' + str(len(noretweets)) + ' (' + str(len(data_clean)) + ' total)')

    # save
    data_clean.to_csv(base + '.csv', encoding='utf-8', index=None)
    retweets.to_csv(save + 'retweets_' + account + '.csv', encoding='utf-8', index=None)
    noretweets.to_csv(save + 'select_' + account + '.csv', encoding='utf-8', index=None)

In [None]:
def search_search(startdate, enddate, topic, account, save, search):
    """
    Scrapes tweets of a profile during a given time frame with the .Search-function of the package TWINT.
    The parsed tweets are written to select_<account>.csv in `save`.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic (not used inside this function)
            account:          STRING, name of account
            save:             STRING, path of directory (including trailing separator)
            search:           STRING, keywords of search, split by 'OR'
    """
    raw_file = save + account + '_' + startdate + '_' + enddate + '.txt'

    # scrape using TWINT-package
    c = twint.Config()
    c.Until = enddate
    c.Since = startdate
    c.Username = account
    c.Search = search
    c.Hide_output = True
    c.Output = raw_file
    twint.run.Search(c)

    # no output file means the scraping was not successful; nothing to prepare
    if not os.path.exists(raw_file):
        return

    data = pd.read_fwf(raw_file, header=None, encoding="UTF-8")
    # columns 0-4 are accessed below, so at least five columns are required
    if len(data.columns) < 5:
        print('Issue when reading tweets. Probably mentioned many accounts. Account @' + account + ' skipped.')
        return

    data = data.dropna(subset=[0, 1, 2, 3, 4]).reset_index(drop=True)
    rows = []
    for _, cells in data.iterrows():
        # re-join the fixed-width cells into one line, skipping empty (NaN) cells
        line = ' '.join(str(cell) for cell in cells if not pd.isna(cell))
        tokens = line.split(' ')
        # the account is enclosed in the first '<...>', everything after it is the tweet;
        # splitting only once keeps '<' and '>' characters inside the tweet text intact
        _, opened, rest = line.partition('<')
        acc, closed, tweet = rest.partition('>')
        if not opened or not closed:
            # line does not contain an account marker, skip it
            continue
        rows.append({'id': tokens[0], 'date': tokens[1], 'time': tokens[2],
                     'acc': acc, 'tweet': tweet[1:]})
    data_clean = pd.DataFrame(rows, columns=['id', 'date', 'time', 'acc', 'tweet'])
    data_clean = data_clean.dropna().reset_index(drop=True)

    # save
    data_clean.to_csv(save + 'select_' + account + '.csv', encoding='utf-8', index=None)
    print('Profile searched @' + account + ' - tweets/replies found: ' + str(len(data_clean)))

In [None]:
def mentions_search(startdate, enddate, topic, account, save):
    """
    Searches all mentions of a profile.
    The parsed tweets are written to mentions_<account><startdate>_<enddate>.csv in `save`.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic (not used inside this function)
            account:          STRING, name of account
            save:             STRING, path of directory (including trailing separator)
    """
    raw_file = save + account + '_' + startdate + '_' + enddate + '.txt'

    # scrape using TWINT-package
    c = twint.Config()
    c.Until = enddate
    c.Since = startdate
    c.Search = '@' + account
    c.Hide_output = True
    c.Output = raw_file
    twint.run.Search(c)

    # no output file means the scraping was not successful; nothing to prepare
    if not os.path.exists(raw_file):
        return

    data = pd.read_fwf(raw_file, header=None, encoding="UTF-8")
    # columns 0-4 are accessed below, so at least five columns are required
    if len(data.columns) < 5:
        print('Issue when reading tweets. Probably mentioned many accounts. Account @' + account + ' skipped.')
        return

    data = data.dropna(subset=[0, 1, 2, 3, 4]).reset_index(drop=True)
    rows = []
    for _, cells in data.iterrows():
        # re-join the fixed-width cells into one line, skipping empty (NaN) cells
        line = ' '.join(str(cell) for cell in cells if not pd.isna(cell))
        tokens = line.split(' ')
        # the account is enclosed in the first '<...>', everything after it is the tweet;
        # splitting only once keeps '<' and '>' characters inside the tweet text intact
        _, opened, rest = line.partition('<')
        acc, closed, tweet = rest.partition('>')
        if not opened or not closed:
            # line does not contain an account marker, skip it
            continue
        rows.append({'id': tokens[0], 'date': tokens[1], 'time': tokens[2],
                     'acc': acc, 'tweet': tweet[1:]})
    data_clean = pd.DataFrame(rows, columns=['id', 'date', 'time', 'acc', 'tweet'])
    data_clean = data_clean.dropna().reset_index(drop=True)

    # save (file name is read back under exactly this name by the mentions stage)
    data_clean.to_csv(save + 'mentions_' + account + startdate + '_' + enddate + '.csv', encoding='utf-8', index=None)
    print('Profile searched @' + account + ' - mentions found: ' + str(len(data_clean)))

In [None]:
def profile_search_retweets(startdate, enddate, topic, account, save_rt, save_sel, rtaccs, search):
    """
    Checks if an account retweeted an account from a list of accounts. If yes, it is searched for a list of hashtags.
    If nothing could be scraped for the time frame, the most recent tweets of the account are tried instead.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, passed on to the follow-up searches
            account:          STRING, name of account
            save_rt:          STRING, path of directory for retweets-part
            save_sel:         STRING, path of directory for part searching for hashtag usage
            rtaccs:           LIST, list of accounts (as they appear in retweets, i.e. with leading '@')
            search:           STRING, hashtags, split by 'OR'
    """
    base = save_rt + account + '_' + startdate + '_' + enddate
    raw_file = base + '.txt'

    # scrape using TWINT-package
    c = twint.Config()
    c.Until = enddate
    c.Since = startdate
    c.Username = account
    c.Retweets = "True"
    c.Hide_output = True
    c.Output = raw_file
    twint.run.Profile(c)

    if not os.path.exists(raw_file):
        # search most recent tweets as time frame may not be reachable
        print('@' + account + ' has too much tweets or is private/locked. Timeframe not directly reachable. Trying to find retweets without timeframe.')
        profile_search_retweets_current(startdate, enddate, topic, account, save_rt, save_sel, rtaccs, search)
        return

    data = pd.read_fwf(raw_file, header=None, encoding="utf-8")
    # columns 0-4 are accessed below, so at least five columns are required
    if len(data.columns) < 5:
        print('Issue when reading tweets. Probably mentioned many accounts. Account @' + account + ' skipped.')
        return

    # prepare all data
    data = data.dropna(subset=[0, 1, 2, 3, 4]).reset_index(drop=True)
    rows = []
    for _, cells in data.iterrows():
        # re-join the fixed-width cells into one line, skipping empty (NaN) cells
        line = ' '.join(str(cell) for cell in cells if not pd.isna(cell))
        tokens = line.split(' ')
        # the account is enclosed in the first '<...>', everything after it is the tweet;
        # splitting only once keeps '<' and '>' characters inside the tweet text intact
        _, opened, rest = line.partition('<')
        acc, closed, tweet = rest.partition('>')
        if not opened or not closed:
            # line does not contain an account marker, skip it
            continue
        rows.append({'id': tokens[0], 'date': tokens[1], 'time': tokens[2],
                     'acc': acc, 'tweet': tweet[1:]})
    data_clean = pd.DataFrame(rows, columns=['id', 'date', 'time', 'acc', 'tweet'])
    data_clean = data_clean.dropna().reset_index(drop=True)
    data_clean.to_csv(base + '.csv', encoding='utf-8', index=None)

    # reduce to retweets, call search_search if account retweeted an account in the list
    is_retweet = data_clean['tweet'].str.contains('RT @', regex=False).astype(bool)
    # the retweeted account is the second word of a retweet ('RT @acc: ...')
    retweeted = {tweet.split(' ')[1].rstrip(':') for tweet in data_clean.loc[is_retweet, 'tweet']}
    if retweeted.intersection(rtaccs):
        search_search(startdate, enddate, topic, account, save_sel, search)

In [None]:
def profile_search_retweets_current(startdate, enddate, topic, account, save_rt, save_sel, rtaccs, search):
    """
    Checks if an account retweeted an account from a list of accounts. If yes, it is searched for a list of hashtags.
    This function uses the most recent tweets in case the time frame given may not be directly reachable.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format (only used for the hashtag search)
            enddate:          STRING, end date in yyyy-mm-dd format (only used for the hashtag search)
            topic:            STRING, name of topic, passed on to the hashtag search
            account:          STRING, name of account
            save_rt:          STRING, path of directory for retweets-part
            save_sel:         STRING, path of directory for part searching for hashtag usage
            rtaccs:           LIST, list of accounts (as they appear in retweets, i.e. with leading '@')
            search:           STRING, hashtags, split by 'OR'
    """
    base = save_rt + account + '_current'
    raw_file = base + '.txt'

    # scrape using TWINT-package (no time frame is set here)
    c = twint.Config()
    c.Username = account
    c.Retweets = "True"
    c.Hide_output = True
    c.Output = raw_file
    twint.run.Profile(c)

    # no output file means the scraping was not successful
    if not os.path.exists(raw_file):
        print('No tweets scraped. Account is probably private or locked.')
        return

    data = pd.read_fwf(raw_file, header=None, encoding='utf-8')
    # columns 0-4 are accessed below, so at least five columns are required
    if len(data.columns) < 5:
        print('Issue when reading tweets. Probably mentioned many accounts. Account @' + account + ' skipped.')
        return

    # prepare all data
    data = data.dropna(subset=[0, 1, 2, 3, 4]).reset_index(drop=True)
    rows = []
    for _, cells in data.iterrows():
        # re-join the fixed-width cells into one line, skipping empty (NaN) cells
        line = ' '.join(str(cell) for cell in cells if not pd.isna(cell))
        tokens = line.split(' ')
        # the account is enclosed in the first '<...>', everything after it is the tweet;
        # splitting only once keeps '<' and '>' characters inside the tweet text intact
        _, opened, rest = line.partition('<')
        acc, closed, tweet = rest.partition('>')
        if not opened or not closed:
            # line does not contain an account marker, skip it
            continue
        rows.append({'id': tokens[0], 'date': tokens[1], 'time': tokens[2],
                     'acc': acc, 'tweet': tweet[1:]})
    data_clean = pd.DataFrame(rows, columns=['id', 'date', 'time', 'acc', 'tweet'])
    data_clean = data_clean.dropna().reset_index(drop=True)
    data_clean.to_csv(base + '.csv', encoding='utf-8', index=None)

    # reduce to retweets, call search_search if account retweeted an account in the list
    is_retweet = data_clean['tweet'].str.contains('RT @', regex=False).astype(bool)
    # the retweeted account is the second word of a retweet ('RT @acc: ...')
    retweeted = {tweet.split(' ')[1].rstrip(':') for tweet in data_clean.loc[is_retweet, 'tweet']}
    if retweeted.intersection(rtaccs):
        search_search(startdate, enddate, topic, account, save_sel, search)

In [None]:
def main_acc(startdate, enddate, topic, account, directory):
    """
    Scrapes the tweets from a given main account.
    Additionally writes the (at most 20) hashtags used by the main account to 01_main/hashtag.csv
    and initialises scraped_accs.csv with the main account.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            account:          STRING, name of main account
            directory:        STRING, directory
    """

    # set directories
    topic_dir = directory + topic + '/'
    main_dir = topic_dir + '01_main/'
    retweets_dir = topic_dir + '02_retweets/'

    # generate folder structure; each folder is created on its own so that a
    # partially existing structure is completed instead of skipped
    for folder in (topic_dir, main_dir, retweets_dir):
        os.makedirs(folder, exist_ok=True)

    # scrape tweets of main account
    profile_search(startdate, enddate, topic, account, main_dir)

    # generate list of hashtags used by main account
    select = pd.read_csv(main_dir + 'select_' + account + '.csv', encoding='UTF-8')
    # na=False: empty tweets are read back as NaN and simply contain no hashtag
    select = select.loc[select['tweet'].str.contains('#', na=False)].reset_index(drop=True)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    hashtags = []
    for tweet in select['tweet']:
        # every part after a '#' starts with a hashtag, which ends at the next space
        for part in tweet.split('#')[1:]:
            hashtag = part.split(' ')[0].lower().translate(strip_punctuation)
            if hashtag != '':
                hashtags.append(hashtag)

    # too much hashtags crashes TWINT --> reduce to max 20 (most used)
    if len(hashtags) <= 20:
        # remove duplicates while keeping the order of first appearance
        selected = list(dict.fromkeys(hashtags))
    else:
        selected = list(pd.Series(hashtags).value_counts().index[:20])
    pd.DataFrame(selected).to_csv(main_dir + 'hashtag.csv', encoding='utf-8', index=None, header=None)

    # generate list of scraped accounts
    scraped_accs = pd.DataFrame({'acc': [account]})
    scraped_accs.to_csv(topic_dir + 'scraped_accs.csv', encoding='utf-8', index=None)

In [None]:
def create_hstring(directory, topic):
    """
    Converting the hashtag.csv to a string, which is usable by TWINT.
    INPUTS: directory:        STRING, directory
            topic:            STRING, name of topic, used to name folders
    OUTPUT: hstring:          STRING, hashtags, split by 'OR'
    """

    # set directories
    topic_dir = directory + topic + '/'
    main_dir = topic_dir + '01_main/'

    # import hashtags.csv; every entry is kept as plain text so that hashtags
    # like '2020' or 'na' are not converted to numbers or missing values
    hashtags = pd.read_csv(main_dir + 'hashtag.csv', header=None, dtype=str, keep_default_na=False)

    # convert to a string, split by 'OR'
    return ' OR '.join(hashtags[0])

In [None]:
def main_retweets(startdate, enddate, topic, mainaccount, directory):
    """
    Selects the tweets of accounts retweeted by the main account.
    For every retweeted account that has not been scraped yet, the tweets containing
    one of the selected hashtags are scraped; afterwards scraped_accs.csv is rewritten.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            mainaccount:      STRING, name of main account
            directory:        STRING, directory
    """

    # set directories
    topic_dir = directory + topic + '/'
    main_dir = topic_dir + '01_main/'
    retweets_dir = topic_dir + '02_retweets/'
    scraped_file = topic_dir + 'scraped_accs.csv'

    # import dataset of main account's retweets
    retweets = pd.read_csv(main_dir + 'retweets_' + mainaccount + '.csv')
    if len(retweets) == 0:
        print('No retweeted accounts found.')
        return

    # retweeted accounts, ordered by number of retweets
    rtcounts = retweets['rt_acc'].value_counts().rename_axis('acc').reset_index(name='count')
    print('Retweeted accounts found: ' + str(len(rtcounts)))

    # hashtags in a single search string
    hstring = create_hstring(directory, topic)

    # accounts already scraped
    scraped_list = list(pd.read_csv(scraped_file, encoding='utf-8')['acc'])

    # scrape tweets containing one of the selected hashtags for each new account
    for rt_acc in rtcounts['acc']:
        try:
            account = rt_acc.lstrip('@')
            if account in scraped_list:
                continue
            search_search(startdate, enddate, topic, account, retweets_dir, hstring)
            scraped_list.append(account)
        except ValueError:
            print('Account not found: @' + account)
            continue

    # update file listing scraped accounts
    pd.DataFrame({'acc': scraped_list}).to_csv(scraped_file, encoding='utf-8', index=None)

In [None]:
def combine_1_2(topic, mainacc, directory):
    """
    Combines the results of stages 1 and 2 of the bubble-generating procedure.
    The combined dataset is written to combine_1_2.csv in the topic folder.
    INPUTS: topic:            STRING, name of topic, used to name folders
            mainacc:          STRING, name of main account
            directory:        STRING, directory
    """

    # set directories
    topic_dir = directory + topic + '/'
    main_dir = topic_dir + '01_main/'
    retweets_dir = topic_dir + '02_retweets/'
    base_columns = ['id', 'date', 'time', 'acc', 'tweet']

    # add tweets of stage 1
    frames = [pd.read_csv(main_dir + 'select_' + mainacc + '.csv', encoding='utf-8')]

    # add tweets of stage 2 (sorted, so the result does not depend on the file system order)
    for filename in sorted(os.listdir(retweets_dir)):
        if filename.endswith('.csv') and filename.startswith('select_'):
            frames.append(pd.read_csv(retweets_dir + filename, encoding='utf-8'))

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2
    comb = pd.concat(frames)
    # standard columns first, any additional columns afterwards
    extra_columns = [col for col in comb.columns if col not in base_columns]
    comb = comb.reindex(columns=base_columns + extra_columns)

    # save combined dataframe
    comb.to_csv(topic_dir + 'combine_1_2.csv', encoding='utf-8', index=None)
    print('Tweets found (step 1&2): ' + str(len(comb)))

In [None]:
def main_mentions(startdate, enddate, topic, mainaccount, directory):
    """
    Selects tweets mentioning the main account.
    Accounts mentioning the main account more than 5 times are scraped if they have not
    been scraped yet; afterwards scraped_accs.csv is rewritten.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            mainaccount:      STRING, name of main account
            directory:        STRING, directory
    """

    # set directories
    topic_dir = directory + topic + '/'
    main_dir = topic_dir + '01_main/'
    mentions_dir = topic_dir + '03_mentions/'
    mentions_rt_dir = mentions_dir + 'retweets/'
    mentions_sel_dir = mentions_dir + 'select/'

    # create non-existing folders; each folder is created on its own so that a
    # partially existing structure is completed instead of skipped
    for folder in (mentions_dir, mentions_rt_dir, mentions_sel_dir):
        os.makedirs(folder, exist_ok=True)

    # scrape the tweets
    mentions_search(startdate, enddate, topic, mainaccount, mentions_dir)

    # import results from scraping
    mentions = pd.read_csv(mentions_dir + 'mentions_' + mainaccount + startdate + '_' + enddate + '.csv', encoding='utf-8')

    # generating list of accounts mentioning main account more than 5 times
    mentioncounts = mentions['acc'].value_counts().rename_axis('acc').reset_index(name='count')
    mentions_over_5 = mentioncounts.loc[mentioncounts['count'] > 5]
    print('mentions_over_5: ' + str(len(mentions_over_5)))
    accounts = list(mentions_over_5['acc'])

    # import accounts retweeted by main account
    retweets = pd.read_csv(main_dir + 'retweets_' + mainaccount + '.csv')

    # generate list of main account and the retweeted accounts
    acclist = []
    if len(retweets) > 0:
        acclist = list(retweets['rt_acc'].value_counts().index)
    acclist.append('@' + mainaccount)

    # create list of hashtags in a string
    hstring = create_hstring(directory, topic)

    # create list of accounts already scraped
    scraped_accs = pd.read_csv(topic_dir + 'scraped_accs.csv', encoding='utf-8')
    scraped_list = list(scraped_accs['acc'])

    # for each account, scrape tweets containing hashtags if account retweeted one of the selected accounts
    for acc in accounts:
        if acc in scraped_list:
            continue
        try:
            print('Scraping @' + acc)
            profile_search_retweets(startdate, enddate, topic, acc, mentions_rt_dir, mentions_sel_dir, acclist, hstring)
            scraped_list.append(acc)
        except ValueError:
            print('Account not found: @' + acc)
            continue

    # update file listing scraped accounts
    scraped_accs = pd.DataFrame({'acc': scraped_list})
    scraped_accs.to_csv(topic_dir + 'scraped_accs.csv', encoding='utf-8', index=None)

In [None]:
def combine_add_3(topic, directory):
    """
    Combines the results of stages 1 to 3 of the bubble-generating procedure.
    The combined dataset is written to combine_3.csv in the topic folder.
    INPUTS: topic:            STRING, name of topic, used to name folders
            directory:        STRING, directory
    """

    # set directories
    topic_dir = directory + topic + '/'
    mentions_sel_dir = topic_dir + '03_mentions/' + 'select/'
    base_columns = ['id', 'date', 'time', 'acc', 'tweet']

    # import combined dataset of stages 1 and 2
    frames = [pd.read_csv(topic_dir + 'combine_1_2.csv', encoding='utf-8')]

    # add data from stage 3 (sorted, so the result does not depend on the file system order)
    for filename in sorted(os.listdir(mentions_sel_dir)):
        if filename.endswith('.csv') and filename.startswith('select_'):
            frames.append(pd.read_csv(mentions_sel_dir + filename, encoding='utf-8'))

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2
    comb = pd.concat(frames)
    # standard columns first, any additional columns afterwards
    extra_columns = [col for col in comb.columns if col not in base_columns]
    comb = comb.reindex(columns=base_columns + extra_columns)

    # save combined dataset
    comb.to_csv(topic_dir + 'combine_3.csv', encoding='utf-8', index=None)
    print('Tweets found (steps 1-3): ' + str(len(comb)))

In [None]:
def vague_clean(topic, file, directory):
    """
    Vaguely cleaning the dataset.
    Tweets are compared after removing mentions, hashtags and links. Tweets with at most two
    remaining words and duplicates are removed. The result is written to <file>_no_duplicates.csv.
    INPUTS: topic:            STRING, name of topic, used to name folders
            file:             STRING, name of file (without '.csv')
            directory:        STRING, directory
    """

    # set directories
    topic_dir = directory + topic + '/'

    # import dataset
    data = pd.read_csv(topic_dir + file + '.csv', encoding='utf-8')
    print('Total tweets: ' + str(len(data)))

    def _reduce_tweet(tweet):
        # returns the tweet without mentions, hashtags and links, or None if it is too short
        if not isinstance(tweet, str):
            # empty tweets are read as NaN
            return None
        words = [word for word in tweet.split(' ') if not word.startswith(('@', '#', 'http'))]
        reduced = ' '.join(words).rstrip(' ')
        if len(reduced.split(' ')) > 2:
            return reduced
        return None

    # remove all tweets mostly consisting of hashtags, mentions, or links, as well as short tweets
    print('Removing all hashtags, mentions, and links.')
    data['cleaned_tweet'] = [_reduce_tweet(tweet) for tweet in data['tweet']]
    print('Removing duplicate and short tweets.')
    # short tweets have no cleaned text; they must be dropped explicitly because
    # drop_duplicates alone would keep the first of them
    data = data.dropna(subset=['cleaned_tweet'])
    data = data.drop_duplicates(subset=['cleaned_tweet'])
    print('Remaining tweets: ' + str(len(data)))

    # save remaining tweets
    subset = ['id', 'date', 'time', 'acc', 'tweet']
    data = data[subset]
    data.to_csv(topic_dir + file + '_no_duplicates.csv', encoding='utf-8', index=None)

In [None]:
def expand(startdate, enddate, topic, file, directory, min_tweets):
    """
    Scrapes the tweets of all accounts in the dataset.
    The scraped tweets are stored in the folder 04_expand of the topic.
    INPUTS: startdate:        STRING, start date in yyyy-mm-dd format
            enddate:          STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            file:             STRING, filename of dataset (without '.csv')
            directory:        STRING, directory
            min_tweets:       INTEGER, an account is only expanded if it has more than this
                                       number of tweets in the dataset
    """

    # set directories
    topic_dir = directory + topic + '/'
    expand_dir = topic_dir + '04_expand/'

    # create non-existent folder
    if not os.path.exists(expand_dir):
        os.makedirs(expand_dir)

    # import dataset
    data = pd.read_csv(topic_dir + file + '.csv', encoding='utf-8')

    # accounts with more than min_tweets tweets, most active first
    tweets_per_account = data['acc'].value_counts()
    selected = tweets_per_account[tweets_per_account > min_tweets]
    print('Expand on ' + str(len(selected)) + ' accounts.')

    # expand dataset: an empty search string does not restrict the tweets by keyword
    for account in selected.index:
        search_search(startdate, enddate, topic, account, expand_dir, '')

In [None]:
def combine_expand(topic, directory, file=False):
    """
    Combines the results of stages 1 to 4 of the bubble-generating procedure.
    The combined dataset is written to final.csv in the topic folder and vaguely cleaned afterwards.
    INPUTS: topic:            STRING, name of topic, used to name folders
            directory:        STRING, directory
            file (optional):  STRING, filename of dataset (without '.csv') to which the expanded
                                      tweets are added; without it only the expanded tweets are used
    """

    # set directories and import data
    topic_dir = directory + topic + '/'
    frames = []
    if file:
        data = pd.read_csv(topic_dir + file + '.csv', encoding='utf-8')
        print('Total tweets: ' + str(len(data)))
        frames.append(data)
        expand_dir = topic_dir + '04_expand/'
    else:
        # NOTE(review): a different folder name is used when no file is given -- confirm this is intended
        expand_dir = topic_dir + 'expand/'

    # add tweets of stage 4 to dataset (sorted, so the result does not depend on the file system order)
    for filename in sorted(os.listdir(expand_dir)):
        if filename.endswith('.csv') and filename.startswith('select_'):
            frames.append(pd.read_csv(expand_dir + filename, encoding='utf-8'))

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2
    if frames:
        data = pd.concat(frames)
    else:
        # nothing found: keep the expected columns so the following steps do not fail on a missing column
        data = pd.DataFrame(columns=['id', 'date', 'time', 'acc', 'tweet'])
    data = data.dropna(subset=['tweet'])

    # save dataset
    data.to_csv(topic_dir + 'final.csv', encoding='utf-8', index=None)
    print('Total tweets: ' + str(len(data)))

    # vaguely cleaning the dataset
    vague_clean(topic, 'final', directory)

In [None]:
def create_bubble(start, end, topic, main_account, directory):

    """
    Builds a bubble around one Twitter account by running stages 1 to 4 in order.
    INPUTS: start:            STRING, start date in yyyy-mm-dd format
            end:              STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            main_account:     STRING, name of main account
            directory:        STRING, directory
    """

    def announce(title):
        # every stage opens with a ruler line followed by its title
        print('---------------------------------------------')
        print(title)

    # dataset names handed from stage 3 to stage 4
    stage3_file = 'combine_3'
    expand_input = 'combine_3_no_duplicates'

    # Stage 1: the main account itself
    announce('Stage 1: Main Account')
    main_acc(start, end, topic, main_account, directory)
    print('Stage 1 completed.')

    # Stage 2: accounts retweeted by the main account, merged with stage 1
    announce('Stage 2: Retweeted Accounts')
    main_retweets(start, end, topic, main_account, directory)
    print('Combining stage 1 and stage 2.')
    combine_1_2(topic, main_account, directory)
    print('Stage 2 completed.')

    # Stage 3: mentions of the main account, merged with stages 1 and 2, then cleaned
    announce('Stage 3: Mentions of Main Account')
    main_mentions(start, end, topic, main_account, directory)
    print('Combining stages 1 to 3.')
    combine_add_3(topic, directory)
    print('Cleaning...')
    vague_clean(topic, stage3_file, directory)
    print('Stage 3 completed.')

    # Stage 4: expand the dataset and combine everything
    announce('Stage 4: Expand Tweets')
    expand(start, end, topic, expand_input, directory, 0)
    print('Combine and clean all data.')
    combine_expand(topic, directory, expand_input)
    print('Bubble created.')

In [None]:
def create_bubble_otherstart(start, end, topic, main_account, directory, otherstart):

    """
    Builds a bubble in which stages 1 and 2 start from another Twitter account,
    while stage 3 still collects the mentions of the main account.
    INPUTS: start:            STRING, start date in yyyy-mm-dd format
            end:              STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            main_account:     STRING, name of main account
            directory:        STRING, directory
            otherstart:       STRING, name of other account
    """

    def announce(title):
        # every stage opens with a ruler line followed by its title
        print('---------------------------------------------')
        print(title)

    # dataset names handed from stage 3 to stage 4
    stage3_file = 'combine_3'
    expand_input = 'combine_3_no_duplicates'

    # Stage 1: runs on the other account
    announce('Stage 1: Main Account')
    main_acc(start, end, topic, otherstart, directory)
    print('Stage 1 completed.')

    # Stage 2: accounts retweeted by the other account, merged with stage 1
    announce('Stage 2: Retweeted Accounts')
    main_retweets(start, end, topic, otherstart, directory)
    print('Combining stage 1 and stage 2.')
    combine_1_2(topic, otherstart, directory)
    print('Stage 2 completed.')

    # Stage 3: mentions of the main account, merged with stages 1 and 2, then cleaned
    announce('Stage 3: Mentions of Main Account')
    main_mentions(start, end, topic, main_account, directory)
    print('Combining stages 1 to 3.')
    combine_add_3(topic, directory)
    print('Cleaning...')
    vague_clean(topic, stage3_file, directory)
    print('Stage 3 completed.')

    # Stage 4: expand the dataset and combine everything
    announce('Stage 4: Expand Tweets')
    expand(start, end, topic, expand_input, directory, 0)
    print('Combine and clean all data.')
    combine_expand(topic, directory, expand_input)
    print('Bubble created.')

In [None]:
def oldbubble_newtime(start, end, topic, oldbubble_file, directory):

    """
    Re-scrapes the accounts of an existing bubble for a different time frame.
    INPUTS: start:            STRING, start date in yyyy-mm-dd format
            end:              STRING, end date in yyyy-mm-dd format
            topic:            STRING, name of topic, used to name folders
            oldbubble_file:   STRING, filename of final dataset of the old bubble
            directory:        STRING, directory
    """

    # results of the new time frame are collected in '<directory><topic>/expand/'
    expand_dir = directory + topic + '/' + 'expand/'

    # makedirs creates the topic folder along the way, so one check covers both folders
    if not os.path.exists(expand_dir):
        os.makedirs(expand_dir)

    # accounts of the old bubble, duplicates removed, first-seen order kept
    old_accounts = pd.read_csv(oldbubble_file)['acc']
    accounts = list(dict.fromkeys(old_accounts))
    print('Expand on ' + str(len(accounts)) + ' accounts.')

    # scrape tweets of accounts in new time frame
    for account in accounts:
        search_search(start, end, topic, account, expand_dir, '')

    # combine results
    combine_expand(topic, directory)

### 2. Reducing Language Elements and Creating N-grams

In [None]:
def tweet_split(tweet):

    """
    Breaks a tweet into its elements, using spaces, '#', and '@' as separators.
    A '#' or '@' is put in front of the next non-empty element; a '#' or '@'
    that is followed by no further element is discarded.
    INPUT:  tweet:   STRING, tweet
    OUTPUT: split:   LIST, split tweet
    """

    pieces = []
    marker = ''

    # the capture group makes re.split return the separators as well
    for chunk in re.split('( |#|@)', tweet):

        # spaces and empty strings carry no content
        if chunk in (' ', ''):
            continue

        # hold the marker back until the element it belongs to shows up
        if chunk.startswith(('@', '#')):
            marker = chunk
            continue

        pieces.append(marker + chunk)
        marker = ''

    return pieces

In [None]:
def normalize(df, joint_dir, character):

    """
    Normalizes a dataset of hashtags or account names.
    Elements written in CamelCase are split into words on their capital letters; every
    other element only loses its leading character (account names additionally get the
    prefix 'M_'). Results are stored in column 'cleaned' of df (df is modified in place)
    and the dataframe is saved as '<joint_dir><character>.xlsx'.
    INPUTS: df:            DATAFRAME, dataframe containing hashtags or mentions in column 'element';
                           rows are addressed by position, so the index has to run from 0 to len(df)-1
            joint_dir:     STRING, directory
            character:     STRING, '#' or '@'
    """

    # 'cleaned' usually arrives as an all-NaN float column; storing strings in it relies on
    # silent upcasting, which newer pandas versions deprecate, so make it an object column
    if 'cleaned' not in df.columns:
        df['cleaned'] = None
    df['cleaned'] = df['cleaned'].astype(object)

    strip_punct = str.maketrans('', '', string.punctuation)

    for h in range(len(df)):
        element = df.loc[h, 'element']

        # split element on capital letters
        fragments = re.findall('[A-Z][^A-Z]*', element)

        # fragments holding at most one character besides digits and punctuation
        # (e.g. the letters of an acronym) are glued together
        merged = []
        pending = ''
        for fragment in fragments:
            content = [ch for ch in fragment.translate(strip_punct) if not ch.isdigit()]
            if len(content) > 1:
                if pending != '':
                    merged.append(pending)
                    pending = ''
                merged.append(fragment)
            else:
                pending = pending + fragment
        if pending != '':
            merged.append(pending)

        # accept the split only if it consists of several words and covers the whole
        # element except for its first character
        if len(merged) > 1 and sum(len(word) for word in merged) == len(element) - 1:
            df.loc[h, 'cleaned'] = ' '.join(merged).rstrip(' ')

        # elements without an accepted split keep their text without the leading character
        if pd.isnull(df.loc[h, 'cleaned']):
            stripped = element.lstrip(character)

            # mark account name elements
            if character == '@':
                stripped = 'M_' + stripped

            df.loc[h, 'cleaned'] = stripped

    # save dataframe
    df.to_excel(joint_dir + character + '.xlsx', index=False)

In [None]:
def collect_hashtags_mentions(data, topic, twint_dir, joint_dir):

    """
    Gathers the hashtags and mentions used inside the tweets of a dataset and saves their
    counts as '<joint_dir><topic>_#.csv' and '<joint_dir><topic>_@.csv'. Hashtags, mentions,
    and links at the very end of a tweet are not taken into account.
    INPUTS:  data:          DATAFRAME, dataframe containing tweets
             topic:         STRING, name of dataframe topic
             twint_dir:     STRING, directory of results from TWINT scraping (not used here)
             joint_dir:     STRING, directory of event analysis
    OUTPUTS: hashtags:      LIST, contains all hashtags of data
             mentions:      LIST, contains all mentions of data
    """

    trailing_prefixes = ('#', '@', 'http')
    hashtags, mentions = [], []

    # tweets with no content are left out
    texts = data.dropna(subset=['tweet']).reset_index(drop=True)['tweet']

    for text in texts:
        tokens = tweet_split(text)

        # cut off the block of hashtags, mentions, and links that closes the tweet
        while tokens and tokens[-1].startswith(trailing_prefixes):
            tokens.pop()

        # sort what is left into the two lists
        hashtags.extend(token for token in tokens if token.startswith('#'))
        mentions.extend(token for token in tokens if token.startswith('@'))

    # count the elements, then save both tables
    hashtag_counts = pd.DataFrame(hashtags).value_counts().rename_axis('element').reset_index(name='count')
    mention_counts = pd.DataFrame(mentions).value_counts().rename_axis('element').reset_index(name='count')
    hashtag_counts.to_csv(joint_dir+topic+'_#.csv',index=False)
    mention_counts.to_csv(joint_dir+topic+'_@.csv',index=False)

    return hashtags, mentions

In [None]:
def clean_hashtags_mentions(twint_dir, joint_dir, topic0, topic1):

    """
    Lists the hashtags and mentions of both topics together with their counts and
    lets normalize suggest a normalization for each of them.
    INPUTS:  twint_dir:     STRING, directory of results from TWINT scraping
             joint_dir:     STRING, directory of event analysis
             topic0:        STRING, name of topic related to extreme 0
             topic1:        STRING, name of topic related to extreme 1
    """

    # make sure the output folder is available
    if not os.path.exists(joint_dir):
        os.makedirs(joint_dir)

    # import both datasets before anything is written
    topics = (topic0, topic1)
    datasets = [pd.read_csv(twint_dir+ '/' + topic + '/final_no_duplicates.csv') for topic in topics]

    # collect hashtags and mentions per topic and chain them, extreme 0 first
    hashtags, mentions = [], []
    for topic, dataset in zip(topics, datasets):
        topic_hashtags, topic_mentions = collect_hashtags_mentions(dataset, topic, twint_dir, joint_dir)
        hashtags += topic_hashtags
        mentions += topic_mentions

    # one table per character: counted elements plus an empty column 'cleaned'
    tables = []
    for character, elements in (('#', hashtags), ('@', mentions)):
        table = pd.DataFrame(elements).value_counts().rename_axis('element').reset_index(name='count')
        table['cleaned'] = np.nan
        tables.append((character, table))

    # normalize hashtags and mentions
    for character, table in tables:
        normalize(table, joint_dir, character)

In [None]:
def stop_words(own, a):

    """
    Builds the stop word list: the English stop words of the nltk-package, stripped of
    punctuation, followed by the own stop words exactly as passed in.
    INPUTS:  own:     LIST, own stop words
             a:       STRING, special character '’'
    OUTPUT:  stop_np: LIST, stop words
    """

    strip_punct = str.maketrans('', '', string.punctuation)

    # the special character becomes a plain apostrophe, which is removed with the other punctuation
    stop_np = [word.replace(a, "'").translate(strip_punct) for word in stopwords.words('english')]

    # own stop words go to the end of the list
    stop_np.extend(own)

    return stop_np

In [None]:
def ct1_rep_hashtags_mentions(data, joint_dir, a, b, c):

    """
    Removes emojis, punctuation, links, mark mentions, normalizes hashtags, and sets everything to lower-case letters.
    Hashtags, mentions, and links at the very end of a tweet are removed. Remaining hashtags are
    replaced by their entry in column 'cleaned' of '<joint_dir>#.xlsx' (matched case-insensitively,
    first row wins); hashtags without a usable entry are kept with the prefix 'h_'. Mentions get
    the prefix 'm_'. The tweets are written back into data, whose rows are addressed by position.
    INPUTS:  data:         DATAFRAME, contains tweets
             joint_dir:    STRING, directory of event analysis
             a:            STRING, special character '’'
             b:            STRING, special character '”'
             c:            STRING, special character '“'
    OUTPUT:  data:         DATAFRAME, contains edited tweets
    """

    # import hashtag normalization suggestions
    h_rep = pd.read_excel(joint_dir + '#.xlsx')

    # build the lookup once instead of scanning the whole table for every hashtag;
    # rows with an empty 'cleaned' cell are skipped so that these hashtags fall back
    # to the 'h_' form instead of crashing
    replacements = {}
    for raw, cleaned in zip(h_rep['element'], h_rep['cleaned']):
        if isinstance(raw, str) and not pd.isnull(cleaned):
            replacements.setdefault(raw.lower(), str(cleaned))

    strip_punct = str.maketrans('', '', string.punctuation)
    trailing_prefixes = ('#', '@', 'http')

    for tweet in range(len(data)):

        # remove emojis (everything outside ASCII) and the special characters, then split;
        # non-ASCII special characters are already gone after the ASCII step
        text = data.loc[tweet, 'tweet'].encode('ascii', 'ignore').decode('ascii')
        text = text.replace(a, "").replace(b, "").replace(c, "")
        split = tweet_split(text)

        # remove all mentions and hashtags at the end of the tweet
        while split and split[-1].startswith(trailing_prefixes):
            del split[-1]

        # nothing left: mark the tweet for removal
        if not split:
            data.loc[tweet, 'tweet'] = np.nan
            continue

        # replace hashtags and mentions; remove links and punctuation
        parts = []
        for element in split:
            if element.startswith('#'):
                replacement = replacements.get(element.lower())
                if replacement is not None:
                    parts.append(replacement.translate(strip_punct))
                else:
                    parts.append('h_' + element.translate(strip_punct))
            elif element.startswith('@'):
                parts.append('m_' + element.translate(strip_punct))
            elif element.startswith('http'):
                continue
            else:
                parts.append(element.translate(strip_punct))

        # save edited tweet and set to lower-case letters
        data.loc[tweet, 'tweet'] = ' '.join(parts).lower().rstrip(' ')

    # drop empty tweets
    data = data.dropna(subset=['tweet']).reset_index(drop=True)

    return data

In [None]:
def ct2_mcn_random(data):

    """
    Performs Multiple Character Normalization (MCN), and removes random letter sequences.
    A character repeated three or more times in a row is reduced to a single character and
    the changed element is passed through the spell checker. Elements that are more than four
    times as long as their number of distinct characters are dropped. Mentions (prefix 'm_')
    are passed on unchanged. The tweets are written back into data, whose rows are addressed
    by position.
    INPUT:   data:         DATAFRAME, contains tweets
    OUTPUT:  data:         DATAFRAME, contains edited tweets
    """

    # initialize spell checker
    spell = SpellChecker()
    strip_punct = str.maketrans('', '', string.punctuation)

    # corrections found so far, so that recurring elements are corrected only once
    corrections = {}

    for tweet in range(len(data)):

        kept = []
        for element in tweet_split(data.loc[tweet, 'tweet']):

            # pass mentions
            if element.startswith('m_'):
                kept.append(element)
                continue

            # multiple character normalization
            replaced = re.sub(r'(.)\1{2,}', r'\1', element)

            # drop random letter sequences
            if replaced and len(replaced) / len(set(replaced)) > 4:
                continue

            # apply spell checker to replaced elements
            if replaced and element != replaced:
                if replaced not in corrections:
                    corrected = spell.correction(replaced)
                    # newer versions of the spell checker return None when they find no
                    # correction; keep the normalized element in that case
                    if corrected is None:
                        corrected = replaced
                    corrections[replaced] = corrected.translate(strip_punct)
                kept.append(corrections[replaced])
            else:
                kept.append(replaced)

        # save edited tweet
        data.loc[tweet, 'tweet'] = ' '.join(kept).rstrip(' ')

    return data

In [None]:
def ct3_stop_stem(data, a):

    """
    Removes stop words and applies the Porter stemmer.
    Mentions (prefix 'm_') are passed on without stemming. The tweets are written back
    into data, whose rows are addressed by position.
    INPUTS:  data:         DATAFRAME, contains tweets
             a:            STRING, special character '’'
    OUTPUT:  data:         DATAFRAME, contains edited tweets
    """

    # initialize Porter stemmer
    stemmer = PorterStemmer()

    # create list of stop words
    own_stopwords = ['cannot', 'gonna', 'gotta', 'im', 'ive', 'like', 'cant', 'whats', 'wanna', 'us', 'amp', 'lets', 'gimme', 'gimmee']

    # complete stop word list as a set: every element of every tweet is looked up in it
    stop_set = set(stop_words(own_stopwords, a))

    for tweet in range(len(data)):

        kept = []
        for element in tweet_split(data.loc[tweet, 'tweet']):

            # remove stop words
            if element in stop_set:
                continue

            # pass mentions
            if element.startswith('m_'):
                kept.append(element)

            # stem remaining elements
            else:
                kept.append(stemmer.stem(element))

        # save edited tweet
        data.loc[tweet, 'tweet'] = ' '.join(kept).rstrip(' ')

    return data

In [None]:
def clean_tweets(twint_dir, joint_dir, topic, a, b, c):

    """
    Runs the three cleaning stages on the tweets of a topic. The result of stage 1b is
    saved as '<joint_dir><topic>_cleaned.csv', the final result as '<joint_dir><topic>_stemmed.csv'.
    INPUTS:  twint_dir:    STRING, directory of results from TWINT scraping
             joint_dir:    STRING, directory of event analysis
             topic:        STRING, name of topic
             a:            STRING, special character '’'
             b:            STRING, special character '”'
             c:            STRING, special character '“'
    OUTPUT:  data:         DATAFRAME, contains edited tweets
    """

    output_prefix = joint_dir+topic

    # import data, tweets without content are left out
    tweets = pd.read_csv(twint_dir+ '/' + topic + '/final_no_duplicates.csv')
    tweets = tweets.dropna(subset=['tweet']).reset_index(drop=True)

    # stage 1a
    print('Stage 1a: Cleaning hashtags and mentions; removing emojis, links, punctuation, and capitalizations...')
    tweets = ct1_rep_hashtags_mentions(tweets, joint_dir, a, b, c)
    print('Stage 1a completed.')

    # stage 1b, its result is kept as the cleaned dataset
    print('Stage 1b: MCN and removing random letter sequences...')
    tweets = ct2_mcn_random(tweets)
    tweets.to_csv(output_prefix+'_cleaned.csv',index=False)
    print('Stage 1b completed.')

    # stage 1c
    print('Stage 1c: Removing stop words and applying the Porter stemmer...')
    tweets = ct3_stop_stem(tweets, a)
    print('Stage 1c completed.')

    # save the stemmed dataset
    tweets.to_csv(output_prefix+'_stemmed.csv',index=False)
    return tweets

In [None]:
def make_ngrams(df, column, n, joint_dir, topic):

    """
    Generates n-grams, counts them, and saves the counts as '<joint_dir><topic>_<n>grams.csv'.
    INPUTS:  df:           DATAFRAME, contains data
             column:       STRING, indicates column of df for which n-grams should be generated
             n:            INTEGER, 2 for bigrams; 3 for trigrams (other positive values work as well)
             joint_dir:    STRING, directory of event analysis
             topic:        STRING, name of topic
    """

    if n < 1:
        raise ValueError('n has to be a positive integer, got ' + str(n))

    # only a missing text makes a row unusable; missing values in other columns do not matter here
    df = df.dropna(subset=[column]).reset_index(drop=True)

    # generate list of n-grams; nltk.bigrams and nltk.trigrams are nltk.ngrams with n=2 and n=3
    all_ngrams = []
    for text in df[column]:
        all_ngrams.extend(nltk.ngrams(nltk.word_tokenize(text), n))

    # count n-grams
    ngramsdf = pd.Series(all_ngrams, dtype=object).value_counts().rename_axis('ngrams').reset_index(name='count')

    # save
    ngramsdf.to_csv(joint_dir+topic+'_'+str(n)+'grams.csv',index=False)

In [None]:
def ngrams(twint_dir, topic, joint_dir, n, a, b, c):

    """
    Cleans the tweets of a topic and generates their n-grams. If the stemmed dataset
    '<joint_dir><topic>_stemmed.csv' exists already, it is loaded instead of cleaning again.
    INPUTS:  twint_dir:    STRING, directory of results from TWINT scraping
             topic:        STRING, name of topic
             joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams
             a:            STRING, special character '’'
             b:            STRING, special character '”'
             c:            STRING, special character '“'
    """

    ruler = '---------------------------------------------'
    stemmed_file = joint_dir+topic+'_stemmed.csv'

    # stage 1: reuse the stemmed tweets of an earlier run where possible
    print(ruler)
    print('Stage 1: Cleaning the Tweets')
    if os.path.exists(stemmed_file):
        tweets_cleaned = pd.read_csv(stemmed_file)
    else:
        tweets_cleaned = clean_tweets(twint_dir, joint_dir, topic, a, b, c)
    print('Stage 1 completed.')

    # stage 2: n-grams of the stemmed tweets
    print(ruler)
    print('Stage 2: Creating N-grams')
    make_ngrams(tweets_cleaned, 'tweet', n, joint_dir, topic)
    print('Stage 2 completed.')
    print('N-grams created.')

### 3. Phrases

In [None]:
def _match_phrases(stemmer, n, stemmed_tweet, cleaned_tweet, rows):

    """
    Appends one (n-gram, phrase) pair to rows for every n-gram of a stemmed tweet that contains
    no marked element. Raises (e.g. IndexError) if an n-gram cannot be located in the cleaned
    tweet; pairs found up to that point stay in rows.
    """

    strip_punct = str.maketrans('', '', string.punctuation)
    marked = ('m_', 'h_')

    # generate n-grams
    tokens = nltk.word_tokenize(stemmed_tweet)
    i_ngrams = list(nltk.bigrams(tokens)) if n == 2 else list(nltk.trigrams(tokens))

    # split cleaned tweet; i is the search position in it and is shared by all n-grams of the tweet
    l = cleaned_tweet.split(' ')
    i = 0

    for ngram in i_ngrams:

        # skip over marked elements
        if any(part.startswith(marked) for part in ngram):
            continue

        # find start: first word from the search position whose stem is the first n-gram element
        while stemmer.stem(l[i]) != ngram[0]:
            i = i + 1
        phrase = l[i]
        i_save = i + 1
        i = i_save

        # find end: every word up to and including the one whose stem is the last n-gram
        # element belongs to the phrase (this keeps the stop words in between)
        while stemmer.stem(l[i]) != ngram[n - 1]:
            phrase = phrase + ' ' + l[i]
            i = i + 1
        phrase = phrase + ' ' + l[i]

        # bigrams continue at the end word, trigrams one word behind their start word
        if n == 3:
            i = i_save

        # save phrase
        phrase = phrase.translate(strip_punct)
        phrase = ' '.join(phrase.rstrip(' ').split())
        rows.append((ngram, phrase))


def ngrams_to_phrases(n, joint_dir, topic, data, tweets, column):

    """
    Finding phrases belonging to n-grams.
    Intermediate results are saved in '<joint_dir><topic>_<n>_phrases_build/' after every 1000
    tweets; only the files written during the current call are merged, so that no batch is counted
    twice and files of earlier runs do not leak into the result. The counted phrases are saved as
    '<joint_dir><topic>_phrases_<n>grams.csv'. The index of a tweet that cannot be processed is printed.
    INPUTS:  n:            INTEGER, 2 for bigrams; 3 for trigrams
             joint_dir:    STRING, directory of event analysis
             topic:        STRING, name of topic
             data:         DATAFRAME, containing cleaned tweets (row i belongs to row i of tweets)
             tweets:       DATAFRAME, containing stemmed tweets
             column:       STRING, indicates column where tweets are saved
    """

    if n not in (2, 3):
        raise ValueError('n has to be 2 (bigrams) or 3 (trigrams), got ' + str(n))

    # initialize Porter stemmer
    stemmer = PorterStemmer()

    # create non-existent folder
    build_dir = joint_dir+topic+'_'+str(n)+'_phrases_build/'
    if not os.path.exists(build_dir):
        os.makedirs(build_dir)

    rows = []
    chunk_paths = []
    last_tweet = len(tweets) - 1

    for tweet in range(len(tweets)):

        # if there is a tweet with only removed words, this tweet has to be skipped
        stemmed_tweet = tweets.loc[tweet, column]
        if isinstance(stemmed_tweet, str):
            try:
                _match_phrases(stemmer, n, stemmed_tweet, data.loc[tweet, column], rows)
            except Exception:
                # the tweets could not be aligned; report the tweet and go on
                print(tweet)

        # save every 1000 tweets and at the end, then start a new batch
        on_boundary = tweet % 1000 == 0
        if on_boundary or tweet == last_tweet:
            chunk_no = tweet // 1000 if on_boundary else tweet // 1000 + 1
            chunk_path = build_dir + 'phr' + str(chunk_no) + '.csv'
            chunk = pd.DataFrame({'ngram': pd.Series([row[0] for row in rows], dtype=object),
                                  'phrase': pd.Series([row[1] for row in rows], dtype=object)})
            chunk.to_csv(chunk_path, index=False)
            chunk_paths.append(chunk_path)
            rows = []

    # merge individual saves of this run (read back from file, as before), save final phrases document
    frames = [pd.read_csv(chunk_path) for chunk_path in chunk_paths]
    frames = [frame for frame in frames if not frame.empty]
    if frames:
        phrases = pd.concat(frames, ignore_index=True)
    else:
        phrases = pd.DataFrame(columns=['ngram', 'phrase'])
    phrases = phrases.groupby(['ngram','phrase']).size().reset_index(name='count').sort_values(by='count',ascending=False).reset_index(drop=True)
    phrases.to_csv(joint_dir+topic+'_phrases_'+str(n)+'grams.csv',index=False)

In [None]:
def compute_phrases(joint_dir, topic, n):

    """
    Shortcut for ngrams_to_phrases: loads the cleaned and the stemmed tweets of a topic
    from joint_dir and finds the phrases belonging to their n-grams.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             topic:        STRING, name of topic
             n:            INTEGER, 2 for bigrams; 3 for trigrams
    """

    file_prefix = joint_dir + topic

    # cleaned tweets supply the wording of the phrases, stemmed tweets the n-grams
    cleaned_tweets = pd.read_csv(file_prefix + '_cleaned.csv')
    stemmed_tweets = pd.read_csv(file_prefix + '_stemmed.csv')

    ngrams_to_phrases(n, joint_dir, topic, cleaned_tweets, stemmed_tweets, 'tweet')

    print('Phrases computed.')

### 4. Chi-squared

In [None]:
def chi_squared(topic0, topic1, joint_dir, n, max_ngrams=10000):

    """
    Compute chi2-value for every phrase.
    The n-gram counts of both topics are combined and reduced to the most frequent n-grams.
    N-grams with elements starting with h_ or m_ and n-grams with elements of only 1 or 2
    characters are dropped after the computation. The result is saved as
    '<joint_dir>chi2_test_<n>grams.csv'.
    INPUTS:  topic0:       STRING, name of topic relating to extreme 0
             topic1:       STRING, name of topic relating to extreme 1
             joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams
             max_ngrams (optional): INTEGER, number of most frequent n-grams entering the
                           computation (n-grams tied with the last one are kept as well)
    OUTPUT:  counts:       DATAFRAME, contains chi2-values
    """

    if max_ngrams < 1:
        raise ValueError('max_ngrams has to be a positive integer, got ' + str(max_ngrams))

    # set up counts-dataframe containing combined n-gram counts
    data1 = pd.read_csv(joint_dir+topic1+'_'+str(n)+'grams.csv').rename(columns={'count': 'count_1'})
    data0 = pd.read_csv(joint_dir+topic0+'_'+str(n)+'grams.csv').rename(columns={'count': 'count_0'})
    counts = pd.merge(data0, data1, on='ngrams', how='outer').fillna(0)
    counts['count_comb'] = counts['count_1'] + counts['count_0']
    counts = counts.sort_values(by='count_comb', ascending=False).reset_index(drop=True)

    # reduce to highest n-gram counts; with fewer than max_ngrams n-grams all of them are kept
    if len(counts) > 0:
        cutoff = counts.loc[min(max_ngrams, len(counts)) - 1, 'count_comb']
        counts = counts.loc[counts['count_comb'] >= cutoff].reset_index(drop=True)

    # counts per n-gram and extreme (f_p) and counts of all other n-grams of that extreme (f_np);
    # floats keep the products below from overflowing
    fp0 = counts['count_0'].astype(float)
    fp1 = counts['count_1'].astype(float)
    fnp0 = fp0.sum() - fp0
    fnp1 = fp1.sum() - fp1

    # calculate chi2-values
    num = ((fp0 * fnp1) - (fp1 * fnp0)) ** 2
    denom = (fp0 + fp1) * (fp0 + fnp0) * (fp1 + fnp1) * (fnp0 + fnp1)
    counts['chi2'] = num / denom

    # sort by chi2-values and drop n-grams containing elements starting with h_ or m_
    counts = counts.sort_values(by='chi2', ascending=False).reset_index(drop=True)
    counts = counts.loc[~counts['ngrams'].str.contains('h_|m_')].reset_index(drop=True)

    # drop all n-grams containing elements with only 1 or 2 characters
    strip_punct = str.maketrans('', '', string.punctuation)
    has_short = counts['ngrams'].apply(
        lambda ngram: any(len(element) < 3 for element in ngram.translate(strip_punct).split())
    ).astype(bool)
    counts = counts.loc[~has_short].reset_index(drop=True)

    # save and return
    counts.to_csv(joint_dir+'chi2_test_'+str(n)+'grams.csv', index=False)
    return counts

In [None]:
def selected_counts(counts, topic0, topic1, joint_dir, n, select, a, b, c):

    """
    Selects a given number of n-grams/phrases based on chi2-values.
    Takes the first 2000 rows of counts, attaches the one or two most frequent phrases of each n-gram,
    drops n-grams with a hashtag ratio of 0.2 or more and saves the select/2 n-grams with the highest
    chi2-values per extreme as joint_dir + 'selected_counts_<n>grams.csv'. Nothing is returned.
    INPUTS:  counts:       DATAFRAME, contains chi2-values (rows are addressed by the labels 0, 1, 2, ...)
             topic0:       STRING, name of topic relating to extreme 0
             topic1:       STRING, name of topic relating to extreme 1
             joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams
             select:       INTEGER, defines number of n-grams/phrases to be selected.
             a:            STRING, special character '’'
             b:            STRING, special character '”'
             c:            STRING, special character '“'
    RAISES:  ValueError, if n is neither 2 nor 3
    """

    # fail early on an unsupported n instead of running into an undefined variable further down
    if n == 2:
        make_ngrams = nltk.bigrams
    elif n == 3:
        make_ngrams = nltk.trigrams
    else:
        raise ValueError('n must be 2 (bigrams) or 3 (trigrams).')

    # translation table removing all punctuation characters
    no_punct = str.maketrans('', '', string.punctuation)

    # import and combine phrases (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
    phrases0 = pd.read_csv(joint_dir + topic0 + '_phrases_' + str(n) + 'grams.csv')
    phrases1 = pd.read_csv(joint_dir + topic1 + '_phrases_' + str(n) + 'grams.csv')
    phrases = pd.concat([phrases0, phrases1]).reset_index(drop=True)
    phrases = phrases.rename(columns={'count': 'counts'})
    phrases = phrases.groupby(['ngram','phrase']).counts.sum().reset_index().sort_values(by='counts',ascending=False).reset_index(drop=True)

    # take the first 2000 rows of counts (explicit copy, so new columns are not written to a slice of counts)
    # and find the phrases belonging to each n-gram
    selected = counts[0:2000].copy()
    for i in range(len(selected)):
        ngram = selected.loc[i,'ngrams']
        phrases_x = phrases.loc[phrases['ngram']==ngram].sort_values(by='counts',ascending=False).reset_index(drop=True)
        # a phrase is only used if it has more than 3 characters without spaces
        if len(phrases_x.loc[0,'phrase'].replace(' ',''))>3:
            selected.loc[i,'phrase_freq_1'] = phrases_x.loc[0,'phrase']
            if len(phrases_x)>1:
                # second phrase has to be used more than 5 times to be considered
                if phrases_x.loc[1,'counts']>5:
                    if len(phrases_x.loc[1,'phrase'].replace(' ',''))>3:
                        selected.loc[i,'phrase_freq_2'] = phrases_x.loc[1,'phrase']

    # compute hashtag ratio and delete hashtag-dominated n-grams (same as in clean_tweets)
    hashtags = pd.read_excel(joint_dir+'#.xlsx')
    hashtags.columns = ['element', 'count', 'tweet']

    # step 1: drop non-ascii characters and punctuation, convert to lower case
    for tweet in range(len(hashtags)):
        text = hashtags.loc[tweet,'tweet'].encode('ascii', 'ignore').decode('ascii').replace(a,"").replace(b,"").replace(c,"")
        tokens = [token.translate(no_punct) for token in tweet_split(text)]
        hashtags.loc[tweet,'tweet'] = ' '.join(tokens).lower().rstrip(' ')

    # step 2: collapse characters repeated three or more times and spell-check the collapsed tokens
    spell = SpellChecker()
    for tweet in range(len(hashtags)):
        split = tweet_split(hashtags.loc[tweet,'tweet'])
        new_tweet = ''
        for element in range(len(split)):
            replaced = re.sub(r'(.)\1{2,}', r'\1', split[element])
            if len(replaced)>0:
                # skip tokens consisting of very few distinct characters
                if len(replaced)/len(set(replaced))>4:
                    continue
                else:
                    if split[element]!=replaced:
                        # correction() may return None if no correction is found; keep the collapsed token then
                        corrected = spell.correction(replaced) or replaced
                        new_tweet = new_tweet + corrected.translate(no_punct) + ' '
                    else:
                        new_tweet = new_tweet + replaced + ' '
            # NOTE(review): this write sits inside the token loop, so a tweet whose tokens are all skipped
            # above keeps its previous text; kept as is to stay in line with clean_tweets - confirm
            hashtags.loc[tweet,'tweet'] = new_tweet.lower().rstrip(' ')

    # step 3: remove stopwords and stem the remaining tokens
    stemmer = PorterStemmer()
    own_stopwords = ['cannot', 'gonna', 'gotta', 'im', 'ive', 'like', 'cant', 'whats', 'wanna', 'us', 'amp', 'lets', 'gimme', 'gimmee']
    stop_np = stop_words(own_stopwords, a)
    for tweet in range(len(hashtags)):
        stems = [stemmer.stem(token) for token in tweet_split(hashtags.loc[tweet,'tweet']) if token not in stop_np]
        hashtags.loc[tweet,'tweet'] = ' '.join(stems).rstrip(' ')

    # step 4: store the n-grams of each hashtag tweet in the columns ngram0, ngram1, ...
    for i in range(len(hashtags)):
        i_ngrams = list(make_ngrams(nltk.word_tokenize(hashtags.loc[i,'tweet'])))
        for j, i_ngram in enumerate(i_ngrams):
            hashtags.loc[i,'ngram'+str(j)] = str(i_ngram)

    # step 5: hashtag ratio = summed count of hashtag rows containing the n-gram / combined n-gram count
    for i in range(len(selected)):
        hashtags_x = hashtags[hashtags.eq(selected.loc[i,'ngrams']).any(axis=1)]
        selected.loc[i,'hashtag_ratio'] = sum(hashtags_x['count'])/selected.loc[i,'count_comb']
    selected = selected.loc[selected['hashtag_ratio']<0.2].reset_index(drop=True)

    # select equal amount of ngrams that are more likely to be used by one extreme
    # (counts are compared relative to the number of stemmed tweets per extreme)
    tweets0 = pd.read_csv(joint_dir + topic0 + '_stemmed.csv')
    tweets1 = pd.read_csv(joint_dir + topic1 + '_stemmed.csv')
    selected['most_likely'] = (selected['count_0']/len(tweets0) < selected['count_1']/len(tweets1)).astype(int)
    select0 = selected.loc[selected['most_likely']==0].sort_values(by='chi2',ascending=False).reset_index(drop=True)[0:int(select/2)]
    select1 = selected.loc[selected['most_likely']==1].sort_values(by='chi2',ascending=False).reset_index(drop=True)[0:int(select/2)]
    comb = pd.concat([select0, select1]).reset_index(drop=True)

    # save selected n-grams
    comb.to_csv(joint_dir+'selected_counts_'+str(n)+'grams.csv',index=False)

In [None]:
def chi_and_select(topic0, topic1, joint_dir, n, select, a, b, c):

    """
    Runs the phrase selection in two stages: chi2-test first, selection of n-grams/phrases second.
    Progress is reported on the console; nothing is returned.
    INPUTS:  topic0:       STRING, name of topic relating to extreme 0
             topic1:       STRING, name of topic relating to extreme 1
             joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams
             select:       INTEGER, defines number of n-grams/phrases to be selected.
             a:            STRING, special character '’'
             b:            STRING, special character '”'
             c:            STRING, special character '“'
    """

    divider = '---------------------------------------------'

    # stage 1: compute the chi2-values
    print(divider, 'Stage 1: Chi-Squared Test', sep='\n')
    chi2_table = chi_squared(topic0, topic1, joint_dir, n)
    print('Stage 1 completed.')

    # stage 2: select n-grams/phrases based on the chi2-values of stage 1
    print(divider, 'Stage 2: Select phrases.', sep='\n')
    selected_counts(chi2_table, topic0, topic1, joint_dir, n, select, a, b, c)
    print('Stage 2 completed.', 'Phrases selected.', sep='\n')

### 5. GDELT

In [None]:
def all_articles(domain, start, end, gdelt_dir):

    """
    Finds all articles of a given outlet in a given time frame.
    Queries the GDELT-API in windows of one hour (at most one request every 5 seconds), drops articles
    with duplicate urls and saves the result as gdelt_dir + 'domains/<domain name>_<start>_<end>.csv'.
    The last window ends at 00:00 of the end date, i.e. the end date itself is not searched.
    INPUTS:  domain:      STRING, domain
             start:       STRING, start date in yymmdd-format
             end:         STRING, end date in yymmdd-format
             gdelt_dir:   STRING, directory where GDELT-results are saved
    """

    # define base url for GDELT-API
    base = 'https://api.gdeltproject.org/api/v2/doc/doc?query='

    # convert dates
    step = timedelta(hours=1)
    start_dt = datetime.datetime.strptime(start,"%y%m%d")
    end_dt = datetime.datetime.strptime(end,"%y%m%d")
    iter_dt = start_dt

    # results of the single searches are collected and combined once at the end
    # (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call)
    cols = ['url','title','seendate','domain']
    frames = []
    total = 0

    # iterate in steps of 1 hour
    while iter_dt+step <= end_dt:

        # define start and end time for current search (yyyymmddhhmmss)
        cur_start = iter_dt.strftime('%Y%m%d%H%M%S')
        cur_end = (iter_dt+step).strftime('%Y%m%d%H%M%S')

        # generate url
        # NOTE(review): sourcelang/sourcecountry are attached with '&' as separate url parameters and not
        # as part of the query itself - confirm against the GDELT documentation that they take effect
        url = base+'domainis:'+domain+'&sourcelang:english&sourcecountry:us&format=json&STARTDATETIME='+cur_start+'&ENDDATETIME='+cur_end+'&MAXRECORDS=250'

        # start time
        starttime = time.time()

        # visit url and reduce the reply to the list of articles; lstrip/rstrip remove leading/trailing
        # characters out of the given sets, so an empty reply '{}' ends up as ''
        r = requests.get(url)
        j = r.text.lstrip('{"articles": ').rstrip(' }')
        if j != '':
            results = pd.read_json(StringIO(j))
            frames.append(results[cols])
            total = total + len(results)
            print(cur_start+': '+str(len(results))+' articles found. Total articles: '+str(total)+'.')
        else:
            print(cur_start+': No new articles found.')
        iter_dt = iter_dt+step

        # check elapsed time, if less than 5 seconds: implement a delay
        elapsedtime = time.time()-starttime
        if elapsedtime < 5:
            time.sleep(5-elapsedtime)

    # combine all results
    if frames:
        articles = pd.concat(frames, ignore_index=True)
    else:
        articles = pd.DataFrame(columns=cols)

    # drop duplicate articles
    n_before = len(articles)
    articles = articles.drop_duplicates(subset=['url'])
    print('Deleted '+str(n_before-len(articles))+' articles. '+str(len(articles))+' remaining.')

    # generate non-existent folder
    if not os.path.exists(gdelt_dir+'domains/'):
        os.makedirs(gdelt_dir+'domains/')

    # save articles
    articles.to_csv(gdelt_dir+'domains/'+domain.split('.')[0]+'_'+start+'_'+end+'.csv',index=False)

In [None]:
def single_phrase(domain, phrase, start, end, phrase2=0):

    """
    Finds all articles of a given outlet in a given time frame containing one or two phrases.
    Queries the GDELT-API in windows of one day (at most one request every 5 seconds) and drops articles
    with duplicate urls. The last window ends at 00:00 of the end date, i.e. the end date itself is not searched.
    INPUTS:  domain:             STRING, domain
             phrase:             STRING, phrase 1 that has to be mentioned for an article to be found
             start:              STRING, start date in yymmdd-format
             end:                STRING, end date in yymmdd-format
             phrase2 (optional)  STRING, phrase 2; if given, articles mentioning phrase 1 OR phrase 2 are searched.
                                 The default 0 means that only phrase 1 is used.
    OUTPUT:  articles:           DATAFRAME, contains all articles from GDELT-API matching the input parameters
    """

    # define base url for GDELT-API
    base = 'https://api.gdeltproject.org/api/v2/doc/doc?query='

    # convert dates
    step = timedelta(days=1)
    start_dt = datetime.datetime.strptime(start,"%y%m%d")
    end_dt = datetime.datetime.strptime(end,"%y%m%d")
    iter_dt = start_dt

    # results of the single searches are collected and combined once at the end
    # (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call)
    cols = ['url','title','seendate','domain']
    frames = []
    total = 0

    # part of the query containing the phrase(s); identical for every day
    if phrase2==0:
        phrase_query = ' "'+phrase+'"'
    else:
        phrase_query = ' ("'+phrase+'"'+' OR '+' "'+phrase2+'")'

    # iterate in steps of 1 day
    while iter_dt+step <= end_dt:

        # define start and end date for current search (yyyymmdd)
        cur_start = iter_dt.strftime('%Y%m%d')
        cur_end = (iter_dt+step).strftime('%Y%m%d')

        # generate url
        url = base+'domainis:'+domain+phrase_query+'&sourcelang:english&format=json&STARTDATETIME='+cur_start+'000000&ENDDATETIME='+cur_end+'000000&MAXRECORDS=250'

        # start time
        starttime = time.time()

        # visit url and reduce the reply to the list of articles; lstrip/rstrip remove leading/trailing
        # characters out of the given sets, so an empty reply '{}' ends up as ''
        r = requests.get(url)
        j = r.text.lstrip('{"articles": ').rstrip(' }')
        if j != '':
            results = pd.read_json(StringIO(j))
            frames.append(results[cols])
            total = total + len(results)
            print(cur_start+': '+str(len(results))+' articles found. Total articles: '+str(total)+'.')
        else:
            print(cur_start+': No new articles found.')
        iter_dt = iter_dt+step

        # check elapsed time, if less than 5 seconds: implement a delay
        elapsedtime = time.time()-starttime
        if elapsedtime < 5:
            time.sleep(5-elapsedtime)

    # combine all results
    if frames:
        articles = pd.concat(frames, ignore_index=True)
    else:
        articles = pd.DataFrame(columns=cols)

    # drop duplicate articles
    n_before = len(articles)
    articles = articles.drop_duplicates(subset=['url'])
    print('Deleted '+str(n_before-len(articles))+' articles. '+str(len(articles))+' remaining.')

    return articles

In [None]:
def all_phrases(domain, start, end, selected_counts_path, gdelt_dir, joint_dir):

    """
    Searches the articles of a given outlet for every selected n-gram (via its one or two phrases).
    Results per n-gram are saved in gdelt_dir + '<start>_<end>/<domain name>/'; n-grams for which a file
    already exists in that folder are not searched again. The number of articles found per n-gram is saved
    in joint_dir + 'selected_counts_by_domain/<domain name>.csv'.
    INPUTS:  domain:                STRING, domain
             start:                 STRING, start date in yymmdd-format
             end:                   STRING, end date in yymmdd-format
             selected_counts_path:  STRING, path where selected n-grams of a certain experiment are defined
             gdelt_dir:             STRING, directory where GDELT-results are saved
             joint_dir:             STRING, directory of event analysis
    """

    # domain name without ending
    outlet = domain.split('.')[0]

    # selected n-grams and their phrases; missing phrases become 0
    table = pd.read_csv(selected_counts_path).fillna(0)

    # folder for the results of this outlet and time frame
    folder = gdelt_dir+start+'_'+end+'/'+outlet+'/'
    if not os.path.exists(folder):
        os.makedirs(folder)

    # n-grams (without punctuation) that already have a result file in the folder
    done = [filename.split('.')[0].replace('_',' ') for filename in os.listdir(folder)]

    no_punct = str.maketrans('', '', string.punctuation)

    # iterate through n-grams
    for i in range(len(table)):
        print(i)
        selected_ngram = table.loc[i,'ngrams']
        plain_ngram = selected_ngram.translate(no_punct)
        result_file = folder+plain_ngram.replace(' ','_')+'.csv'

        if plain_ngram in done:
            # already searched: only import the saved results
            print('Ngram already searched for '+domain+'.')
            articles = pd.read_csv(result_file)
        else:
            # search for the phrase(s) of the n-gram and save the results
            phrase = table.loc[i,'phrase_freq_1']
            phrase2 = table.loc[i,'phrase_freq_2']
            articles = single_phrase(domain, phrase, start, end, phrase2)
            articles['ngram'] = selected_ngram
            articles.to_csv(result_file,index=False)

        # number of articles found for the n-gram
        table.loc[i,'articles'] = len(articles)

    # generate non-existent folder and save results
    by_domain_folder = joint_dir+'selected_counts_by_domain/'
    if not os.path.exists(by_domain_folder):
        os.makedirs(by_domain_folder)
    table.to_csv(by_domain_folder+outlet+'.csv',index=False)

### 6. Measuring Slant (Gentzkow & Shapiro, 2010)

In [None]:
def gs2010_reg1(joint_dir, n, topic0, topic1):

    """
    Generates regression data and coefficients for the approach of Gentzkow & Shapiro (2010).
    For n = 2 or n = 3 the relative frequency of every selected n-gram is computed per account.
    For n = '2_3' the previously saved data of bigrams and trigrams are merged instead.
    Afterwards each frequency is regressed on a constant and the ideology of the account.
    Saves joint_dir + 'gs2010_reg1_data_<n>grams.csv' and joint_dir + 'gs2010_reg1_coefficients_<n>grams.csv'.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
             topic0:       STRING, name of topic relating to extreme 0
             topic1:       STRING, name of topic relating to extreme 1
    RAISES:  ValueError, if n is not 2, 3 or '2_3'
    """

    # import or generate selected_counts-file
    # (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
    if os.path.exists(joint_dir+'selected_counts_'+str(n)+'grams.csv'):
        selected = pd.read_csv(joint_dir+'selected_counts_'+str(n)+'grams.csv')
    else:
        sc2 = pd.read_csv(joint_dir+'selected_counts_2grams.csv')
        sc3 = pd.read_csv(joint_dir+'selected_counts_3grams.csv')
        selected = pd.concat([sc2, sc3]).reset_index(drop=True)

    # generate list of n-grams
    ngrams = list(selected['ngrams'])

    # combine stemmed tweets of both extremes
    extreme1 = pd.read_csv(joint_dir+topic1+'_stemmed.csv')
    extreme0 = pd.read_csv(joint_dir+topic0+'_stemmed.csv')
    extreme1['ideology'] = 1
    extreme0['ideology'] = 0
    tweets = pd.concat([extreme0, extreme1]).dropna().reset_index(drop=True)

    # generate list of accounts (without duplicates, order of first appearance)
    accounts = list(dict.fromkeys(list(tweets['acc'])))

    # if only bigrams or trigrams
    if n!='2_3':

        if n == 2:
            make_ngrams = nltk.bigrams
        elif n == 3:
            make_ngrams = nltk.trigrams
        else:
            raise ValueError("n must be 2, 3 or '2_3'.")

        # regression data
        reg_data = pd.DataFrame()
        reg_data['acc'] = accounts

        for acc in range(len(reg_data)):

            # set ideology (taken from the first tweet of the account) and number of tweets
            selected_tweets = tweets.loc[tweets['acc']==reg_data.loc[acc,'acc']].dropna().reset_index(drop=True)
            reg_data.loc[acc,'ideology'] = selected_tweets.loc[0,'ideology']
            reg_data.loc[acc,'tweets'] = len(selected_tweets)

            # all n-grams of a tweet, saved as one string per tweet
            selected_tweets['ngrams'] = [str(list(make_ngrams(nltk.word_tokenize(text)))) for text in selected_tweets['tweet']]

            # compute frequencies: share of the account's tweets containing the n-gram
            # the n-gram is matched literally (regex=False); the saved n-grams presumably are string
            # representations of tuples, whose parentheses would otherwise be read as regex groups
            for nn, ngram in enumerate(ngrams):
                occ = len(selected_tweets.loc[selected_tweets['ngrams'].str.contains(ngram, regex=False)])
                reg_data.loc[acc,'freq'+str(nn)] = occ/len(selected_tweets)

        # save
        reg_data.to_csv(joint_dir+'gs2010_reg1_data_'+str(n)+'grams.csv',index=False)

    # if combination of bigrams and trigrams
    else:

        # import data and merge; frequencies of bigrams come first, followed by those of trigrams
        data0 = pd.read_csv(joint_dir+'gs2010_reg1_data_2grams.csv')
        data1 = pd.read_csv(joint_dir+'gs2010_reg1_data_3grams.csv')
        reg_data = pd.merge(data0, data1, on=['acc', 'ideology', 'tweets'], how='outer')
        names = ['acc', 'ideology', 'tweets']
        for i in range(len(data0.columns)+len(data1.columns)-6):
            names.append('freq'+str(i))
        reg_data.columns = names

        # save without index, same as for bigrams/trigrams; an additional index column would change
        # the number of columns for functions that count the columns of this file
        reg_data.to_csv(joint_dir+'gs2010_reg1_data_'+str(n)+'grams.csv',index=False)

    # perform regressions: frequency of n-gram i = a + b * ideology
    coefficients = pd.DataFrame(columns=['ngram','a','b'])
    coefficients['ngram'] = ngrams
    x = sm.add_constant(pd.DataFrame(reg_data['ideology'])).astype(float)
    for i in range(len(coefficients)):
        y = pd.DataFrame(reg_data['freq'+str(i)])
        model = sm.OLS(y.astype(float), x).fit()
        # params is labelled (constant, ideology); .iloc selects by position
        coefficients.loc[i,'a'] = model.params.iloc[0]
        coefficients.loc[i,'b'] = model.params.iloc[1]

    # save regression results
    coefficients.to_csv(joint_dir+'gs2010_reg1_coefficients_'+str(n)+'grams.csv',index=False)

In [None]:
def gs2010_est_twitter_ideologies(joint_dir, n):

    """
    Estimates ideologies of the Twitter accounts in the bubbles with the approach of Gentzkow & Shapiro (2010).
    The estimate of an account is sum(b_i * (freq_i - a_i)) / sum(b_i^2) over all n-grams i. It is rounded to the
    nearest extreme (0 if below 0.5, else 1) and compared to the true ideology. Correlation and share of correct
    rounded estimates are printed; the estimates are saved as joint_dir + 'gs2010_est_acc_ideologies_<n>grams.csv'.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
    """

    # import regression data and coefficients
    reg_data = pd.read_csv(joint_dir+'gs2010_reg1_data_'+str(n)+'grams.csv')
    coefficients = pd.read_csv(joint_dir+'gs2010_reg1_coefficients_'+str(n)+'grams.csv')

    # generate dataframe showing true account ideologies
    # (explicit copy, so the new columns are written to an independent dataframe and not to a slice of reg_data)
    acc_ideologies = reg_data[['acc', 'ideology']].copy()

    # calculate estimated ideology for each account; as well as round estimated value to nearest extreme; compute correct percentage
    estimates = []
    rounded_estimates = []
    correct = []
    for acc in range(len(acc_ideologies)):
        num = 0
        denom = 0
        for i in range(len(coefficients)):
            b_i = coefficients.loc[i, 'b']
            num = num + b_i * (reg_data.loc[acc,'freq'+str(i)] - coefficients.loc[i, 'a'])
            denom = denom + b_i**2
        # the estimate is computed once per account, after all n-grams have been added up
        estimate = num / denom

        # rounded_est
        rounded = 0.0 if estimate < 0.5 else 1.0

        estimates.append(estimate)
        rounded_estimates.append(rounded)
        correct.append(1.0 if rounded == acc_ideologies.loc[acc,'ideology'] else 0.0)

    acc_ideologies['est_ideology'] = estimates
    acc_ideologies['rounded_est'] = rounded_estimates
    acc_ideologies['correct'] = correct

    # calculate correlation between true and estimated ideologies
    corr = np.corrcoef(acc_ideologies['ideology'], acc_ideologies['est_ideology'])[0][1]

    print('Correlation true/estimated ideology: ' + str(round(corr,4)))
    print('Rounded estimate percentage correct: ' + str(round(acc_ideologies['correct'].mean()*100,4)) + '%')
    print('Variation in slant attributable to variation in ideology: ' + str(round((corr**2)*100,4)) + '%')
    print('Variation in slant attributable to noise: ' + str(round((1-corr**2)*100,4)) + '%')

    # save results
    acc_ideologies.to_csv(joint_dir+'gs2010_est_acc_ideologies_'+str(n)+'grams.csv',index=False)

In [None]:
def gs2010_ideology_estimate(domain, start, end, joint_dir, gdelt_dir, n):

    """
    Estimates ideology of an outlet with the approach of Gentzkow & Shapiro (2010).
    The frequency of an n-gram is the number of articles found for it divided by the number of all articles
    of the outlet; the estimate is sum(b_i * (freq_i - a_i)) / sum(b_i^2) over all n-grams i.
    The contributions per n-gram are saved as joint_dir + 'outlets/<domain>.xlsx'; the estimate is printed and
    added to (or replaced in) joint_dir + '/gs2010_est_outlet_ideologies_<n>grams.csv'.
    INPUTS:  domain:       STRING, domain
             start:        STRING, start date in yymmdd-format
             end:          STRING, end date in yymmdd-format
             joint_dir:    STRING, directory of event analysis
             gdelt_dir:    STRING, directory where GDELT-results are saved
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
    """

    # time frame, domain name and file ending used in the paths below
    period = start + '_' + end
    outlet = domain.split('.')[0]
    ending = str(n)+'grams.csv'
    no_punct = str.maketrans('', '', string.punctuation)

    # regression coefficients a and b per n-gram
    gentzkow_coef = pd.read_csv(joint_dir+'/gs2010_reg1_coefficients_'+ending)

    # number of all articles of the outlet in the time frame
    outlet_articles = pd.read_csv(gdelt_dir + 'domains/' + outlet + '_' + period + '.csv')
    total_articles = len(outlet_articles.loc[outlet_articles['domain']==domain])

    # add up numerator and denominator of the estimate over all n-grams
    numerator = 0
    denominator = 0
    for row in range(len(gentzkow_coef)):
        file_name = gentzkow_coef.loc[row, 'ngram'].translate(no_punct).replace(' ','_')
        found = pd.read_csv(gdelt_dir + period + '/' + outlet + '/' + file_name + '.csv')
        n_found = len(found.loc[found['domain']==domain])
        gentzkow_coef.loc[row, 'freq'] = n_found / total_articles
        contribution = gentzkow_coef.loc[row, 'b'] * (gentzkow_coef.loc[row,'freq'] - gentzkow_coef.loc[row, 'a'])
        gentzkow_coef.loc[row, 'num'] = contribution
        gentzkow_coef.loc[row, 'absnum'] = abs(contribution)
        numerator = numerator + contribution
        denominator = denominator + (gentzkow_coef.loc[row, 'b'])**2

    # save contributions per n-gram, largest absolute contribution first
    outlet_folder = joint_dir+'outlets/'
    if not os.path.exists(outlet_folder):
        os.makedirs(outlet_folder)
    gentzkow_coef.sort_values(by='absnum',ascending=False).to_excel(outlet_folder+domain+'.xlsx',index=False)

    est_ideology = numerator / denominator
    print('Estimated ideology ' + domain + ': ' + str(round(est_ideology,2)))

    # save results: append to existing estimates, an older estimate of the same domain is replaced
    results_path = joint_dir+'/gs2010_est_outlet_ideologies_'+ending
    if os.path.exists(results_path):
        save = pd.read_csv(results_path)
    else:
        save = pd.DataFrame()
    new_row = len(save)
    save.loc[new_row,'domain'] = domain
    save.loc[new_row,'ideology'] = est_ideology
    save = save.drop_duplicates(subset='domain', keep='last')
    save.to_csv(results_path, index = False)

In [None]:
def gs2010_ideology_estimates(domains, start, end, joint_dir, gdelt_dir, n):

    """
    Runs the ideology estimation of Gentzkow & Shapiro (2010) for several outlets, one after another.
    INPUTS:  domains:      LIST, domains
             start:        STRING, start date in yymmdd-format
             end:          STRING, end date in yymmdd-format
             joint_dir:    STRING, directory of event analysis
             gdelt_dir:    STRING, directory where GDELT-results are saved
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
    """

    # estimate each outlet in the given order
    for outlet_domain in domains:
        gs2010_ideology_estimate(outlet_domain, start, end, joint_dir, gdelt_dir, n)

### 7. Adjusting Outlet Phrase Frequencies

In [None]:
def gs2010_ideology_estimates_adj(domains, start, end, joint_dir, gdelt_dir, n, adj_factor=0.028):

    """
    Estimates ideologies of outlets with the adjusted approach of Gentzkow & Shapiro (2010).
    Same computation as the unadjusted approach, but the frequency of every n-gram (articles found for the
    n-gram / all articles of the outlet) is multiplied by adj_factor before it enters the estimate.
    The estimates are printed and added to (or replaced in) joint_dir + '/gs2010_adj_est_outlet_ideologies_<n>grams.csv'.
    INPUTS:  domains:                LIST, domains
             start:                  STRING, start date in yymmdd-format
             end:                    STRING, end date in yymmdd-format
             joint_dir:              STRING, directory of event analysis
             gdelt_dir:              STRING, directory where GDELT-results are saved
             n:                      INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
             adj_factor (optional):  FLOAT, factor the outlet phrase frequencies are multiplied with; the default 0.028
                                     is the value that was hard-coded before
    """

    # time frame and translation table are the same for all domains
    period = start + '_' + end
    no_punct = str.maketrans('', '', string.punctuation)
    results_path = joint_dir+'/gs2010_adj_est_outlet_ideologies_'+str(n)+'grams.csv'

    for domain in domains:

        # define domain name
        domain_stripped = domain.split('.')[0]

        # import coefficient data (read again for every domain, since a freq-column is added below)
        gentzkow_coef = pd.read_csv(joint_dir+'/gs2010_reg1_coefficients_'+str(n)+'grams.csv')

        # compute number of all articles of selected domain in selected timeframe
        all_articles = pd.read_csv(gdelt_dir + 'domains/' + domain_stripped + '_' + period + '.csv')
        all_articles = all_articles.loc[all_articles['domain']==domain].reset_index(drop=True)
        total_articles = len(all_articles)

        # compute ideology (own loop variable, the domain loop is not touched)
        num = 0
        denom = 0
        for row in range(len(gentzkow_coef)):
            ngram_cleaned = gentzkow_coef.loc[row, 'ngram'].translate(no_punct).replace(' ','_')
            articles = pd.read_csv(gdelt_dir + period + '/' + domain_stripped + '/' + ngram_cleaned + '.csv')
            articles = articles.loc[articles['domain']==domain].reset_index(drop=True)
            selected_articles = len(articles)
            gentzkow_coef.loc[row, 'freq'] = selected_articles / total_articles * adj_factor
            num = num + gentzkow_coef.loc[row, 'b'] * (gentzkow_coef.loc[row,'freq'] - gentzkow_coef.loc[row, 'a'])
            denom = denom + (gentzkow_coef.loc[row, 'b'])**2

        # NOTE(review): this is the same file name the unadjusted estimation uses for its per-outlet file,
        # so whichever runs last overwrites the other - confirm that this is intended
        if not os.path.exists(joint_dir+'outlets/'):
            os.makedirs(joint_dir+'outlets/')
        gentzkow_coef.to_excel(joint_dir+'outlets/'+domain+'.xlsx',index=False)
        est_ideology = num / denom
        print('Estimated ideology ' + domain + ': ' + str(round(est_ideology,2)))

        # save results: append to existing estimates, an older estimate of the same domain is replaced
        if not os.path.exists(results_path):
            save = pd.DataFrame()
            save.loc[0,'domain'] = domain
            save.loc[0,'ideology'] = est_ideology
            save.to_csv(results_path, index = False)
        else:
            save = pd.read_csv(results_path)
            l = len(save)
            save.loc[l,'domain'] = domain
            save.loc[l,'ideology'] = est_ideology
            save = save.drop_duplicates(subset='domain', keep='last')
            save.to_csv(results_path, index = False)

### 8. Jensen et al. (2012)

In [None]:
def jensen2012_betas(joint_dir, n):

    """
    Estimates coefficients for the approach of Jensen et al. (2012).
    The coefficient of an n-gram is the sum over all accounts of (frequency of the n-gram * ideology),
    with the ideologies coded as -1 and 1. The coefficients are saved as joint_dir + 'jensen2012_betas.csv'
    (the file name does not depend on n, so results of an earlier run are overwritten).
    INPUTS:  joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
    """

    # import data, change ideology 0 to -1
    data = pd.read_csv(joint_dir+'gs2010_reg1_data_'+str(n)+'grams.csv')
    data['ideology'] = data['ideology'].replace(0.0,-1.0)

    # number of frequency columns freq0, freq1, ...; counted by name, so additional columns
    # (e.g. an index column saved with the data) do not shift the count
    n_freq = sum(1 for column in data.columns if str(column).startswith('freq'))

    # calculate beta coefficients (values are added up account by account)
    ideology = data['ideology']
    beta_values = []
    for i in range(n_freq):
        beta_values.append(sum(freq * ideo for freq, ideo in zip(data['freq'+str(i)], ideology)))
    betas = pd.DataFrame({'beta': beta_values})

    # save results
    betas.to_csv(joint_dir+'jensen2012_betas.csv')

In [None]:
def jensen2012_est_twitter_ideologies(joint_dir, n):

    """
    Estimates ideologies of the Twitter accounts in the bubbles with the approach of Jensen et al. (2012).
    The estimate of an account is sum(beta_i * freq_i) / sum(freq_i) over all n-grams i, and 0 if the account
    uses none of the n-grams. It is rounded to the nearest extreme (-1 if below 0, else 1) and compared to the
    true ideology (coded as -1 and 1). Correlation and share of correct rounded estimates are printed; the
    estimates are saved as joint_dir + 'jensen2012_est_acc_ideologies_<n>grams.csv'.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
    """

    # import regression data and coefficients
    data = pd.read_csv(joint_dir+'gs2010_reg1_data_'+str(n)+'grams.csv')
    data['ideology'] = data['ideology'].replace(0.0,-1.0)
    betas = pd.read_csv(joint_dir+'jensen2012_betas.csv')

    # generate dataframe showing true account ideologies
    # (explicit copy, so the new columns are written to an independent dataframe and not to a slice of data)
    acc_ideologies = data[['acc', 'ideology']].copy()

    # calculate estimated ideology for each account; as well as round estimated value to nearest extreme; compute correct percentage
    estimates = []
    rounded_estimates = []
    correct = []
    for acc in range(len(acc_ideologies)):
        num = 0
        denom = 0
        for i in range(len(betas)):
            freq = data.loc[acc,'freq'+str(i)]
            num = num + betas.loc[i, 'beta'] * freq
            denom = denom + freq

        # if no n-gram used by account: estimate is set to 0 (instead of dividing by 0)
        estimate = num / denom if denom != 0 else 0.0
        if pd.isnull(estimate):
            estimate = 0.0

        # rounded_est
        rounded = -1.0 if estimate < 0 else 1.0

        estimates.append(estimate)
        rounded_estimates.append(rounded)
        correct.append(1.0 if rounded == acc_ideologies.loc[acc,'ideology'] else 0.0)

    acc_ideologies['est_ideology'] = estimates
    acc_ideologies['rounded_est'] = rounded_estimates
    acc_ideologies['correct'] = correct

    # calculate correlation between true and estimated ideologies
    corr = np.corrcoef(acc_ideologies['ideology'], acc_ideologies['est_ideology'])[0][1]

    print('Correlation true/estimated ideology: ' + str(round(corr,2)))
    print('Rounded estimate percentage correct: ' + str(round(acc_ideologies['correct'].mean()*100,2)) + '%')
    print('Variation in slant attributable to variation in ideology: ' + str(round((corr**2)*100,0)) + '%')
    print('Variation in slant attributable to noise: ' + str(round((1-corr**2)*100,0)) + '%')

    # save results
    acc_ideologies.to_csv(joint_dir+'jensen2012_est_acc_ideologies_'+str(n)+'grams.csv',index=False)

In [None]:
def jensen2012_ideology_estimate(domains, start, end, joint_dir, gdelt_dir, n):
    
    """
    Estimates ideologies of outlets with the adjusted approach of Jensen et al. (2012).
    The estimate of a domain is the frequency-weighted average of the n-gram coefficients ('beta'),
    where the frequency of an n-gram is the number of the domain's articles found for this n-gram
    divided by the number of all articles of the domain in the time frame.
    Each estimate is printed and stored in 'jensen2012_est_outlet_ideologies_<n>grams.csv' in joint_dir
    (one row per domain; a newer estimate replaces an older one of the same domain).
    The estimate is NaN if the domain has no articles or none of the n-grams was found.
    INPUTS:  domains:      LIST, domains
             start:        STRING, start date in yymmdd-format
             end:          STRING, end date in yymmdd-format
             joint_dir:    STRING, directory of event analysis
             gdelt_dir:    STRING, directory where GDELT-results are saved
             n:            INTEGER, 2 for bigrams; 3 for trigrams OR STRING: '2_3' for combination of bigrams and trigrams
    """
    
    # nothing to estimate
    if len(domains) == 0:
        return
    
    # import coefficient data once, it is the same for every domain
    betas = pd.read_csv(joint_dir+'jensen2012_betas.csv')
    sc = pd.read_csv(joint_dir+'selected_counts_'+str(n)+'grams.csv')
    betas['ngram'] = sc['ngrams']
    
    # define time frame (not called 'time' to keep the imported module usable), file of results, punctuation table
    timeframe = start + '_' + end
    results_file = joint_dir+'/jensen2012_est_outlet_ideologies_'+str(n)+'grams.csv'
    no_punctuation = str.maketrans('', '', string.punctuation)
    
    for domain in domains:
        domain_stripped = domain.split('.')[0]
        
        # compute number of all articles of selected domain in selected timeframe
        all_articles = pd.read_csv(gdelt_dir + 'domains/' + domain_stripped + '_' + timeframe + '.csv')
        total_articles = int((all_articles['domain']==domain).sum())
        
        # compute ideology; without any article no frequency can be computed
        num = 0.0
        denom = 0.0
        if total_articles > 0:
            for ngram, beta in zip(betas['ngram'], betas['beta']):
                ngram_cleaned = ngram.translate(no_punctuation).replace(' ','_')
                articles = pd.read_csv(gdelt_dir + timeframe + '/' + domain_stripped + '/' + ngram_cleaned + '.csv')
                selected_articles = int((articles['domain']==domain).sum())
                freq = selected_articles / total_articles
                num = num + (freq*beta)
                denom = denom + freq
        est_ideology = num / denom if denom != 0 else float('nan')
        print('Estimated ideology ' + domain + ': ' + str(round(est_ideology,2)))
        
        # save results, keeping only the latest estimate of each domain
        save = pd.DataFrame({'domain': [domain], 'ideology': [est_ideology]})
        if os.path.exists(results_file):
            save = pd.concat([pd.read_csv(results_file), save], ignore_index=True)
            save = save.drop_duplicates(subset='domain', keep='last')
        save.to_csv(results_file, index = False)

### 9. Finding Omitted Information

In [None]:
# classes of the div-elements holding the article text on cnn.com
_CNN_TEXT_CLASSES = ['cn__column cn__column--2up0', 
                     'el__leafmedia el__leafmedia--sourced-paragraph', 
                     'zn-body__paragraph', 
                     'Paragraph__component BasicArticle__paragraph BasicArticle__pad Paragraph__isDropCap',
                     'Paragraph__component BasicArticle__paragraph BasicArticle__pad',
                     'sc-bdVaJa post-content-rendered render-stellar-contentstyles__Content-sc-9v7nwy-0 erzhuK',
                     'Paragraph__component',
                     'article__content',
                     'sc-bdVaJa post-content-rendered render-stellar-contentstyles__Content-sc-9v7nwy-0 dUdYqp',
                     'cnnix__article__intro',
                     'SpecialArticle__headDescription',
                     'SpecialArticle__paragraph SpecialArticle__pad SpecialArticle__widthStandard SpecialArticle__isDropCap',
                     'SpecialArticle__paragraph SpecialArticle__pad SpecialArticle__widthStandard',
                     'sc-dnqmqq render-stellar-contentstyles__List-sc-9v7nwy-1 eUPcFX',
                     'sc-bdVaJa post-content-rendered render-stellar-contentstyles__Content-sc-9v7nwy-0 daEDKg',
                     'pg-special-article__head pg-special-article__head-',
                     'el-unfurled__caption',
                     'block-grid']

# div-class holding the p-elements of an article, for domains without fallback rules
_PARAGRAPH_DIV_CLASS = {'alternet': 'body-description', 'breitbart': 'entry-content'}


def _fetch_soup(url):
    """Downloads the page at url and returns it as parsed BeautifulSoup-document."""
    page = requests.get(url)
    return BeautifulSoup(page.content, 'html.parser')


def _paragraph_texts(elements):
    """Returns the texts of all p-elements inside the given elements, in order of appearance."""
    return [p.text for element in elements for p in element.find_all('p')]


def _flat_texts(elements):
    """Returns the full text of each given element with line breaks replaced by blanks."""
    return [element.text.replace('\n', ' ') for element in elements]


def _store_snippets(articles, row, snippets, t=0):
    """Writes the snippets to columns 'text<t>', 'text<t+1>', ... of the given row and returns the next free t."""
    for snippet in snippets:
        articles.loc[row,'text'+str(t)] = snippet
        t = t + 1
    return t


def scrape_articles(full_domain, gdelt_dir, start, end):
    
    """
    Scrape articles of given domains.
    Reads the articles found for the domain from gdelt_dir/domains/, downloads each url and stores the
    text snippets of an article in the columns 'text0', 'text1', ... of its row. Scraping rules exist for
    foxnews, cnn, alternet, usatoday and breitbart; for any other domain the article list is saved without texts.
    The result is saved in gdelt_dir/fulltexts/.
    INPUTS:  full_domain:  STRING, domain
             gdelt_dir:    STRING, directory where GDELT-results are saved
             start:        STRING, start date in yymmdd-format
             end:          STRING, end date in yymmdd-format
    """
    
    # define domain name
    domain = full_domain.split('.')[0]
    
    # import articles found on GDELT-API for given domain
    articles = pd.read_csv(gdelt_dir+'domains/'+domain+'_'+start+'_'+end+'.csv')
    articles = articles.loc[articles['domain']==full_domain].reset_index(drop=True)
    
    # foxnews
    if domain == 'foxnews':
        for i in range(len(articles)):
            print(i)
            soup = _fetch_soup(articles.loc[i,'url'])
            
            # find all p-elements in div 'article-body'
            elements = soup.find_all('div', {"class": ['article-body']})
            t = _store_snippets(articles, i, _paragraph_texts(elements))
            
            # find all div 'description', used for picture galleries
            if len(elements)==0:
                descriptions = _flat_texts(soup.find_all('div', {"itemprop": ['description']}))
                for text_p in descriptions:
                    print(text_p)
                _store_snippets(articles, i, descriptions, t)
    
    # cnn
    elif domain == 'cnn':
        articles = articles.loc[~articles['url'].str.contains('cnn-underscored')].reset_index(drop=True)
        for i in range(len(articles)):
            print(i)
            soup = _fetch_soup(articles.loc[i,'url'])
            
            # find all div-elements with selected classes
            elements = soup.find_all('div', {"class": _CNN_TEXT_CLASSES})
            _store_snippets(articles, i, _flat_texts(elements))
    
    # alternet, breitbart: all p-elements in one kind of div
    elif domain in _PARAGRAPH_DIV_CLASS:
        for i in range(len(articles)):
            soup = _fetch_soup(articles.loc[i,'url'])
            elements = soup.find_all('div', {"class": [_PARAGRAPH_DIV_CLASS[domain]]})
            _store_snippets(articles, i, _paragraph_texts(elements))
    
    # usatoday
    elif domain == 'usatoday':
        for i in range(len(articles)):
            # an article that cannot be downloaded or parsed is skipped; only regular errors are
            # caught, so the loop can still be interrupted from the keyboard
            try:
                soup = _fetch_soup(articles.loc[i,'url'])
                
                # find all p-elements in div 'truncationWrap'
                elements = soup.find_all('div', {"id": ['truncationWrap']})
                t = _store_snippets(articles, i, _paragraph_texts(elements))
                
                # for pictures/videos
                if len(elements)==0:
                    elements = soup.find_all('div', {"class": ['detail-text']})
                    t = _store_snippets(articles, i, _flat_texts(elements), t)
                
                # more div classes
                if len(elements)==0:
                    elements = soup.find_all('div', {"class": ['article-wrapper', 'article-inner theme-light', 'article-inner theme-dark']})
                    _store_snippets(articles, i, _paragraph_texts(elements), t)
            except Exception as err:
                print('Skipped article ' + str(i) + ': ' + repr(err))
                continue
    
    # save results
    articles.to_csv(gdelt_dir+'fulltexts/'+domain+'_'+start+'_'+end+'.csv',index=False)

In [None]:
def prep_omit(gdelt_dir, domain, start, end):
    
    """
    Prepares scraped article data.
    Collects the text snippets stored in the columns 'text0', 'text1', ... of the scraped articles
    (row by row, in the order of the column numbers), replaces non-breaking spaces and backslashes by blanks,
    and saves every distinct snippet once, in order of first appearance, in gdelt_dir/fulltexts/.
    All other columns of the file are ignored.
    INPUTS:  gdelt_dir:    STRING, directory where GDELT-results are saved
             domain:       STRING, domain
             start:        STRING, start date in yymmdd-format
             end:          STRING, end date in yymmdd-format
    """
    
    domain_stripped = domain.split('.')[0]
    
    # read article data; dtype=str keeps snippets consisting of a number as text
    articles = pd.read_csv(gdelt_dir+'fulltexts/'+domain_stripped+'_'+start+'_'+end+'.csv', dtype=str)
    
    # select the text columns by name, so that additional columns in the file do no harm
    text_columns = [col for col in articles.columns if col.startswith('text') and col[4:].isdigit()]
    text_columns.sort(key=lambda col: int(col[4:]))
    
    # collect text snippets
    text = []
    for row in articles[text_columns].to_numpy():
        for snippet in row:
            if not pd.isnull(snippet):
                text.append(snippet.replace(u'\xa0', u' ').replace(u'\\', u' '))
    
    # save unique text snippets (dict keeps the order of first appearance)
    text_unique = list(dict.fromkeys(text))
    pd.DataFrame(text_unique, columns=['text']).to_csv(gdelt_dir+'fulltexts/'+domain_stripped+'_unique'+'_'+start+'_'+end+'.csv',index=False)

In [None]:
def stem_articles(gdelt_dir, joint_dir, domain, start, end, a, b, c):
    
    """
    Cleans and stems scraped article data, and removes stop words.
    For every unique text snippet of the domain two versions are generated: a stemmed one without stop words
    (saved as '<domain>_stemmed.csv') and a cleaned one (lower case, without punctuation, stop words kept;
    saved as '<domain>_cleaned.csv'). Both files are written to joint_dir/omit/.
    Snippets that are missing or contain no words result in an empty entry in both files.
    INPUTS:  gdelt_dir:    STRING, directory where GDELT-results are saved
             joint_dir:    STRING, directory of event analysis
             domain:       STRING, domain
             start:        STRING, start date in yymmdd-format
             end:          STRING, end date in yymmdd-format
             a:            STRING, special character '’'
             b:            STRING, special character '”'
             c:            STRING, special character '“'             
    """
    
    domain_stripped = domain.split('.')[0]
    
    # import full text; dtype=str keeps snippets consisting of a number as text
    text = pd.read_csv(gdelt_dir+'fulltexts/'+domain_stripped+'_unique'+'_'+start+'_'+end+'.csv', dtype=str)
    text_save = text.copy()
    
    # initialize Porter stemmer
    stemmer = PorterStemmer()
    own_stopwords = ['cannot', 'gonna', 'gotta', 'im', 'ive', 'like', 'cant', 'whats', 'wanna', 'us', 'amp', 'lets', 'gimme', 'gimmee']
    
    # generate list of stop words
    stop_np = stop_words(own_stopwords, a)
    
    # translation table removing punctuation, built once for all words
    no_punctuation = str.maketrans('', '', string.punctuation)
    
    for t in range(len(text)):
        raw = text.loc[t,'text']
        
        # split, remove emojis, replace special characters; snippets read as missing value
        # (e.g. 'N/A') are treated like snippets without words
        if pd.isnull(raw):
            words = []
        else:
            words = raw.encode('ascii', 'ignore').decode('ascii').replace(a,"").replace(b,"").replace(c,"").split()
        
        if len(words)==0:
            text.loc[t,'text'] = np.nan
            text_save.loc[t,'text'] = np.nan
            continue
        
        # generate new cleaned and stemmed article text
        stemmed_words = []
        cleaned_words = []
        for word in words:
            bare = word.translate(no_punctuation).lower()
            cleaned_words.append(bare)
            # the word as found in the text is compared to the stop words
            if word not in stop_np:
                stemmed_words.append(stemmer.stem(bare))
        
        # every word is followed by a blank, also the last one
        text.loc[t,'text'] = ''.join(word + ' ' for word in stemmed_words)
        text_save.loc[t,'text'] = ''.join(word + ' ' for word in cleaned_words)
    
    # generate non-existent folder
    os.makedirs(joint_dir+'omit/', exist_ok=True)
    
    # save stemmed and cleaned text
    text.to_csv(joint_dir+'omit/'+domain_stripped+'_stemmed.csv', index=False)
    text_save.to_csv(joint_dir+'omit/'+domain_stripped+'_cleaned.csv', index=False)

In [None]:
def make_extreme(domains, joint_dir, extreme):
    
    """
    Combines the stemmed and the cleaned article texts of all domains belonging to one extreme.
    Reads '<domain>_stemmed.csv' and '<domain>_cleaned.csv' of every domain from joint_dir/omit/, stacks them
    in the order of the domains and saves the results as 'extreme<extreme>_stemmed.csv' and
    'extreme<extreme>_cleaned.csv' in the same folder.
    INPUTS:  domains:      LIST, list of domains belonging to one extreme
             joint_dir:    STRING, directory of event analysis
             extreme:      INTEGER, 0 or 1          
    """
    
    omit_dir = joint_dir+'omit/'
    
    for version in ('stemmed', 'cleaned'):
        
        # import data from domains
        frames = [pd.read_csv(omit_dir+domain.split('.')[0]+'_'+version+'.csv') for domain in domains]
        
        # stack data with a fresh index (DataFrame.append does not exist anymore in pandas 2.0)
        if len(frames)>0:
            combined = pd.concat(frames, ignore_index=True)
        else:
            combined = pd.DataFrame(columns=['text'])
        
        # save resulting dataframe
        combined.to_csv(omit_dir+'extreme'+str(extreme)+'_'+version+'.csv', index=False)

In [None]:
def compute_phrases_omit(joint_dir, topic, n):
    
    """
    Runs ngrams_to_phrases on the article texts of one topic.
    Loads '<topic>_cleaned.csv' and '<topic>_stemmed.csv' from joint_dir/omit/ and hands both dataframes,
    together with n, the omit-folder, the topic and the column name 'text', to ngrams_to_phrases.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             topic:        STRING, name of topic
             n:            INTEGER, 2 for bigrams; 3 for trigrams
    """
    
    # folder holding the texts of the topic
    omit_dir = joint_dir + 'omit/'
    
    # import cleaned and stemmed texts
    cleaned_texts = pd.read_csv(omit_dir + topic + '_cleaned.csv')
    stemmed_texts = pd.read_csv(omit_dir + topic + '_stemmed.csv')
    
    # call function ngrams_to_phrases
    ngrams_to_phrases(n, omit_dir, topic, cleaned_texts, stemmed_texts, 'text')

In [None]:
def chi_squared_omit(joint_dir, n):
    
    """
    Performs the chi2-test for both extremes.
    The n-gram counts returned by chi_squared are extended by the most frequent phrase of each n-gram
    ('phrase_freq_1'), by the second most frequent phrase if it is used more than 5 times ('phrase_freq_2'),
    and by the indicator 'most_likely', which is 1 if the n-gram is relatively more frequent in the texts of
    extreme 1 than in those of extreme 0. Phrases with 3 or fewer characters (blanks not counted) are not used.
    The result is saved as 'counts_<n>grams.csv' in joint_dir/omit/.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams
    """
    
    omit_dir = joint_dir +'omit/'
    
    # generate chi2-values
    counts = chi_squared('extreme0', 'extreme1', omit_dir, n)
    
    # import and combine phrases (DataFrame.append does not exist anymore in pandas 2.0)
    phrases0 = pd.read_csv(omit_dir + 'extreme0' + '_phrases_' + str(n) + 'grams.csv')
    phrases1 = pd.read_csv(omit_dir + 'extreme1' + '_phrases_' + str(n) + 'grams.csv')
    phrases = pd.concat([phrases0, phrases1]).reset_index(drop=True)
    phrases = phrases.rename(columns={'count': 'counts'})
    phrases = phrases.groupby(['ngram','phrase'])['counts'].sum().reset_index()
    phrases = phrases.sort_values(by='counts',ascending=False).reset_index(drop=True)
    
    # find phrases belonging to n-grams
    for i in range(len(counts)):
        ngram = counts.loc[i,'ngrams']
        phrases_x = phrases.loc[phrases['ngram']==ngram].sort_values(by='counts',ascending=False).reset_index(drop=True)
        
        # n-gram without any phrase
        if len(phrases_x)==0:
            continue
        
        # the second phrase is only considered if the first one is long enough
        if len(phrases_x.loc[0,'phrase'].replace(' ',''))<=3:
            continue
        counts.loc[i,'phrase_freq_1'] = phrases_x.loc[0,'phrase']
        
        # second phrase has to be used more than 5 times to be considered
        if len(phrases_x)>1 and phrases_x.loc[1,'counts']>5:
            if len(phrases_x.loc[1,'phrase'].replace(' ',''))>3:
                counts.loc[i,'phrase_freq_2'] = phrases_x.loc[1,'phrase']
    
    # import stemmed data
    stem0 = pd.read_csv(omit_dir + 'extreme0' + '_stemmed.csv')
    stem1 = pd.read_csv(omit_dir + 'extreme1' + '_stemmed.csv')
    
    # generate most_likely-indicator
    counts['most_likely'] = (counts['count_0']/len(stem0) < counts['count_1']/len(stem1)).astype(int)
    
    # save results
    counts.to_csv(omit_dir + 'counts_'+str(n)+'grams.csv',index=False)

In [None]:
def omit_exclude(joint_dir, n):
    
    """
    Excludes n-grams used to measure slant.
    The n-grams used to measure slant are always the bigrams in 'selected_counts_2grams.csv'.
    For n=2, n-grams identical to one of these bigrams are dropped. For n=3, punctuation is removed first
    and a trigram is dropped if its first two or its last two words form one of these bigrams.
    For any other n nothing is dropped. The remaining n-grams are saved as 'counts_<n>grams_final.csv'
    in joint_dir/omit/.
    INPUTS:  joint_dir:    STRING, directory of event analysis
             n:            INTEGER, 2 for bigrams; 3 for trigrams
    """
    
    # import n-grams sorted by chi2-values of articles
    counts = pd.read_csv(joint_dir +'omit/counts_'+str(n)+'grams.csv')
    
    # import n-grams used to measure slant
    sc = pd.read_csv(joint_dir + 'selected_counts_2grams.csv')
    
    # drop n-grams of counts that are in sc
    if n==2:
        in_slant = counts['ngrams'].isin(list(sc['ngrams']))
        counts = counts.loc[~in_slant]
    elif n==3:
        no_punctuation = str.maketrans('', '', string.punctuation)
        slant_bigrams = {ngram.translate(no_punctuation) for ngram in sc['ngrams']}
        
        def contains_slant_bigram(ngram):
            # n-grams with fewer than three words after cleaning only offer the pairs they have
            words = ngram.translate(no_punctuation).split()[:3]
            return any(first + ' ' + second in slant_bigrams for first, second in zip(words, words[1:]))
        
        in_slant = counts['ngrams'].map(contains_slant_bigram).astype(bool)
        counts = counts.loc[~in_slant]
    
    # save new counts-dataframe
    counts.to_csv(joint_dir +'omit/counts_'+str(n)+'grams_final.csv', index=False)