In [23]:
import pandas as pd
import os
import numpy as np
import pycountry
from html import unescape
import re
import nltk
from nltk.corpus import stopwords
import pickle
from collections import Counter, defaultdict
from scipy.spatial.distance import cosine

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
import glob

## Setup for looping

In [5]:
def number_to_regime(number):
    """Translate an Economist Intelligence Unit score into its regime label.

    Bands are checked from the highest downwards, each with an exclusive
    lower bound: above 8 is 'Full democracy', above 6 'Flawed democracy',
    above 4 'Hybrid regime'. Anything else that is at least 0 is
    'Authoritarian regime'.

    Raises:
        ValueError: if the score matches none of the bands (for example a
            negative score).
    """
    bands = (
        (8, 'Full democracy'),
        (6, 'Flawed democracy'),
        (4, 'Hybrid regime'),
    )
    for lower_bound, label in bands:
        if number > lower_bound:
            return label

    # The lowest band includes zero itself.
    if number >= 0:
        return 'Authoritarian regime'

    raise ValueError('Data has negative values')

In [6]:
# Informal names and abbreviations to search for on top of pycountry's official names.
# The order is kept as-is because the combined list is iterated in order downstream.
countries_additional = [
    'South Korea', 'North Korea', 'Korea', 'UAE', 'Bonaire', 'Bosnia',
    "Cote D'ivoire", 'Falkland Islands', 'Micronesia', 'Iran', 'Laos',
    'Moldova', 'Palestine', 'Syria', 'Taiwan', 'Tanzania', 'USA',
    'United States of America', 'US', 'Vatican', 'Vietnam', 'America',
]

# Official names as exposed by pycountry
country_list = [country.name for country in pycountry.countries]

# Full search list: the hand-written aliases first, then the official names
countries_all = [*countries_additional, *country_list]

In [111]:
# Countries whose raw name in the spreadsheet gets two leading characters removed instead of one
deficient_mapping = ['Switzerland', 'Nepal']

# Load the democracy scores (an Excel workbook, one row per country)
democracies = pd.read_excel('democracy-dictatorship.xlsx')

# Clean the country names: drop the first character of every raw name
# (the first two when the raw name is listed in deficient_mapping).
# Iterating the column itself (rather than .unique()) keeps the result aligned
# with the rows even if a name appears more than once.
# NOTE(review): the membership test compares the *raw* name, prefix included, against
# the clean names in deficient_mapping — confirm this branch ever triggers as intended.
democracies['Country'] = [x[2:] if x in deficient_mapping else x[1:] for x in democracies.Country]

def get_countries(text, countries_additional=countries_additional):
    """Return every known country name that occurs in ``text``.

    ``text`` is converted with ``str()`` and each candidate name is checked
    with a plain substring test. Candidates are pycountry's official names
    followed by ``countries_additional``; matches are returned in that order.
    """
    haystack = str(text)
    official_names = [entry.name for entry in pycountry.countries]
    return [name for name in official_names + countries_additional if name in haystack]

In [112]:
# Aliases: names found in text (or pycountry's official names) -> the name used
# in the `democracies` table.
# NOTE(review): Dominica and the Dominican Republic are different countries, as are
# the Republic of the Congo and the DR Congo — confirm these two aliases are intentional.
mapping = {'UAE': 'United Arab Emirates',
           'Bosnia': 'Bosnia and Herzegovina',
           'USA': 'United States',
           'United States of America': 'United States',
           'US': 'United States',
           'America': 'United States',
           'Congo, The Democratic Republic of the': 'Democratic Republic of the Congo',
           'Congo': 'Democratic Republic of the Congo',
           'Czechia': 'Czech Republic',
           'Dominica': 'Dominican Republic',
           'Iran, Islamic Republic of': 'Iran',
           'Russian Federation': 'Russia',
           # pycountry spells the official name 'Viet Nam'; the lower-case variant
           # is kept so existing lookups keep working.
           'Viet Nam': 'Vietnam',
           'Viet nam': 'Vietnam',
           'Syrian Arab Republic': 'Syria'
          }

In [114]:
# Year column of `democracies` to read the scores from. It must be bound here:
# without it this cell raises NameError on a fresh kernel.
year = 2010

democracy_mapped = {}      # country name -> regime label
democracy_unmapped = []    # names with no score available
democracy_list = [x for x in democracies.Country]

# Loop over all the countries
for c in countries_all:

    # If the country is in the list, add it directly
    if c in democracy_list:
        democracy_mapped[c] = number_to_regime(
            democracies.loc[democracies.Country == c, year].values)

    # If the country is in the additional mapping key, add it by changing the key
    elif c in mapping.keys():
        map_country = mapping[c]
        democracy_mapped[c] = number_to_regime(
            democracies.loc[democracies.Country == map_country, year].values)

    # Else say this is unmapped and will not be used in future analysis
    else:
        democracy_unmapped.append(c)


In [115]:
def get_democracy_mapping(year=2010):
    """Build a ``{country name: regime label}`` lookup for one year.

    Every name in ``countries_all`` is resolved against the ``democracies``
    table, either directly or through the ``mapping`` aliases. The score in
    column ``year`` is converted with ``number_to_regime``. Names that resolve
    neither way are left out of the result.

    Args:
        year: column label of ``democracies`` holding the scores to use.

    Returns:
        dict mapping each resolvable name from ``countries_all`` to its regime label.
    """
    known_names = list(democracies.Country)
    regime_by_country = {}

    for name in countries_all:
        # Resolve the name used in the table: itself, or its alias target.
        if name in known_names:
            table_name = name
        elif name in mapping:
            table_name = mapping[name]
        else:
            # No score available for this name; skip it.
            continue

        scores = democracies.loc[democracies.Country == table_name, year].values
        regime_by_country[name] = number_to_regime(scores)

    return regime_by_country

In [130]:
def unit_test_democracy():
    """Check that at least one country's regime differs between 2010 and 2015.

    The two yearly mappings are compared key by key, so the check does not
    depend on both dictionaries listing their countries in the same order.

    Raises:
        AssertionError: if no country present in both mappings changed regime.
    """
    # Get data for 2010 and 2015
    regimes_2010 = get_democracy_mapping(2010)
    regimes_2015 = get_democracy_mapping(2015)

    # Countries present in both years whose label is not the same
    changed = [
        country
        for country, regime in regimes_2010.items()
        if country in regimes_2015 and regimes_2015[country] != regime
    ]

    assert len(changed) != 0, 'None of the regimes have changed for any of the countries'

In [131]:
# Run the sanity check; it raises AssertionError when no regime differs between 2010 and 2015
unit_test_democracy()

In [144]:
def map_to_regime_list(ls, democracy_mapped):
    """Look up the regime label of every country in ``ls``.

    Countries missing from ``democracy_mapped`` are given the string 'None'.
    The returned list has the same length and order as ``ls``.
    """
    regimes = []
    for country in ls:
        regimes.append(democracy_mapped.get(country, 'None'))
    return regimes


def regime_list_to_category(ls):
    """Collapse a list of regime labels into a single category.

    Args:
        ls: list of regime labels (one per country mentioned).

    Returns:
        - the label itself when every entry is the same regime;
        - 'Democracy and Authoritarianism' when the only two distinct labels
          are 'Full democracy' and 'Authoritarian regime';
        - 'Other' for every other combination, including any other pair of
          labels and an empty list.
    """
    # Distinct regimes mentioned
    uniques = np.unique(ls)

    # Only one regime: it is the category
    if len(uniques) == 1:
        return uniques[0]

    # Exactly the democracy / authoritarian pair
    if len(uniques) == 2 and ('Full democracy' in uniques) and ('Authoritarian regime' in uniques):
        return 'Democracy and Authoritarianism'

    # Everything else. Any other pair of regimes used to fall through here
    # and return None implicitly.
    return 'Other'

In [29]:
# Get stopwords (one per line). The file is opened in a `with` block so the
# handle is closed as soon as the lines are read.
# NOTE: this rebinds the name `stopwords`, replacing the `nltk.corpus.stopwords`
# object imported at the top of the notebook.
with open(f'.{os.sep}supplementary_files{os.sep}stopwords.txt', 'r') as stopwords_file:
    stopwords = {line.replace('\n', '') for line in stopwords_file.readlines()}

# Get a dictionary that maps any country to the word 'country'
country_map_dict = {k:'country' for k in countries_all}


In [57]:
def get_country_replacement(txt, countries_all=countries_all, replace_word='country'):
    """Return ``txt`` with every country name swapped for ``replace_word``.

    Names are processed in the order of ``countries_all``. Each replacement
    works on the result of the previous ones, using plain substring
    replacement.
    """
    replaced = txt
    for name in countries_all:
        if name not in replaced:
            continue
        replaced = replaced.replace(name, replace_word)
    return replaced

def remove_URL(sample):
    """Strip URLs from ``sample``.

    A URL is any run of non-whitespace characters starting with 'http';
    surrounding whitespace is left in place.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sample)


# Define function to clean and split text
def remove_symbols(text):
    """Split ``text`` into alphanumeric-only tokens, skipping stopwords.

    HTML entities are unescaped first. The text is then split on whitespace,
    words found in ``stopwords`` (compared as written) are skipped, every
    character outside a-z, A-Z and 0-9 is removed, and tokens left empty
    are dropped.
    """
    decoded = unescape(text)
    tokens = []
    for word in decoded.strip().split():
        if word in stopwords:
            continue
        stripped = re.sub('[^a-zA-Z0-9]', '', word)
        if stripped != '':
            tokens.append(stripped)
    return tokens

In [61]:
def lower_and_rm_stopwords(text):
    """Lowercase every token and drop empty strings and stopwords.

    Args:
        text: iterable of string tokens.

    Returns:
        list of lowercased tokens. A token is dropped when it is empty or when
        either its original or its lowercased form is in ``stopwords``, so a
        capitalised stopword such as 'The' is removed as well.
    """
    kept = []
    for token in text:
        if token == '' or token in stopwords:
            continue
        lowered = token.lower()
        # Re-check after lowercasing: the first test is case-sensitive.
        if lowered in stopwords:
            continue
        kept.append(lowered)
    return kept


def clean(text):
    """Turn a raw text into a list of cleaned tokens.

    Steps, applied in order: replace country names, strip URLs, tokenise and
    remove symbols, then lowercase and remove stopwords.
    """
    return lower_and_rm_stopwords(
        remove_symbols(
            remove_URL(
                get_country_replacement(text)
            )
        )
    )

## Getting all the files

Re-do the file for:
- 2012-04
- .\data_processed\politics_2012-07.csv

In [86]:
from tqdm import tqdm

In [87]:
# Directory (relative to the working directory) that the processed CSV files are written to
processed_dir = 'data_processed'

In [90]:
# Load one bz2-compressed CSV into `df`.
# NOTE(review): `file` is not assigned by any earlier cell; it is only bound as the loop
# variable of the processing loops — confirm this cell is still needed.
df = pd.read_csv(file, compression = 'bz2', low_memory=False)

In [None]:
# Load the 2012-04 dump on its own, passing error_bad_lines=False to read_csv.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of on_bad_lines —
# confirm which pandas version this notebook targets before re-running.
df = pd.read_csv('./data/politics_2012-04.bz2', compression='bz2', low_memory=False, error_bad_lines=False)

In [None]:
# Process every compressed dump: tag the countries mentioned in each post, map
# them to regimes for the dump's year, tokenise the text and write one CSV per dump.
for file in tqdm(glob.glob('./data/*.bz2')):

    # Derive the output name and the year from the file name itself instead of
    # fixed character offsets, so the loop does not depend on the directory name length.
    filename = os.path.splitext(os.path.basename(file))[0]
    year = int(re.search(r'\d{4}', filename).group())
    directory_to_save = f'.{os.sep}{processed_dir}{os.sep}{filename}.csv'
    print(directory_to_save)

    # Regime of every country for that year
    democracy_mapped = get_democracy_mapping(year)

    # Read file
    df = pd.read_csv(file, compression = 'bz2', low_memory=False)

    # Get the country data
    df['countries'] = df.body.map(get_countries)

    # Keep only the posts that mention at least one country; copy so the new
    # columns below are added to an independent frame.
    mentions_country = df['countries'].map(bool)
    df_countries = df.loc[mentions_country, ['body', 'countries', 'score']].copy()

    # Get the regimes
    df_countries['regimes'] = df_countries['countries'].apply(
        lambda ls: map_to_regime_list(ls, democracy_mapped))

    # Get the outcome variable
    df_countries['outcome'] = df_countries['regimes'].map(regime_list_to_category)

    # Get the tokens
    df_countries['tokens'] = df_countries.body.map(clean)

    # Export to csv
    df_countries.to_csv(directory_to_save, index=False)

In [193]:
# Same pipeline as the full run, restarted from index 31 of the glob listing.
# NOTE(review): glob.glob does not guarantee an ordering, so the [31:] offset
# presumably relies on the listing order seen in the original session — verify before re-running.
for file in glob.glob('./data/*.bz2')[31:]:

    # Derive the output name and the year from the file name itself instead of
    # fixed character offsets, so the loop does not depend on the directory name length.
    filename = os.path.splitext(os.path.basename(file))[0]
    year = int(re.search(r'\d{4}', filename).group())
    directory_to_save = f'.{os.sep}{processed_dir}{os.sep}{filename}.csv'
    print(directory_to_save)

    # Regime of every country for that year
    democracy_mapped = get_democracy_mapping(year)

    # Read file; if the default parser fails, retry with an explicit '\n' line terminator.
    # Only ordinary exceptions trigger the retry, so Ctrl-C still interrupts the run.
    try:
        df = pd.read_csv(file, compression = 'bz2', low_memory=False)
    except Exception:
        df = pd.read_csv(file, compression = 'bz2', low_memory=False, lineterminator='\n')

    # Get the country data
    df['countries'] = df.body.map(get_countries)

    # Keep only the posts that mention at least one country; copy so the new
    # columns below are added to an independent frame.
    mentions_country = df['countries'].map(bool)
    df_countries = df.loc[mentions_country, ['body', 'countries', 'score']].copy()

    # Get the regimes
    df_countries['regimes'] = df_countries['countries'].apply(
        lambda ls: map_to_regime_list(ls, democracy_mapped))

    # Get the outcome variable
    df_countries['outcome'] = df_countries['regimes'].map(regime_list_to_category)

    # Get the tokens
    df_countries['tokens'] = df_countries.body.map(clean)

    # Export to csv
    df_countries.to_csv(directory_to_save, index=False)

.\data_processed\politics_2012-08.csv
.\data_processed\politics_2012-09.csv
.\data_processed\politics_2012-10.csv
.\data_processed\politics_2012-11.csv
.\data_processed\politics_2012-12.csv
.\data_processed\politics_2013-01.csv
.\data_processed\politics_2013-02.csv
.\data_processed\politics_2013-03.csv
.\data_processed\politics_2013-04.csv
.\data_processed\politics_2013-05.csv
.\data_processed\politics_2013-06.csv
.\data_processed\politics_2013-07.csv
.\data_processed\politics_2013-08.csv
.\data_processed\politics_2013-09.csv
.\data_processed\politics_2013-10.csv
.\data_processed\politics_2013-11.csv
.\data_processed\politics_2013-12.csv
.\data_processed\politics_2014-01.csv
.\data_processed\politics_2014-02.csv
.\data_processed\politics_2014-03.csv
.\data_processed\politics_2014-04.csv
.\data_processed\politics_2014-05.csv
.\data_processed\politics_2014-06.csv
.\data_processed\politics_2014-07.csv
.\data_processed\politics_2014-08.csv
.\data_processed\politics_2014-09.csv
.\data_proce