In [1]:
import numpy as np
import pandas as pd
import re
import utils
import ast
import swifter

In [2]:
# Load the itemized segment table. CSV has no list type, so the "name_list" column
# arrives as the textual form of a Python list; every cell is parsed back into a real
# list with ast.literal_eval before the frame is indexed by its "id" column.
df_itemized = (
    pd.read_csv("xandr_segments_itemized.csv")
    .assign(name_list=lambda frame: frame["name_list"].swifter.apply(ast.literal_eval))
    .set_index("id")
)

Pandas Apply:   0%|          | 0/646024 [00:00<?, ?it/s]

In [3]:
# Load the EU country table. The code in this notebook reads its "strings",
# "identifiers", "codes" and "name" columns.
eu_countries = pd.read_json("eu_countries.json") # load and prepare eu countries

# Per-country pattern: for each row, the "strings" and "identifiers" entries are
# concatenated into one array and handed to utils.list_to_regex.
# NOTE(review): list_to_regex lives outside this notebook — presumably it builds a
# single alternation pattern from the given items; confirm against utils.
eu_countries["names_re"] = eu_countries.swifter.apply(lambda row: utils.list_to_regex(np.concatenate(row[["strings", "identifiers"]].values.flatten())), axis=1)
# Same construction over all rows at once: one pattern covering every country.
eu_names_re = utils.list_to_regex(np.concatenate(eu_countries[["strings", "identifiers"]].values.flatten()))
# Flat set of every code listed for any country in the table.
eu_codes = set(np.concatenate(eu_countries["codes"].values.flatten()))

Pandas Apply:   0%|          | 0/28 [00:00<?, ?it/s]

In [4]:
# Segments whose name matches any of the patterns below are dropped before the
# country matching runs.
# NOTE(review): travel-related words are presumably excluded because a country named
# in such a segment is a destination rather than the audience's location — confirm.
travel_word_list = ["travel", "departure", "destination",
                    "tourism", "tourist", "vacation", "holiday", "voyage", "expedia"]

eu_false_positives = [r"(furniture|nail) ?polish",  # country FPs
                      r"irish ?(whiskey|cream)",
                      r"speak(er|ing)",
                      # "language",
                      # "hispanic",
                      r"tour de france",
                      # The original pattern only knew the German spelling "joghurt" and
                      # therefore never matched the English "yogurt"/"yoghurt"; the old
                      # spelling is kept as an alternative so nothing stops matching.
                      r"greek (yogh?urt|joghurt)",

                      # Country code FPs
                      r"accuen",  # does market research and very little location-specifics. Thus many FPs
                      r"xaxisca",  # canadian source
                      r"xaxisapc",  # asia pacific source
                      r"tailtarget",  # mostly latAm focussed -> numerous es/pt FPs
                      ]

false_positive_re = utils.list_to_regex(travel_word_list + eu_false_positives)
# The decision depends on the "name" column only, so the predicate is applied to that
# column directly instead of building a full row Series for every record (axis=1).
filtered_false_positives = df_itemized[df_itemized["name"].swifter.apply(
    lambda name: not re.search(false_positive_re, name))]

Pandas Apply:   0%|          | 0/646024 [00:00<?, ?it/s]

In [5]:
def filter_eu_names(row):
    """Tell whether a segment row references any EU country.

    A row qualifies when the combined name pattern ``eu_names_re`` is found anywhere
    in ``row["name"]``, or when at least one element of ``row["name_list"]`` is
    exactly one of the codes in ``eu_codes``.

    Arguments:
        row -- row of a dataframe (anything indexable by "name" and "name_list")

    Returns:
        True if the row references an EU country, False otherwise
    """
    # A hit on the full name pattern is sufficient; the codes are only consulted
    # when the name itself did not match.
    if re.search(eu_names_re, row["name"]):
        return True
    matching_codes = eu_codes.intersection(row["name_list"])
    return bool(matching_codes)


# Per-country lookup arrays taken from eu_countries. They are positionally aligned:
# entry i of each array belongs to the same country.
country_names = eu_countries["name"].values
names_re_list = eu_countries["names_re"].values
# Codes are converted to sets so they can be intersected with a row's "name_list".
codes_set_list = eu_countries["codes"].map(set).values

def identify_eu_references(row):
    """Annotate a segment row with the EU countries it refers to.

    A country counts as referenced when its name pattern is found in ``row["name"]``
    or when one of its codes appears as an element of ``row["name_list"]``. The names
    of all referenced countries are stored as a list under ``row["countries"]``, in
    the order of ``country_names`` (an empty list when nothing matched).

    Arguments:
        row -- pandas.Series containing the row of a dataframe; modified in place

    Returns:
        the same row, now carrying the "countries" entry
    """
    segment_name = row["name"]
    # The three lookup sequences are aligned by position, so they are walked together.
    row["countries"] = [
        country
        for pattern, codes, country in zip(names_re_list, codes_set_list, country_names)
        if re.search(pattern, segment_name) or codes.intersection(row["name_list"])
    ]
    return row


# Run identify_eu_references on every remaining segment row. The function returns
# each row with an added "countries" entry, so the resulting frame gains a
# "countries" column holding the list of matched country names.
filtered_false_positives = filtered_false_positives.swifter.apply(identify_eu_references, axis=1)

Pandas Apply:   0%|          | 0/612022 [00:00<?, ?it/s]

In [6]:
# Keep only the segments for which at least one EU country was identified. The mask
# depends on the "countries" column alone, so the list lengths are taken directly from
# that column instead of materialising every row through a row-wise (axis=1) apply.
df_eu_reference = filtered_false_positives[filtered_false_positives["countries"].map(len) > 0]

Pandas Apply:   0%|          | 0/612022 [00:00<?, ?it/s]

In [7]:
# Persist the EU-referencing segments.
# NOTE(review): the list-valued columns are presumably written as their text form, so
# a reader has to parse them again (cf. the ast.literal_eval step used when loading
# the itemized CSV) — confirm when consuming this file.
df_eu_reference.to_csv("xandr_segments_eu.csv")

# Perform the same analysis for the USA

In [8]:
# Country codes; filter_us_reference requires one of them to equal an element of a
# row's "name_list".
us_codes = {"us", "usa"}

# Name fragments that filter_us_reference searches for anywhere in the segment name.
# NOTE(review): "america" presumably also hits names such as "latin america" or
# "south america" — confirm this breadth is intended.
us_regex = utils.list_to_regex(["united states", "america", "xaxisus"])

def filter_us_reference(row):
    """Tell whether a segment row references the USA.

    A row qualifies when ``us_regex`` is found anywhere in ``row["name"]``, or when
    at least one element of ``row["name_list"]`` is exactly one of the codes in
    ``us_codes``.

    Arguments:
        row -- row of a dataframe (anything indexable by "name" and "name_list")

    Returns:
        True if the row references the USA, False otherwise
    """
    # A hit on the name pattern is sufficient; the codes are only consulted when the
    # name itself did not match.
    if re.search(us_regex, row["name"]):
        return True
    matching_codes = us_codes.intersection(row["name_list"])
    return bool(matching_codes)


# Evaluate the USA predicate once per segment row, then keep the rows it accepts.
us_reference_mask = filtered_false_positives.swifter.apply(filter_us_reference, axis=1)
df_us_reference = filtered_false_positives[us_reference_mask]


Pandas Apply:   0%|          | 0/612022 [00:00<?, ?it/s]

In [9]:
# Persist the USA-referencing segments.
# NOTE(review): the list-valued columns are presumably written as their text form, so
# a reader has to parse them again (cf. the ast.literal_eval step used when loading
# the itemized CSV) — confirm when consuming this file.
df_us_reference.to_csv("xandr_segments_us.csv")