# 01 02 Data and search query processing (Climate)
This script processes the ___climate___ search queries:
1. Data cleaning
2. Descriptive variables
3. Prepare data for manual coding

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import re
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
from collections import Counter
from statistics import mean, median
import nltk
from nltk.corpus import stopwords
from datetime import datetime
date = datetime.now().strftime('%d%m%Y')
import random



In [2]:
# setting paths
PATH = '/Users/marieke/SearchingForBias'

In [3]:
df = pd.read_csv(PATH+"/data/DigSocSurvey_26012021.tsv", delimiter='\t')
df.shape

(1994, 479)

### Background variables

In [4]:
cols = ["base_lft", "base_gender", "base_opl", "base_regio", "base_polar", "base_intpol", 
        "MVH_att_cc_1", "MVH_att_cc_2", "MVH_att_importance_2"]

In [5]:
df_bg = df[["ID"]+cols].copy()
df_bg.shape

(1994, 10)

In [6]:
att_cols = ["MVH_att_cc_1", "MVH_att_cc_2"]
for c in att_cols:
    print(df_bg[c].value_counts(dropna=False))

2    556
1    418
3    274
5    265
4    261
6    126
7     94
Name: MVH_att_cc_1, dtype: int64
4    468
6    397
5    374
7    300
3    185
2    161
1    109
Name: MVH_att_cc_2, dtype: int64


In [7]:
## climate attitudes
df_bg["MVH_att_cc_1"] = df_bg["MVH_att_cc_1"].map({1:7, 2:6, 3:5, 4:4, 5:3, 6:2, 7:1}) # reverse scale
df_bg["att_cc_mean"] = df_bg[att_cols].mean(axis=1) # mean score
df_bg["att_cc_mean"].describe()

count    1994.000000
mean        4.771063
std         1.516714
min         1.000000
25%         4.000000
50%         5.000000
75%         6.000000
max         7.000000
Name: att_cc_mean, dtype: float64

In [8]:
# issue attitudes
df_bg.MVH_att_importance_2.value_counts(dropna=False)

4    887
5    510
3    397
2    129
1     71
Name: MVH_att_importance_2, dtype: int64

In [9]:
# poltical orientation
df_bg.base_polar.value_counts(dropna=False) # 8 respondents missing
# set 11 (I don't want to say) to missing.
df_bg['base_polar'] = df_bg['base_polar'].replace(11, np.nan) # now 149 respondents missing.

In [10]:
# politiical interest
df_bg.base_intpol.value_counts(dropna=False)
# 13 respondents missing.

6.0     648
7.0     275
5.0     193
3.0     155
0.0     149
9.0     133
4.0     120
2.0     119
1.0     104
10.0     85
NaN      13
Name: base_intpol, dtype: int64

In [11]:
# education
opl = {1:1,
       2:1,
       3:2,
       4:2,
       5:2,
       6:3,
       7:3,
      }

df_bg['opl_3cat'] = df_bg['base_opl'].replace(opl)

In [12]:
df_bg.opl_3cat.value_counts(dropna=False)

2    1036
3     792
1     166
Name: opl_3cat, dtype: int64

In [13]:
# gender
df_bg.base_gender.value_counts(dropna=False)

1    1004
2     985
3       5
Name: base_gender, dtype: int64

In [14]:
# age
#df_bg.base_lft.value_counts(dropna=False)

In [15]:
df_bg.to_pickle(PATH+'/data/climate/01_df_bg.pkl')

### Search queries

In [16]:
sq_cols = ["MVH_search_cc_1", "MVH_search_cc_2", "MVH_search_cc_3"]

In [17]:
df = df[["ID"]+sq_cols].copy()
df.set_index('ID', inplace=True)

In [18]:
df.head()

Unnamed: 0_level_0,MVH_search_cc_1,MVH_search_cc_2,MVH_search_cc_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,boeit,mij,niet
2,klimaat,klimaatverandering,wereldwijd klimaat
3,Climate change,What causes the global warming?,Klimaatverandering
4,Global warming,C02,Greenhouse gases
5,klimaatbeleid Nederland,CO2 uitstoot,Klimaat


In [19]:
# data cleaning functions
def lower_punc(x):
    #print(x.dtypes)
    x = re.sub(r'[^\w\s]|_', '', x.lower()).strip()
    return x

EXCLUDE = ["", "geen", "nvt", "niets", "geen idee", "niks", "weet niet", "niet", "idem"]
def remove_exclude(x):
    x = ' '.join([x for x in x.lower().split() if x not in EXCLUDE])
    return x

def remove_stopwords(x):
    x = [w for w in x.lower() if w not in stopwords.words('dutch')]
    return x

In [20]:
cols_lp = [str(c)+"_lp" for c in df.columns]

In [21]:
# lowercase + punctuation removal
for old, new in zip(df.columns, cols_lp):
    df[new] = df[old].apply(lower_punc)

In [22]:
def get_nwords(x):
    count = len(x.split())
    return count

def get_length(x):
    count = len(x)
    return count

In [23]:
# number of words
c_nwords = []
for c in cols_lp:
    name = str(c)+"_nwords"
    df[name] = df[c].apply(get_nwords)
    c_nwords.append(name)
    
df['im_nwords_min'] = df[c_nwords].min(axis=1)
df['im_nwords_max'] = df[c_nwords].max(axis=1)
df['im_nwords_mean'] = df[c_nwords].mean(axis=1)

In [24]:
# length of search query
c_len = []
for c in cols_lp:
    name = str(c)+"_len"
    df[name] = df[c].apply(get_length)
    c_len.append(name)
    
df['im_len_min'] = df[c_len].min(axis=1)
df['im_len_max'] = df[c_len].max(axis=1)
df['im_len_mean'] = df[c_len].mean(axis=1)

In [25]:
df['im_nwords_mean'].describe()

count    1994.000000
mean        2.151789
std         1.485289
min         0.000000
25%         1.333333
50%         1.666667
75%         2.666667
max        14.000000
Name: im_nwords_mean, dtype: float64

In [26]:
df['total'] = df['MVH_search_cc_1_lp']+" "+df['MVH_search_cc_2_lp']+" "+df['MVH_search_cc_3_lp']

In [27]:
df.to_pickle(PATH+'/data/climate/02_df_sq_vars.pkl')

### Prepare search query list for manual annotation.

In [28]:
# list of all queries and list of all words
all_sq = [item for sublist in df[cols_lp].values.tolist() for item in sublist]
u_sq = set(all_sq)
all_words = [item for sublist in all_sq for item in sublist.split()]
u_words = set(all_words)

In [29]:
len(u_sq)

2767

In [30]:
# how many search queries total?
len(all_sq) # 1994 x 3

5982

In [31]:
def fuzzy_matching(sq_list, threshold=100): 
    '''Takes a list of words. Fuzzy matches search queries
    (fuzz.token_sort_ratio) with similarity above treshold (default=100) and creates (a) a list of 
    unique search queries and (b) a dict including the search query matches (key = search query in annotation list, 
    values = matches not included in list)'''
    
    # Note. This method is considerably slow when using python default SequenceMatcher.
    # Install python-Levenshtein to speed things up (though install issues on macs). 
    
    sq_list = [w for w in set(sq_list) if w not in EXCLUDE]
    
    to_annotate = sq_list.copy()
    keys=[]
    vals=[]
    for i, element in enumerate(tqdm(sq_list)):
        if not any(element in sublist for sublist in vals):
            v_list = []
            for choice in sq_list[i+1:]:
                if fuzz.token_sort_ratio(element, choice)>=threshold:
                    v_list.append(choice)
      
            if v_list:
                vals.append(v_list)
                keys.append(element)
    lookup = dict(zip(keys,vals))
    
    remove = [x for v in lookup.values() for x in v]
    to_annotate = [x for x in to_annotate if x not in remove]
    
    print('Number of unique search queries to annotate:', len(to_annotate),
         '\nNumber of doubles removed:', len(remove))
    return to_annotate, lookup

In [32]:
unique, matches = fuzzy_matching(list(u_sq))

100%|██████████| 2758/2758 [06:23<00:00,  7.20it/s] 

Number of unique search queries to annotate: 2671 
Number of doubles removed: 87





In [33]:
random.seed(1235)
random.shuffle(unique)

In [35]:
# save to csv
pd.DataFrame(unique, columns=["search query"]).to_csv(PATH+f'/data/climate/manual_coding/sq_to_annotate_{date}.csv', sep=";", index=False)

In [36]:
#store matches
with open(PATH+f'/data/climate/manual_coding/sq_matches_{date}.txt', 'w') as outfile:
    json.dump(matches, outfile)

In [37]:
matches

{'overheid klimaat': ['klimaat overheid'],
 'zou ik': ['ik zou'],
 'klimaat status': ['status klimaat'],
 'klimaat geschiedenis': ['geschiedenis klimaat'],
 'verandering klimaat': ['klimaat verandering'],
 'klimaat actuele situatie': ['actuele situatie klimaat'],
 'klimaat nos': ['nos klimaat'],
 'knmi klimaat': ['klimaat knmi'],
 'kabinet maatregelen klimaat': ['maatregelen kabinet klimaat'],
 'verbetering klimaat': ['klimaat verbetering'],
 'mens en klimaat': ['klimaat en mens'],
 'nederland luchtvervuiling': ['luchtvervuiling nederland'],
 'klimaat europa': ['europa klimaat'],
 'oorzaak klimaatverandering': ['klimaatverandering oorzaak'],
 'overleg klimaat': ['klimaat overleg'],
 'klimaat stand van zaken': ['stand van zaken klimaat'],
 'klimaat  nl': ['klimaat nl'],
 'klimaat en milieu': ['milieu en klimaat'],
 'woning verduurzamen': ['verduurzamen woning'],
 'maatregelen klimaat': ['klimaat maatregelen'],
 'klimaatverandering aanpak': ['aanpak klimaatverandering'],
 'europa klimaat