In [10]:
import pandas as pd
import regex as re

# must be specified to be able to use methods of package
import sys
sys.path.append('../src') 
# import packages

from src.political_ads.Preprocessor import *
from src.political_ads.keyword_filter import *

### Comparing Zeroshot with the Keyword search approach


Below are the preprocessing methods applied for both data samples

The following section cleans, removes dubs from each sample and then compares the two: keyword vs. zeroshot

In [11]:
# preprocessing methods appplied to both samples

# Remove links
def clean_data(dataframe):
#replace URL of a text
    dataframe['ad_creative_body'] = dataframe['ad_creative_body'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')



# Remove emojis characters
def remove_emojis(data):
    try:
      emoj = re.compile("["
          u"\U0001F600-\U0001F64F"  # emoticons
          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
          u"\U0001F680-\U0001F6FF"  # transport & map symbols
          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
          u"\U00002500-\U00002BEF"  # chinese char
          u"\U00002702-\U000027B0"
          u"\U00002702-\U000027B0"
          u"\U000024C2-\U0001F251"
          u"\U0001f926-\U0001f937"
          u"\U00010000-\U0010ffff"
          u"\u2640-\u2642" 
          u"\u2600-\u2B55"
          u"\u200d"
          u"\u23cf"
          u"\u23e9"
          u"\u231a"
          u"\ufe0f"  # dingbats
          u"\u3030"
                        "]+", re.UNICODE)
      return re.sub(emoj, '', data)
    except:
      return data

def remove_punctuations(text):
    return re.sub('[^A-Za-z0-9]+', ' ', text)
  
  #Consider stemming and lemmatizing
def preprocess_df(input: pd.DataFrame) -> pd.DataFrame:
  """makes the df ready by lowercasing, removing emojis and numbers"""
  output = input.copy()
  # Lowercase the text
  output["ad_creative_body"] = output["ad_creative_body"].str.lower()

  # Apply emoji removal to df
  output["ad_creative_body"] = output["ad_creative_body"].map(remove_emojis)

  output["ad_creative_body"] = output["ad_creative_body"].map(remove_punctuations)
 
  # Remove all numbers from text
  #output['Title'] = output['Title'].str.replace('\d+', '')
  #output['Description'] = output['Description'].str.replace('\d+', '')

  return output


In [12]:
#Code to read from our full dataset:
preprocess = Preprocessor()
df = preprocess.file_to_df('/home/gustavgyrst/Desktop/Research Project/political-ad-api/data/all_politicians_aggregated.txt')
df.head(3)


Unnamed: 0,ad_creation_time,ad_creative_body,spend,impressions,delivery_by_region,demographic_distribution,page_id,page_name,bylines,id,spend_lo,spend_hi,impressions_lo,impressions_hi
0,2021-10-14,To show that you're part of our fight to prote...,1749.5,74999.5,"[{'percentage': '0.004793', 'region': 'Alabama...","[{'percentage': '0.010689', 'age': '18-24', 'g...",38471053686,Elizabeth Warren,"WARREN DEMOCRATS, INC.",195916456012328,1500,1999,70000,79999
1,2021-10-14,Nothing is spookier than the ongoing assault o...,149.5,27499.5,"[{'percentage': '0.005051', 'region': 'Alabama...","[{'percentage': '0.000629', 'age': '18-24', 'g...",38471053686,Elizabeth Warren,"WARREN DEMOCRATS, INC.",197833945796073,100,199,25000,29999
2,2021-10-14,To show that you're part of our fight to prote...,1749.5,162499.5,"[{'percentage': '0.004873', 'region': 'Alabama...","[{'percentage': '0.005081', 'age': '18-24', 'g...",38471053686,Elizabeth Warren,"WARREN DEMOCRATS, INC.",396978045241497,1500,1999,150000,174999


In [13]:
#getting the climate ads from the total raw dataset
c_filter = Filter()
df_climate = c_filter.get_climate_ads(df)


In [14]:
#df_climate_nodub = df_climate[df_climate['ad_creative_body'].duplicated()==False]

# Another approach / same result
df_climate_nodub = df_climate.drop_duplicates(subset='ad_creative_body', keep='last')

In [15]:
df_climate_nodub

Unnamed: 0,ad_creation_time,ad_creative_body,spend,impressions,delivery_by_region,demographic_distribution,page_id,page_name,bylines,id,spend_lo,spend_hi,impressions_lo,impressions_hi
4774,2020-02-10,Every kid in America should have the same acce...,49.5,499.5,"[{'percentage': '0.007273', 'region': 'Alabama...","[{'percentage': '0.407273', 'age': '25-34', 'g...",38471053686,Elizabeth Warren,"WARREN FOR PRESIDENT, INC.",1044291342615870,0,99,0,999
6458,2020-01-26,I’m proposing a tax on the top 0.1% — the Jeff...,49.5,499.5,,,38471053686,Elizabeth Warren,"WARREN FOR PRESIDENT, INC.",193650885156389,0,99,0,999
8230,2020-01-06,Elizabeth Warren is returning to New Hampshire...,549.5,32499.5,"[{'percentage': '0.001361', 'region': 'Connect...","[{'percentage': '0.107619', 'age': '45-54', 'g...",38471053686,Elizabeth Warren,"WARREN FOR PRESIDENT, INC.",1026661577694555,500,599,30000,34999
9682,2019-12-20,Climate scientists published new research sugg...,249.5,9499.5,"[{'percentage': '0.006624', 'region': 'Rhode I...","[{'percentage': '0.006735', 'age': '25-34', 'g...",38471053686,Elizabeth Warren,"WARREN FOR PRESIDENT, INC.",580884722482084,200,299,9000,9999
9683,2019-12-20,Climate scientists published new research sugg...,49.5,1499.5,"[{'percentage': '0.000701', 'region': 'Delawar...","[{'percentage': '0.000708', 'age': '65+', 'gen...",38471053686,Elizabeth Warren,"WARREN FOR PRESIDENT, INC.",2750300318326852,0,99,1000,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602149,2019-05-06,Climate change is one of the most pressing iss...,49.5,499.5,,,280704452367167,Jason Crow,Jason Crow for Congress,429519637623675,0,99,0,999
602154,2019-05-06,Climate change is one of the most pressing iss...,49.5,499.5,"[{'percentage': '1', 'region': 'Colorado'}]","[{'percentage': '0.090278', 'age': '45-54', 'g...",280704452367167,Jason Crow for Congress,Jason Crow for Congress,448985355867823,0,99,0,999
602157,2019-05-06,Climate change is one of the most pressing iss...,49.5,499.5,,,280704452367167,Jason Crow for Congress,Jason Crow for Congress,455789565194363,0,99,0,999
602162,2019-05-06,Climate change is one of the most pressing iss...,49.5,499.5,"[{'percentage': '1', 'region': 'Colorado'}]","[{'percentage': '0.026087', 'age': '45-54', 'g...",280704452367167,Jason Crow,Jason Crow for Congress,394531874725343,0,99,0,999


In [16]:
# apply this to the dataframe
clean_data(df_climate_nodub)

# apply it to df
keywords_cleaned = preprocess_df(df_climate_nodub)

  dataframe['ad_creative_body'] = dataframe['ad_creative_body'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['ad_creative_body'] = dataframe['ad_creative_body'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


In [17]:
#Adding label for use when merging the two datasets later down
keywords_cleaned['sample'] = 'climate_keywords'

In [18]:
keywords_cleaned = keywords_cleaned[['ad_creative_body', 'sample']]

In [19]:
keywords_cleaned

Unnamed: 0,ad_creative_body,sample
4774,every kid in america should have the same acce...,climate_keywords
6458,i m proposing a tax on the top 0 1 the jeff be...,climate_keywords
8230,elizabeth warren is returning to new hampshire...,climate_keywords
9682,climate scientists published new research sugg...,climate_keywords
9683,climate scientists published new research sugg...,climate_keywords
...,...,...
602149,climate change is one of the most pressing iss...,climate_keywords
602154,climate change is one of the most pressing iss...,climate_keywords
602157,climate change is one of the most pressing iss...,climate_keywords
602162,climate change is one of the most pressing iss...,climate_keywords


Same steps are applied to the zeroshot sample...

In [20]:
#zeroshot
zeroshot = pd.read_csv("unique_ads_preprocessed_labeled.csv")

In [21]:
#example of a dub
dub = zeroshot[zeroshot.ad_creative_body == 'conservatives it s time to act the attacks from the left are relentless but we don t have to standby and take the abuse we can unite with more than 2 3 million fellow conservatives to preserve and protect our great nation by joining amac amac the association of mature american citizens is the conservative alternative to aarp the fight for our freedoms and our cherished american values has never been more intense the great folks at amac believe america is a great country because america is a good country and that those three words that start our constitution we the people still mean something still matter so do the 2 3 million members who count on amac to fight for the things they care about their freedom their values amac also works hard to deliver real value to their members providing the best benefits discounts and services found in one place stand up and show your support for our great country by joining amac www amac us carter as you know we are up against a formidable army of liberal companies media celebrities lobbyists activists and politicians if we keep spending our hard earned money with aarp or do nothing then we cannot expect to win this fight we can counter the left if we come together on issues that are important to us a united voice is a strong voice and with your help we can move the needle back to the right being a part of a great organization like amac gives us the opportunity to make a difference amac offers 4 great membership options each one includes you and your spouse here are just a few of the member benefits waiting for you medicare insurance plans social security advisor under 65 health insurance prescription discount card discount dental and vision plans auto and home insurance roadside assistance discounts travel lodging rental cars restaurants financial planning join the fight join amac today click or call 844 887 2622']

In [22]:
#Dropping dubs - approxx 7000
zeroshot = zeroshot.drop_duplicates(subset='ad_creative_body', keep='last')

In [23]:
zeroshot

Unnamed: 0,ad_creative_body,impr_geo_agg,spend_agg,impr_agg,page_id,page_name,ad_creation_time,party,emotions,ANGRY,ANNOYED,AMUSED,AFRAID,HAPPY,INSPIRED,SAD,DONT_CARE,zero_shot_label,sent_length,language
0,it is not enough to defeat donald trump. we mu...,"{'Illinois': 2262508, 'Ohio': 2400000, 'Texas'...",803277.5,25442483.5,124955570892789,Bernie Sanders,2020-03-05,Democrat,"{'AFRAID': 0.09056519009021387, 'AMUSED': 0.16...",0.116733,0.135540,0.166387,0.090565,0.097104,0.184500,0.088420,0.120751,inequality,long,en
1,we just won a huge victory in the primary elec...,"{'South Dakota': 40245, 'South Carolina': 1640...",349932.5,24854683.0,1316372698453411,Alexandria Ocasio-Cortez,2020-09-11,Democrat,"{'AFRAID': 0.09745483939705116, 'AMUSED': 0.13...",0.134677,0.142094,0.130600,0.097455,0.103810,0.171917,0.099821,0.119625,not_classified,long,en
2,i’ve got a small favor to ask – can you add yo...,"{'Alaska': 58937, 'Pennsylvania': 692570, 'Nev...",271561.5,19366864.5,124955570892789,Bernie Sanders,2020-03-17,Democrat,"{'AFRAID': 0.07969887694789736, 'AMUSED': 0.15...",0.131562,0.158462,0.151987,0.079699,0.108307,0.165868,0.079153,0.124963,not_classified,long,en
3,the dnc just raised the debate donor threshold...,"{'Alabama': 139291, 'Pennsylvania': 799612, 'N...",616753.5,18443203.5,36872302227,Cory Booker,2019-07-25,Democrat,"{'AFRAID': 0.09375970552559433, 'AMUSED': 0.13...",0.133796,0.140846,0.131026,0.093760,0.110115,0.175223,0.099503,0.115731,not_classified,long,en
4,the enormous student debt crisis is the result...,"{'Alabama': 324699, 'Pennsylvania': 748758, 'N...",221751.5,18399401.5,38471053686,Elizabeth Warren,2020-02-04,Democrat,"{'AFRAID': 0.10524118111376297, 'AMUSED': 0.12...",0.139728,0.137710,0.124227,0.105241,0.105821,0.159556,0.111214,0.116504,not_classified,long,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57842,nancy pelosi's leading the charge to violate t...,{'Florida': 500},49.5,499.5,666196620070127,Maria Elvira Salazar,2021-10-21,Republican,"{'AFRAID': 0.11236538195107602, 'AMUSED': 0.13...",0.168018,0.161798,0.131983,0.112365,0.089497,0.129414,0.096356,0.110568,not_classified,long,en
57843,the consequences of war is that our daughters ...,{'Maine': 500},49.5,499.5,344010372303511,Angus King,2018-08-07,Independent,"{'AFRAID': 0.09867305237088528, 'AMUSED': 0.13...",0.125781,0.135670,0.134549,0.098673,0.098556,0.174744,0.115859,0.116168,not_classified,long,en
57844,nancy pelosi's impeachment hit-list keeps grow...,{'Nebraska': 500},49.5,499.5,109592402468562,Deb Fischer,2019-11-11,Republican,"{'AFRAID': 0.0816414466029903, 'AMUSED': 0.140...",0.141876,0.180827,0.140967,0.081641,0.099557,0.141513,0.090407,0.123211,not_classified,long,en
57845,nancy pelosi's agenda for congress does not re...,{'Texas': 500},49.5,499.5,1558042497785635,Beth Van Duyne,2020-04-20,Republican,"{'AFRAID': 0.0887159273868603, 'AMUSED': 0.144...",0.129105,0.155833,0.144826,0.088716,0.102707,0.162510,0.094649,0.121654,not_classified,long,en


In [24]:
# apply this to the dataframe
clean_data(zeroshot)

# apply it to df
zeroshot_cleaned = preprocess_df(zeroshot)

  dataframe['ad_creative_body'] = dataframe['ad_creative_body'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


In [25]:
zeroshot_cleaned

Unnamed: 0,ad_creative_body,impr_geo_agg,spend_agg,impr_agg,page_id,page_name,ad_creation_time,party,emotions,ANGRY,ANNOYED,AMUSED,AFRAID,HAPPY,INSPIRED,SAD,DONT_CARE,zero_shot_label,sent_length,language
0,it is not enough to defeat donald trump we mus...,"{'Illinois': 2262508, 'Ohio': 2400000, 'Texas'...",803277.5,25442483.5,124955570892789,Bernie Sanders,2020-03-05,Democrat,"{'AFRAID': 0.09056519009021387, 'AMUSED': 0.16...",0.116733,0.135540,0.166387,0.090565,0.097104,0.184500,0.088420,0.120751,inequality,long,en
1,we just won a huge victory in the primary elec...,"{'South Dakota': 40245, 'South Carolina': 1640...",349932.5,24854683.0,1316372698453411,Alexandria Ocasio-Cortez,2020-09-11,Democrat,"{'AFRAID': 0.09745483939705116, 'AMUSED': 0.13...",0.134677,0.142094,0.130600,0.097455,0.103810,0.171917,0.099821,0.119625,not_classified,long,en
2,i ve got a small favor to ask can you add your...,"{'Alaska': 58937, 'Pennsylvania': 692570, 'Nev...",271561.5,19366864.5,124955570892789,Bernie Sanders,2020-03-17,Democrat,"{'AFRAID': 0.07969887694789736, 'AMUSED': 0.15...",0.131562,0.158462,0.151987,0.079699,0.108307,0.165868,0.079153,0.124963,not_classified,long,en
3,the dnc just raised the debate donor threshold...,"{'Alabama': 139291, 'Pennsylvania': 799612, 'N...",616753.5,18443203.5,36872302227,Cory Booker,2019-07-25,Democrat,"{'AFRAID': 0.09375970552559433, 'AMUSED': 0.13...",0.133796,0.140846,0.131026,0.093760,0.110115,0.175223,0.099503,0.115731,not_classified,long,en
4,the enormous student debt crisis is the result...,"{'Alabama': 324699, 'Pennsylvania': 748758, 'N...",221751.5,18399401.5,38471053686,Elizabeth Warren,2020-02-04,Democrat,"{'AFRAID': 0.10524118111376297, 'AMUSED': 0.12...",0.139728,0.137710,0.124227,0.105241,0.105821,0.159556,0.111214,0.116504,not_classified,long,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57842,nancy pelosi s leading the charge to violate t...,{'Florida': 500},49.5,499.5,666196620070127,Maria Elvira Salazar,2021-10-21,Republican,"{'AFRAID': 0.11236538195107602, 'AMUSED': 0.13...",0.168018,0.161798,0.131983,0.112365,0.089497,0.129414,0.096356,0.110568,not_classified,long,en
57843,the consequences of war is that our daughters ...,{'Maine': 500},49.5,499.5,344010372303511,Angus King,2018-08-07,Independent,"{'AFRAID': 0.09867305237088528, 'AMUSED': 0.13...",0.125781,0.135670,0.134549,0.098673,0.098556,0.174744,0.115859,0.116168,not_classified,long,en
57844,nancy pelosi s impeachment hit list keeps grow...,{'Nebraska': 500},49.5,499.5,109592402468562,Deb Fischer,2019-11-11,Republican,"{'AFRAID': 0.0816414466029903, 'AMUSED': 0.140...",0.141876,0.180827,0.140967,0.081641,0.099557,0.141513,0.090407,0.123211,not_classified,long,en
57845,nancy pelosi s agenda for congress does not re...,{'Texas': 500},49.5,499.5,1558042497785635,Beth Van Duyne,2020-04-20,Republican,"{'AFRAID': 0.0887159273868603, 'AMUSED': 0.144...",0.129105,0.155833,0.144826,0.088716,0.102707,0.162510,0.094649,0.121654,not_classified,long,en


In [26]:
merge = pd.merge(zeroshot_cleaned, keywords_cleaned, on='ad_creative_body', how='left')

In [27]:
merge[merge['sample']=='climate_keywords']

Unnamed: 0,ad_creative_body,impr_geo_agg,spend_agg,impr_agg,page_id,page_name,ad_creation_time,party,emotions,ANGRY,...,AMUSED,AFRAID,HAPPY,INSPIRED,SAD,DONT_CARE,zero_shot_label,sent_length,language,sample
22,unbelievable after stacey abrams helped chang...,"{'Mississippi': 82272, 'Kansas': 59103, 'Tenne...",261879.5,10547980.5,109111900634787,Reverend Raphael Warnock,2021-09-23,Democrat,"{'AFRAID': 0.10363804180351888, 'AMUSED': 0.13...",0.127941,...,0.134265,0.103638,0.112651,0.166789,0.097768,0.116851,climate,long,en,climate_keywords
122,named the hill s top climate nerd businessman ...,"{'Delaware': 29, 'Oregon': 302, 'Missouri': 11...",187494.0,5362496.0,1934158976830122,Sean Casten for Congress,2020-10-22,Democrat,"{'AFRAID': 0.09520029463749527, 'AMUSED': 0.15...",0.113530,...,0.155492,0.095200,0.108481,0.174267,0.084025,0.142153,climate,long,en,climate_keywords
181,rep alexandria ocasio cortez and i just introd...,"{'Alabama': 21288, 'Pennsylvania': 162900, 'Ne...",60080.5,4325930.5,124955570892789,Bernie Sanders,2019-08-07,Democrat,"{'AFRAID': 0.13320879445363867, 'AMUSED': 0.12...",0.146349,...,0.121521,0.133209,0.098221,0.153596,0.097866,0.104601,climate,long,en,climate_keywords
251,take any of our big problems health care clima...,"{'Vermont': 8757, 'Oregon': 6916, 'Nebraska': ...",44892.5,3509993.0,38471053686,Elizabeth Warren,2019-11-12,Democrat,"{'AFRAID': 0.10195171367028856, 'AMUSED': 0.14...",0.125602,...,0.143505,0.101952,0.097674,0.173508,0.101257,0.116497,climate,long,en,climate_keywords
329,50 000 responses missing 11 59 p m tonight mi...,"{'Alaska': 5829, 'Oregon': 55139, 'Minnesota':...",105736.0,2879986.0,134498556891640,Val Demings,2021-10-27,Democrat,"{'AFRAID': 0.11732185281751054, 'AMUSED': 0.13...",0.149745,...,0.134380,0.117322,0.098848,0.146121,0.112531,0.103381,climate,long,en,climate_keywords
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51632,the climate crisis is one of the biggest chall...,"{'Texas': 2, 'Colorado': 497}",49.5,499.5,280704452367167,Jason Crow,2021-06-24,Democrat,"{'AFRAID': 0.10799956387941563, 'AMUSED': 0.13...",0.119724,...,0.131082,0.108000,0.111768,0.185126,0.091129,0.119118,climate,long,en,climate_keywords
51633,the climate crisis is one of the biggest chall...,"{'Texas': 2, 'Colorado': 497}",49.5,499.5,280704452367167,Jason Crow,2021-06-24,Democrat,"{'AFRAID': 0.10799956387941563, 'AMUSED': 0.13...",0.119724,...,0.131082,0.108000,0.111768,0.185126,0.091129,0.119118,climate,long,en,climate_keywords
51634,the climate crisis is one of the biggest chall...,"{'Texas': 2, 'Colorado': 497}",49.5,499.5,280704452367167,Jason Crow,2021-06-24,Democrat,"{'AFRAID': 0.10799956387941563, 'AMUSED': 0.13...",0.119724,...,0.131082,0.108000,0.111768,0.185126,0.091129,0.119118,climate,long,en,climate_keywords
51635,the climate crisis is one of the biggest chall...,"{'Texas': 2, 'Colorado': 497}",49.5,499.5,280704452367167,Jason Crow,2021-06-24,Democrat,"{'AFRAID': 0.10799956387941563, 'AMUSED': 0.13...",0.119724,...,0.131082,0.108000,0.111768,0.185126,0.091129,0.119118,climate,long,en,climate_keywords


In [28]:
merge_view = merge[['ad_creative_body', 'zero_shot_label', 'sample']]

In [29]:
merge_view.zero_shot_label.unique()

#zeroshot labels related to climate change:
climate_zeroshot_labels = ['climate', 'education', 'environment',
       'climate, economy', 'climate, immigration',
       'environment, education', 'climate, education',
       'climate, inequality',
       'environment, economy']

In [30]:
merge_view[merge_view['zero_shot_label'].isin(climate_zeroshot_labels) == True]

Unnamed: 0,ad_creative_body,zero_shot_label,sample
22,unbelievable after stacey abrams helped chang...,climate,climate_keywords
122,named the hill s top climate nerd businessman ...,climate,climate_keywords
163,rick scott subtracted 1 3 billion from public ...,education,
181,rep alexandria ocasio cortez and i just introd...,climate,climate_keywords
203,when it comes to progressive leadership it s ...,education,
...,...,...,...
51632,the climate crisis is one of the biggest chall...,climate,climate_keywords
51633,the climate crisis is one of the biggest chall...,climate,climate_keywords
51634,the climate crisis is one of the biggest chall...,climate,climate_keywords
51635,the climate crisis is one of the biggest chall...,climate,climate_keywords


zeroshot finds 1827 unique ad_creative_body texts with the climate labels applied

keyword finds 1339 unique ad_creative_body texts with related to climate change

527 of the 1827 zeroshot unique ad_creative_body texts are not found by the keywords approach

1330 of the 1339 keyword unique ad_creative_body texts are also in the zeroshot sample.
