In [None]:
import sys
sys.path.append('../src') 
from political_ads.preprocessor import Preprocessor

# Build the project's preprocessor and load the aggregated ads file into `data`.
# NOTE(review): the path uses Windows-style backslashes, so this cell presumably
# only resolves on Windows — confirm before running on another platform.
preprocess = Preprocessor()
data = preprocess.file_to_df("..\\data\\all_politicians_aggregated_final.txt")

In [None]:
from political_ads.keyword_filter import Filter
import pandas as pd

# Keyword filter object from the project package.
# NOTE(review): the name `filter` shadows Python's built-in `filter()` for the
# rest of the kernel session — consider renaming once all usages are known.
filter = Filter()

# Presumably the climate-related subset of `data` — confirm against Filter.get_climate_ads.
climate_ads = filter.get_climate_ads(data)

In [None]:
# Fixed seed so the subsamples are identical after Restart & Run All.
SAMPLE_SEED = 42

# Small random subsets of `data` for quick experiments. The size is capped at
# len(data) because DataFrame.sample raises when n exceeds the population
# (sampling without replacement).
data_small = data.sample(n=min(1000, len(data)), random_state=SAMPLE_SEED)
data_mini = data.sample(n=min(100, len(data)), random_state=SAMPLE_SEED)

In [None]:
# Load a single labelled chunk (chunk 6) for manual inspection.
labeled = pd.read_csv("..\\data\\split_dataset\\no_duplicates_chunk_6_labelled.csv")

In [None]:
# Rows whose unique label equals "environment".
env = labeled[labeled["unique_label"] == "environment"]
# Display the ad text at positional row 5294 (hard-coded spot check).
labeled.iloc[5294]["ad_creative_body"]

## Merge the labeled chunks and add them back to the main dataset

In [None]:
# Number of labelled chunk files (no_duplicates_chunk_0 ... _6) to merge.
NUM_LABELLED_CHUNKS = 7

frames = []

for i in range(NUM_LABELLED_CHUNKS):
    # This cell only reads the already-labelled files; no classification runs here.
    print(f"Loading labelled chunk {i}")
    chunk = pd.read_csv(f"..\\data\\split_dataset\\no_duplicates_chunk_{i}_labelled.csv")
    frames.append(chunk)

# ignore_index gives the merged frame one continuous 0..n-1 index instead of
# repeating every chunk's own row numbers.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Persist the merged labelled chunks as a single CSV (row index is not written).
# Plain string literal: the path has no placeholders, so no f-string is needed.
df.to_csv("..\\data\\split_dataset\\merged_chunks_labelled.csv", index=False)

In [None]:
import pandas as pd
# Reload the merged labelled dataset; this rebinds `labeled`, replacing the
# single-chunk frame loaded earlier in the notebook.
labeled = pd.read_csv("..\\data\\split_dataset\\merged_chunks_labelled.csv")

In [None]:
# Index label of the first row whose ad text matches exactly.
# NOTE(review): this value is not displayed because it is not the cell's last
# expression, and .index[0] raises IndexError when no row matches.
labeled[labeled["ad_creative_body"] == "Hope to see everyone tonight!"].index[0]

# NOTE(review): 60636 is hard-coded — presumably the index found above; confirm.
labeled.iloc[60636]["unique_label"]

In [None]:
'''
Write method that adds the label to each row of the total advertisements dataframe data
'''
# import progress bar
from tqdm import tqdm
# instantiate tqdm
tqdm.pandas()

def add_label(row):
    """Return the label stored in `labeled` for one ad row.

    Args:
        row: A row (e.g. from ``DataFrame.apply(axis=1)``) with an
            "ad_creative_body" entry.

    Returns:
        The "unique_label" of the first row in the module-level `labeled`
        frame whose "ad_creative_body" equals the row's text, or None when
        no such row exists.
    """
    text = row["ad_creative_body"]
    matches = labeled.loc[labeled["ad_creative_body"] == text, "unique_label"]
    # Return a scalar, not the filtered Series: a Series result per row cannot
    # be assigned to a single DataFrame column.
    if matches.empty:
        return None
    return matches.iloc[0]

# Look up each ad's label by its text with one vectorised map instead of
# scanning `labeled` once per row. Texts that are missing, or that do not
# occur in `labeled`, get NaN.
label_by_text = (
    labeled.dropna(subset=["ad_creative_body"])
    .drop_duplicates(subset="ad_creative_body", keep="first")
    .set_index("ad_creative_body")["unique_label"]
)
data["zero_label"] = data["ad_creative_body"].map(label_by_text)


## Running Zero Shot Classification on Big Dataset

In [None]:
#%pip install transformers
#%pip install tensorflow
#%pip install torch

In [None]:
#Install huggingface transformers
#pip install transformers
# Use progress bar
#%pip install tqdm
from tqdm import tqdm
from transformers import pipeline

# Helper function: return the best two labels with their corresponding scores
def assign_labels(sentence, model, tags):
    """Classify one text and keep its top (at most two) labels.

    Args:
        sentence: Text to classify.
        model: Callable invoked as ``model(sentence, tags, multi_class=True)``
            that returns a mapping with parallel "labels" and "scores" lists.
        tags: Candidate labels handed to the model.

    Returns:
        A dict mapping up to the first two returned labels to their scores
        rounded to two decimals, or the string "not_classified" when the
        model call or result handling raises.
    """
    try:
        # The keyword was previously misspelled, so every call raised and was
        # silently turned into "not_classified".
        # NOTE(review): newer transformers releases name this option
        # `multi_label` — confirm against the installed version.
        results = model(sentence, tags, multi_class=True)
        # Slice instead of indexing [0] and [1] so a single candidate tag
        # still yields a result rather than an IndexError.
        best_pairs = zip(results["labels"][:2], results["scores"][:2])
        return {label: round(float(score), 2) for label, score in best_pairs}
    except Exception:
        # Best effort: one unclassifiable text must not abort a long run.
        return "not_classified"

# helper to get only the first label
def get_first_label(labels: dict, threshold: float = 0.85):
    """Return the first label of `labels` if its score reaches `threshold`.

    Only the first entry is inspected.

    Args:
        labels: Mapping of label -> score in the intended priority order.
            Anything without ``.items()`` (for example the string
            "not_classified") is treated as unclassified.
        threshold: Minimum score the first label needs. Defaults to 0.85.

    Returns:
        The first label when its score is >= `threshold`, otherwise the
        string "not_classified" (also for empty or malformed input).
    """
    try:
        top_label, top_score = next(iter(labels.items()))
        qualifies = top_score >= threshold
    except (AttributeError, StopIteration, TypeError, ValueError):
        # AttributeError: input is not a mapping; StopIteration: empty mapping;
        # TypeError/ValueError: score cannot be compared with the threshold.
        return "not_classified"
    return top_label if qualifies else "not_classified"

# main function that applies zero shot classification to the texts and adds labels & unique label columns
def generate_labels(input: pd.DataFrame, candidate_tags=None, classifier=None) -> pd.DataFrame:
    """Label every ad text in `input` with zero-shot classification.

    Args:
        input: Frame with an "ad_creative_body" column holding the texts.
        candidate_tags: Candidate labels for the classifier. Defaults to
            ["climate change"].
        classifier: Ready-made classifier callable. When omitted, a new
            ``pipeline("zero-shot-classification")`` is created on each call;
            pass one in to reuse a loaded model across calls.

    Returns:
        A copy of `input` with two added columns: "labels" (the result of
        `assign_labels` per text) and "unique_label" (the result of
        `get_first_label` per "labels" entry). `input` is not modified.
    """
    # Registers `progress_apply` on pandas objects.
    tqdm.pandas()
    if candidate_tags is None:
        candidate_tags = ["climate change"]
    if classifier is None:
        classifier = pipeline("zero-shot-classification")

    output = input.copy()
    # Classify each text with a tqdm progress bar.
    output["labels"] = output["ad_creative_body"].progress_apply(
        assign_labels, model=classifier, tags=list(candidate_tags)
    )
    output["unique_label"] = output["labels"].apply(get_first_label)
    return output

#mini_sample_en = generate_labels(mini_sample_en)

In [None]:
# %pip uninstall tensorflow
# %pip install tensorflow==2.2.0
# %pip uninstall transformers
# %pip install transformers==3.3.1

In [None]:
# Trial run of the labelling function on the 100-row sample.
test = generate_labels(data_mini)

In [None]:
# Drop rows of the small sample whose ad text repeats an earlier row, keeping
# the first occurrence. Rebinding the name avoids mutating the frame in place.
data_small = data_small.drop_duplicates(subset="ad_creative_body", keep="first")

In [None]:
# Full dataset with repeated ad texts removed (first occurrence kept); `data` itself is unchanged.
no_duplicates = data.drop_duplicates(subset="ad_creative_body", keep="first")

In [None]:
def split_dataframe(df, chunk_size = 10000):
    """Split a DataFrame into consecutive row chunks.

    Args:
        df: The DataFrame to split.
        chunk_size: Maximum number of rows per chunk. Defaults to 10000.

    Returns:
        A list of DataFrames in row order. Every chunk has `chunk_size`
        rows except possibly the last; no empty chunk is produced, so an
        empty `df` yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping over start offsets avoids the extra empty chunk that
    # `len(df) // chunk_size + 1` creates when the length divides evenly.
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

In [None]:
# Split the de-duplicated ads and write every chunk to its own numbered CSV.
chunks = split_dataframe(no_duplicates)

for i, part in enumerate(chunks):
    part.to_csv(f"..\\data\\split_dataset\\no_duplicates_chunk_{i}.csv", index=False)

In [None]:
# Label every chunk file in turn and store the result next to its source file.
for i in range(len(chunks)):
    print(f"Labeling Chunk {i}")
    source_csv = f"..\\data\\split_dataset\\no_duplicates_chunk_{i}.csv"
    target_csv = f"..\\data\\split_dataset\\no_duplicates_chunk_{i}_labelled.csv"
    # Read the chunk back from disk, classify it, then write the labelled copy.
    chunk = generate_labels(pd.read_csv(source_csv))
    chunk.to_csv(target_csv, index=False)
