# Importing libraries

In [4]:
import os
import re
import pathlib

import pickle
import numpy as np
import pandas as pd

import spacy


from tqdm import tqdm
from collections import Counter

from sentence_transformers import SentenceTransformer
from sentence_transformers import util

import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
2023-03-16 14:34:59.244886: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 14:35:00.216232: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 14:35:00.216434: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: 

In [5]:
nlp = spacy.load("en_core_web_trf")

In [6]:
# from transformers import pipeline
# nlp = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english")

# Loading data

In [7]:
east_df = pd.read_csv("../data/structured/TEXT/east.csv", low_memory=False)
west_df = pd.read_csv("../data/structured/TEXT/west.csv", low_memory=False)
north_df = pd.read_csv("../data/structured/TEXT/north.csv", low_memory=False)
south_df = pd.read_csv("../data/structured/TEXT/south.csv", low_memory=False)
central_df = pd.read_csv("../data/structured/TEXT/central.csv", low_memory=False)

In [8]:
east_df.columns

Index(['Unnamed: 0', 'TEXT', 'PHONES', 'CITY', 'IMAGES', 'DEMO', 'VENDOR'], dtype='object')

# Loading model

In [6]:
model = SentenceTransformer("AnnaWegmann/Style-Embedding")

Downloading (…)95525/.gitattributes: 100%|██████████| 1.17k/1.17k [00:00<00:00, 138kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 19.5kB/s]
Downloading (…)ed27695525/README.md: 100%|██████████| 3.96k/3.96k [00:00<00:00, 867kB/s]
Downloading (…)27695525/config.json: 100%|██████████| 718/718 [00:00<00:00, 193kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 117/117 [00:00<00:00, 25.6kB/s]
Downloading (…)aluation_results.csv: 100%|██████████| 659/659 [00:00<00:00, 34.4kB/s]
Downloading (…)d27695525/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.13MB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 499M/499M [00:12<00:00, 39.7MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 6.35kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 45.0kB/s]
Downloading (…)95525/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.26MB/s]
Downloading (…)okenizer_config.json: 1

# Calculating similarity through existing style embedding models on raw data

In [4]:
def generate_heatmap(data_df, vendor_id):
    phones = set(data_df[data_df.VENDOR==vendor_id]['PHONES'].to_list())
    phones = [eval(phone) for phone in phones]
    phones = [item for sublist in phones for item in sublist]
    print("All phone numbers:", phones)
        
    sent_list = set(data_df[data_df.VENDOR==vendor_id]['TEXT'].to_list())
    outer_list = []
    for index1, sent1 in enumerate(sent_list):
        inner_list = []
        for index2, sent2 in enumerate(sent_list):
            emb1 = model.encode(sent1)
            emb2 = model.encode(sent2)
            inner_list.append(util.cos_sim(emb1, emb2).cpu().detach().numpy()[0][0])
        outer_list.append(inner_list)

    fig = px.imshow(outer_list, text_auto=True, aspect="auto")
    fig.show('iframe')

In [5]:
generate_heatmap(west_df, 63072)

All phone numbers: ['814-602-6807']


In [6]:
generate_heatmap(west_df, 63076)

All phone numbers: ['8145049661']


In [7]:
generate_heatmap(west_df, 63081)

All phone numbers: ['814 920 8940', '814 923 6153', '814 920 8940']


# Calculating similarity through existing style embedding models on processed data

In [9]:
extensionsToCheck = ('.com', '.co', '.in', '.net', '.to', '.org', '.us', '.edu', '.gov', '.int')
def anonymize_links(sent):
    # Removing all the links
    sent = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', "<LINK>", sent)
    sent = sent.split(" ")
    sent = ["<LINK>" if text.endswith(extensionsToCheck) else text for text in sent]
    return " ".join(sent)

def anonymize_age(sent):
    # Removing the age and post id from the advertisement
    ages = re.findall(r'age:\s*(\d+)', sent)
    if ages:
        for age in ages:
            # Removing age under the age: section
            sent = sent.replace("age: " + age, "age: <AGE>")
            # Removing age everywhere else
            sent = sent.replace(" " + age + " ", "<AGE>")
    return sent

def anonymize_postid(sent):
    # Removing the Post ID from the advertisements
    id_ = re.findall(r'Post ID:\s*(\d+)', sent)
    if id_:
        sent = sent.replace("Post ID: " + id_[0], "Post ID: <POST_ID>")
    return sent


num_order = r'[0-9]'
def find_location(data_df):
    location_dict = {}
    all_vendors = list(data_df.VENDOR.unique())
    pbar = tqdm(total=len(all_vendors))
    
    for vendor in all_vendors:
        location_list = []
        data = data_df[data_df.VENDOR == vendor]
        text = data.TEXT.to_list()
        for sent in text:
            doc = nlp(sent)
            for entity in doc.ents:
                # Checking for all Countries, cities, states, Non-GPE locations, mountain ranges, bodies of water, Buildings, airports, highways, bridges, etc
                if entity.label_ in {"GPE", "LOC", "FAC"}:
                    # Removing numbers from the string
                    location = re.sub(num_order, '', entity.text)
                    # Removing / , and . 
                    location = location.replace("/", "#").replace(",", "#").replace(".", "#").split("#")
                    for loc in location:
                        if len(loc) > 1 and loc not in location_list:
                            location_list.append(loc.strip())
        location_dict[vendor] = set(location_list)
        
        pbar.update(1)
    pbar.close()
    return location_dict

num_order = r'[0-9]'
def find_names(data_df):
    names_dict = {}
    all_vendors = list(data_df.VENDOR.unique())
    pbar = tqdm(total=len(all_vendors))
    
    for vendor in all_vendors:
        names_list = []
        data = data_df[data_df.VENDOR == vendor]
        text = data.TEXT.to_list()
        for sent in text:
            doc = nlp(sent)
            for entity in doc.ents:
                # Checking for all Countries, cities, states, Non-GPE locations, mountain ranges, bodies of water, Buildings, airports, highways, bridges, etc
                if entity.label_ in {"PERSON"}:
                    # Removing numbers from the string
                    names = re.sub(num_order, '', entity.text)
                    # Removing / , and . 
                    names = names.replace("/", "#").replace(",", "#").replace(".", "#").split("#")
                    for name in names:
                        # Removing all the wrongly identified names that have two or more tokens
                        if  0 < len(name.split(" ")) < 2 and name not in names_list:
                            # Removing all the special symbols from the names
                            name = re.sub("[^A-Z]", "", name, 0, re.IGNORECASE)
                            # Checking if the name is atleast 3 characters long 
                            if len(name) >= 3:
                                names_list.append(name.strip())
        if len(names_list) != 0:
            names_dict[vendor] = set(names_list)
        
        pbar.update(1)
    pbar.close()
    return names_dict

def find_email(data_df):
    email_dict = {}
    all_vendors = list(data_df.VENDOR.unique())
    pbar = tqdm(total=len(all_vendors))
    
    for vendor in all_vendors:
        email_list = []
        data = data_df[data_df.VENDOR == vendor]
        text = data.TEXT.to_list()
        for sent in text:
            sent = re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", sent)
            email_list.append(sent)
        
        email_list = [item for sublist in email_list for item in sublist]
        email_dict[vendor] = set(email_list)
        pbar.update(1)
    pbar.close()
        
    return email_dict

def anonymize_locations(sent, all_loc_dict):
    for location in all_loc_dict.keys():
        sent = re.sub(r'([^\w\s])', r' \1 ', sent)
        if " " + location + " " in sent:
            sent = sent.replace(" " + location + " ", " " + all_loc_dict[location] + " ")
    return sent

def anonymize_emails(sent, all_email_dict):
    for email in all_email_dict.keys():
        if email in sent:
            sent = sent.replace(email, all_email_dict[email])
    return sent

def anonymize_names(sent, all_name_dict):
    for name in all_name_dict.keys():
        if name in sent:
            
            sent = sent.replace(name, all_name_dict[name])
    return sent

def anonymize_numbers(sent):
    sent = ''.join(i if not i.isdigit() else "N" for i in sent)
    return sent

In [10]:
# Uncomment it to fetch a dictionary with anonymized names 
"""
for data_df in [north_df, south_df, west_df, east_df, central_df]:
    print("Demography:", list(data_df.DEMO.unique())[0])
    location_dict = find_email(data_df)
    
    with open(os.path.join(os.getcwd(), "../pickled/dictionaries",'email_dict_' + list(data_df.DEMO.unique())[0] + '.pickle'), 'wb') as handle:
        pickle.dump(location_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

'\nfor data_df in [north_df, south_df, west_df, east_df, central_df]:\n    print("Demography:", list(data_df.DEMO.unique())[0])\n    location_dict = find_email(data_df)\n    \n    with open(os.path.join(os.getcwd(), "../pickled/dictionaries",\'email_dict_\' + list(data_df.DEMO.unique())[0] + \'.pickle\'), \'wb\') as handle:\n        pickle.dump(location_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n'

In [11]:
# Loading pickled dictionary

with open('../pickled/dictionaries/loc_dict_east.pickle', 'rb') as handle:
    east_loc_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/loc_dict_west.pickle', 'rb') as handle:
    west_loc_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/loc_dict_north.pickle', 'rb') as handle:
    north_loc_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/loc_dict_south.pickle', 'rb') as handle:
    south_loc_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/loc_dict_central.pickle', 'rb') as handle:
    central_loc_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/names_dict_central.pickle', 'rb') as handle:
    central_name_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/names_dict_east.pickle', 'rb') as handle:
    east_name_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/names_dict_west.pickle', 'rb') as handle:
    west_name_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/names_dict_north.pickle', 'rb') as handle:
    north_name_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/names_dict_south.pickle', 'rb') as handle:
    south_name_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/email_dict_central.pickle', 'rb') as handle:
    central_email_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/email_dict_east.pickle', 'rb') as handle:
    east_email_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/email_dict_west.pickle', 'rb') as handle:
    west_email_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/email_dict_north.pickle', 'rb') as handle:
    north_email_dict = pickle.load(handle)
    
with open('../pickled/dictionaries/email_dict_south.pickle', 'rb') as handle:
    south_email_dict = pickle.load(handle)

In [12]:
# Getting IDs for all the names
names = list(set([item for sublist in list(east_name_dict.values()) for item in sublist])) + list(set([item for sublist in list(west_name_dict.values()) for item in sublist])) + list(set([item for sublist in list(north_name_dict.values()) for item in sublist])) + list(set([item for sublist in list(south_name_dict.values()) for item in sublist])) + list(set([item for sublist in list(central_name_dict.values()) for item in sublist]))
locs = list(set([item for sublist in list(east_loc_dict.values()) for item in sublist])) + list(set([item for sublist in list(west_loc_dict.values()) for item in sublist])) + list(set([item for sublist in list(north_loc_dict.values()) for item in sublist])) + list(set([item for sublist in list(south_loc_dict.values()) for item in sublist])) + list(set([item for sublist in list(central_loc_dict.values()) for item in sublist]))
emails = list(set([item for sublist in list(east_email_dict.values()) for item in sublist])) + list(set([item for sublist in list(west_email_dict.values()) for item in sublist])) + list(set([item for sublist in list(north_email_dict.values()) for item in sublist])) + list(set([item for sublist in list(south_email_dict.values()) for item in sublist])) + list(set([item for sublist in list(central_email_dict.values()) for item in sublist]))

In [13]:
location_df = pd.read_csv("../data/others/us_cities_states_counties.csv",  sep="|", on_bad_lines='skip')

locs =  set(location_df.City.to_list() + location_df['State full'].to_list())
locs = set([str(loc).lower() for loc in locs])
locs = sorted(locs, key=lambda x: (-len(x), x))

In [14]:
names_dict, loc_dict, email_dict = ({} for i in range(3))

for idx, name in enumerate(names):
    if len(name) >= 1:
        doc = nlp(name.lower())
        if doc[0].pos_ in {"PROPN"}:
            names_dict[name] = "<PERSON_" + str(idx) + ">"
    
for idx, loc in enumerate(locs):
    loc_dict[loc] = "<LOCATION_" + str(idx) + ">"
    
for idx, email in enumerate(emails):
    if len(email) >= 2:
        email_dict[email] = "<EMAIL_" + str(idx) + ">"

In [129]:
def process_data(data_df, all_loc_dict=loc_dict, all_name_dict=names_dict, all_email_dict=email_dict):
    data_list = []
    all_vendors = list(data_df.VENDOR.unique())
    pbar = tqdm(total=len(all_vendors))
    for vendor_id in all_vendors:
        data = data_df[data_df.VENDOR == vendor_id]
        # cleaning the text
        data.TEXT = data.TEXT.apply(lambda x: anonymize_age(x))
        data.TEXT = data.TEXT.apply(lambda x: anonymize_postid(x))
        data.TEXT = data.TEXT.apply(lambda x: anonymize_emails(x, all_email_dict))
        data.TEXT = data.TEXT.apply(lambda x: anonymize_links(x))
        data.TEXT = data.TEXT.apply(lambda x: anonymize_numbers(x))
        # data.TEXT = data.TEXT.apply(lambda x: anonymize_locations(x, all_loc_dict))
        # data.TEXT = data.TEXT.apply(lambda x: anonymize_names(x, all_name_dict))
        data.TEXT = data.TEXT.apply(lambda x: x.strip())
        data_list.append(data)
        pbar.update(1)
    pbar.close()
    return pd.concat(data_list)

In [None]:
north_df = process_data(north_df)
west_df = process_data(west_df)
south_df = process_data(south_df)
east_df = process_data(east_df)
central_df = process_data(central_df)

In [136]:
north_df.to_csv("../data/processed/TEXT/north.csv")
west_df.to_csv("../data/processed/TEXT/west.csv")
south_df.to_csv("../data/processed/TEXT/south.csv")
east_df.to_csv("../data/processed/TEXT/east.csv")
central_df.to_csv("../data/processed/TEXT/central.csv")

# Generating heatmaps

In [132]:
sample_df = process_data(west_df, 63072)
generate_heatmap(sample_df, 63072)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All phone numbers: ['814-602-6807']


In [133]:
sample_df = process_data(west_df, 63076)
generate_heatmap(sample_df, 63076)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All phone numbers: ['8145049661']


In [134]:
sample_df = process_data(west_df, 63081)
generate_heatmap(sample_df, 63081)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All phone numbers: ['814 923 6153', '814 920 8940', '814 920 8940']


# Computing similarity between advertisements of 2 vendors 

In [137]:
def compute_similarity_between_vendor_ads(data_df1, vendor_id1, data_df2, vendor_id2):
    sent_list1 = set(data_df1[data_df1.VENDOR==vendor_id1]['TEXT'].to_list())
    sent_list2 = set(data_df2[data_df2.VENDOR==vendor_id2]['TEXT'].to_list())
    outer_list = []
    for index1, sent1 in enumerate(sent_list):
        inner_list = []
        for index2, sent2 in enumerate(sent_list):
            emb1 = model.encode(sent1)
            emb2 = model.encode(sent2)
            inner_list.append(util.cos_sim(emb1, emb2).cpu().detach().numpy()[0][0])
        outer_list.append(inner_list)

    fig = px.imshow(outer_list, text_auto=True, aspect="auto")
    fig.show('iframe')

In [138]:
compute_similarity_between_vendor_ads(west_df, 63072, west_df, 63081)

In [139]:
compute_similarity_between_vendor_ads(west_df, 63081, west_df, 63076)

# Computing avg-similarity between advertisements of same vendors

In [4]:
def compute_similarity_for_vendors(data_df):
    vendor_dict = {}
    vendors = data_df.VENDOR.to_list()    
    
    pbar = tqdm(total=len(vendors))
    for vendor_id in vendors:
        sent_list = set(data_df[data_df.VENDOR==vendor_id]['TEXT'].to_list())
        outer_list = []
        for index1, sent1 in enumerate(sent_list):
            inner_list = []
            for index2, sent2 in enumerate(sent_list):
                emb1 = model.encode(sent1)
                emb2 = model.encode(sent2)
                inner_list.append(util.cos_sim(emb1, emb2).cpu().detach().numpy()[0][0])
            outer_list.append(inner_list)
        vendor_dict[vendor_id] = np.array(outer_list).mean()
        pbar.update(1)
    
    pbar.close()
    return vendor_dict

In [None]:
east_dict = compute_similarity_for_vendors(east_df)
west_dict = compute_similarity_for_vendors(west_df)
north_dict = compute_similarity_for_vendors(north_df)
south_dict = compute_similarity_for_vendors(south_df)
central_dict = compute_similarity_for_vendors(central_df)

In [None]:
import pathlib
pathlib.Path('../pickled/').mkdir(parents=True, exist_ok=True) 

In [None]:
import pickle

with open('../pickled/east_ads_similarity_per_vendor_zero_shot.pickle', 'wb') as handle:
    pickle.dump(east_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../pickled/west_ads_similarity_per_vendor_zero_shot.pickle', 'wb') as handle:
    pickle.dump(west_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/north_ads_similarity_per_vendor_zero_shot.pickle', 'wb') as handle:
    pickle.dump(north_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/south_ads_similarity_per_vendor_zero_shot.pickle', 'wb') as handle:
    pickle.dump(south_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../pickled/central_ads_similarity_per_vendor_zero_shot.pickle', 'wb') as handle:
    pickle.dump(central_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)