# Importing libraries

In [1]:
import os
import pandas as pd

from collections import Counter

from sentence_transformers import SentenceTransformer
from sentence_transformers import util

import plotly.express as px

# Importing data

In [2]:
east_df = pd.read_csv("../data/structured/TEXT/east.csv", low_memory=False)
west_df = pd.read_csv("../data/structured/TEXT/west.csv", low_memory=False)
north_df = pd.read_csv("../data/structured/TEXT/north.csv", low_memory=False)
south_df = pd.read_csv("../data/structured/TEXT/south.csv", low_memory=False)
central_df = pd.read_csv("../data/structured/TEXT/central.csv", low_memory=False)

# Loading model

In [4]:
model = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")

# Calculating similarity through existing style embedding models on raw data

In [6]:
def generate_heatmap(data_df, vendor_id):
    phones = set(data_df[data_df.VENDOR==vendor_id]['PHONES'].to_list())
    phones = [eval(phone) for phone in phones]
    phones = [item for sublist in phones for item in sublist]
    print("All phone numbers:", phones)
        
    sent_list = set(data_df[data_df.VENDOR==vendor_id]['TEXT'].to_list())
    outer_list = []
    for index1, sent1 in enumerate(sent_list):
        inner_list = []
        for index2, sent2 in enumerate(sent_list):
            emb1 = model.encode(sent1)
            emb2 = model.encode(sent2)
            inner_list.append(util.cos_sim(emb1, emb2).cpu().detach().numpy()[0][0])
        outer_list.append(inner_list)

    fig = px.imshow(outer_list, text_auto=True, aspect="auto")
    fig.show('iframe')

In [7]:
generate_heatmap(west_df, 63072)

All phone numbers: ['814-602-6807']


In [8]:
generate_heatmap(west_df, 63076)

All phone numbers: ['8145049661']


In [9]:
generate_heatmap(west_df, 63081)

All phone numbers: ['814 923 6153', '814 920 8940', '814 920 8940']


# Calculating similarity through existing style embedding models on processed data

In [10]:
def process_data(data_df, vendor_id):
    data_df = data_df[data_df.VENDOR == vendor_id]
    
    # Getting all the phone numbers
    phones = set(data_df['PHONES'].to_list())
    phones = [eval(phone) for phone in phones]
    phones = [item for sublist in phones for item in sublist]
    
    # Getting all the cities
    cities = set(data_df['CITY'].to_list())
    
    # cleaning the text
    data_df.TEXT = data_df.TEXT.apply(lambda x: clean_text(x, cities, phones))
    return data_df
    
    
def clean_text(sent, city, phone):
    sent = sent.split(" ")
    city = [place.lower() for place in city]
    # substituing all the names of the cities with ####
    sent = [text.replace(text, "####") if any(text.lower() in sub for sub in city) else text for text in sent]
    # replacing all the numbers in phone numbers by Z
    sent = [text.replace("1", "Z").replace("2", "Z").replace("3", "Z").replace("4", "Z").replace("5", "Z").replace("6", "Z").replace("7", "Z").replace("8", "Z").replace("9", "Z").replace("0", "Z") if any(text in sub for sub in phone) else text for text in sent]
    sent = " ".join(sent)
    # replacing "escorts - backpage.com", "backpage.com", and for ex "erie.backpage.com" to ""
    sent = sent.replace("escorts - backpage.com", "").replace("backpage.com", "")
    for place in city:
        sent = sent.replace(place.lower() + ".backpage.com", "")
    return sent

In [11]:
sample_df = process_data(west_df, 63072)
generate_heatmap(sample_df, 63072)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All phone numbers: ['814-602-6807']


In [12]:
sample_df = process_data(west_df, 63076)
generate_heatmap(sample_df, 63076)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All phone numbers: ['8145049661']


In [13]:
sample_df = process_data(west_df, 63081)
generate_heatmap(sample_df, 63081)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



All phone numbers: ['814 923 6153', '814 920 8940', '814 920 8940']
