# Importing libraries

In [None]:
import os
from collections import Counter, defaultdict

import pandas as pd
from tqdm import tqdm

import pickle

import networkx 
from networkx.algorithms.components.connected import connected_components

import scipy
import spacy

import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Make 'iframe' the default Plotly renderer for figures shown in this notebook.
import plotly.io as pio
pio.renderers.default = 'iframe'

In [None]:
# Load the spaCy "en_core_web_trf" pipeline once; find_pos_dep and find_ner
# further down call it through this module-level `nlp` name.
nlp = spacy.load("en_core_web_trf")

# Importing structured data

In [8]:
# Read the five regional "structured" ad tables in one pass.
# low_memory=False makes pandas parse each file in a single chunk.
east_df, west_df, north_df, south_df, central_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "../data/structured/TEXT/east.csv",
        "../data/structured/TEXT/west.csv",
        "../data/structured/TEXT/north.csv",
        "../data/structured/TEXT/south.csv",
        "../data/structured/TEXT/central.csv",
    )
)

# Visualizing data

In [5]:
def get_image_count(df):
    """
    Count the distinct images referenced in the IMAGES column of a dataframe.

    param df : Dataframe with an 'IMAGES' column; every non-null cell must be the
             : string form of a Python literal collection (e.g. "['a.jpg', 'b.jpg']")
    returns  : Number of unique image entries across all rows (0 when every cell is null)
    raises   : ValueError / SyntaxError when a cell is not a valid Python literal
    """
    # Local import keeps this cell self-contained.
    import ast

    unique_images = set()
    for cell in df['IMAGES'].dropna():
        # literal_eval only parses literals, so text coming from the CSV can never
        # execute code the way a plain eval() would.
        unique_images.update(ast.literal_eval(cell))
    return len(unique_images)

In [6]:
# Columns kept for the statistics and plots below.
_AD_COLUMNS = ["TEXT", "IMAGES", "PHONES", "CITY", "VENDOR"]

# Drop rows whose VENDOR equals 0, keep only the columns above, then remove exact duplicates.
# NOTE(review): presumably VENDOR == 0 marks ads without a known vendor - confirm.
east_df = east_df.loc[east_df.VENDOR != 0, _AD_COLUMNS].drop_duplicates()
west_df = west_df.loc[west_df.VENDOR != 0, _AD_COLUMNS].drop_duplicates()
north_df = north_df.loc[north_df.VENDOR != 0, _AD_COLUMNS].drop_duplicates()
south_df = south_df.loc[south_df.VENDOR != 0, _AD_COLUMNS].drop_duplicates()
central_df = central_df.loc[central_df.VENDOR != 0, _AD_COLUMNS].drop_duplicates()

In [17]:
# Spot-check the raw text of a single ad (row label 764).
east_df.loc[764, 'TEXT']

' Hey Fellas Im Jovi Im Available From Now Until Whenever My Rates Are Very Reasonable Im Available For In-Call In Hazleton Only Pictures Sent As Requested, Very Clean Private And Discrete Location Call Or Text For More Info 570-710-1502 Posters age: 21 • Location: Hazleton "In-Call" Only, Scranton • Post ID: 32881137 scranton email to friend [SEP]  Young Hot And Ready For Fun  [SEP]  Come Spend Time With Little Fun-Sized Jovi?????? - Scranton escorts - backpage.com'

In [7]:
# Regional tables in the same order as the "Demographics" labels below.
region_tables = [east_df, west_df, north_df, south_df, central_df]

# One summary column per statistic, one entry per region.
stat_dict = {
    "Demographics": ['east', 'west', 'north', 'south', 'central'],
    # Number of rows (ads) in each table
    "TEXT ADS": [table.shape[0] for table in region_tables],
    # Number of distinct VENDOR values
    "Vendors": [table.VENDOR.nunique() for table in region_tables],
    # Number of distinct images referenced in the IMAGES column
    "IMAGES": [get_image_count(table) for table in region_tables],
    # Rows with no missing value in any column
    "Multimodal": [table.dropna().shape[0] for table in region_tables],
}

In [8]:
# Tabulate the per-region statistics and display the table as the cell output.
df = pd.DataFrame(data=stat_dict)
df

Unnamed: 0,Demographics,TEXT ADS,Vendors,IMAGES,Multimodal
0,east,60814,5028,14212,2319
1,west,48625,2575,12664,2740
2,north,3581,253,5552,1251
3,south,39426,2290,59294,11786
4,central,38571,2927,28739,6024


In [7]:
# Disabled export: the to_csv calls below are wrapped in a string literal, so this cell
# writes nothing; it only echoes the string as its output.
"""east_df.to_csv("../data/structured/TEXT/east.csv")
west_df.to_csv("../data/structured/TEXT/west.csv")
north_df.to_csv("../data/structured/TEXT/north.csv")
south_df.to_csv("../data/structured/TEXT/south.csv")
central_df.to_csv("../data/structured/TEXT/central.csv")"""

'east_df.to_csv("../data/structured/TEXT/east.csv")\nwest_df.to_csv("../data/structured/TEXT/west.csv")\nnorth_df.to_csv("../data/structured/TEXT/north.csv")\nsouth_df.to_csv("../data/structured/TEXT/south.csv")\ncentral_df.to_csv("../data/structured/TEXT/central.csv")'

In [8]:
def generate_violin_plot(data_df_dict, visualize="class-freq"):
    """
    Draw a split violin plot with, for every region, the distribution of ads per vendor
    (left half, red) and the distribution of ad lengths in space-separated tokens
    (right half, green).

    param data_df_dict : Dictionary of all dataframes
                       : should have east, west, north, south, and central as the keys to the associated dataframes
                       : every dataframe needs a VENDOR and a TEXT column
    param visualize : Accepted for backward compatibility but not read by the function;
                    : both distributions are always drawn
    """
    regions = ("east", "west", "north", "south", "central")

    demographics, features, value = [], [], []
    for region in regions:
        region_df = data_df_dict[region]

        # Number of ads per vendor, ordered by vendor id
        class_freq = [count for _, count in sorted(Counter(region_df.VENDOR).items())]
        # Length of every ad, measured in space-separated tokens
        sent_len = [len(sent.split(" ")) for sent in region_df.TEXT.to_list()]

        value += class_freq + sent_len
        features += ["class-freq"] * len(class_freq) + ["sent-len"] * len(sent_len)
        demographics += [region.capitalize()] * (len(class_freq) + len(sent_len))

    # Long-form dataframe: one row per observation, tagged with its region and feature
    df = pd.DataFrame({"demo": demographics, "features": features, "value": value})

    # Plotting figure: one half-violin per feature, sharing the region axis
    half_violins = (
        # (feature, legend/scale group, side, line colour)
        ("class-freq", "Yes", "negative", "red"),
        ("sent-len", "No", "positive", "green"),
    )
    fig = go.Figure()
    for feature, group, side, colour in half_violins:
        selected = df["features"] == feature
        fig.add_trace(go.Violin(x=df.loc[selected, "demo"],
                                y=df.loc[selected, "value"],
                                legendgroup=group, scalegroup=group, name=feature,
                                side=side,
                                line_color=colour)
                     )
    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')

    fig.update_layout(
        title="Violin plot indicating class-frequency and sentence-len distributions for all datasets",
        xaxis_title="Demographics",
        yaxis_title="Frequency",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="Black"
        ))

    fig.show('iframe')

In [9]:
# Map each region key expected by generate_violin_plot to its dataframe, then plot.
frames_by_region = {
    "east": east_df,
    "west": west_df,
    "north": north_df,
    "south": south_df,
    "central": central_df,
}
generate_violin_plot(frames_by_region)

# Finding common vendors

In [10]:
def find_common_vendors(data1_df, data2_df):
    """Return the set of VENDOR values that occur in both dataframes."""
    vendors_in_first = set(data1_df.VENDOR.to_list())
    vendors_in_second = set(data2_df.VENDOR.to_list())
    return vendors_in_first.intersection(vendors_in_second)

In [11]:
all_data = [east_df, west_df, north_df, south_df, central_df]

# outer_list[i][j] holds the number of vendors shared by region i and region j
# (the diagonal is each region's own number of distinct vendors).
outer_list = [
    [len(find_common_vendors(row_df, col_df)) for col_df in all_data]
    for row_df in all_data
]

In [12]:
# Heatmap of the region-by-region shared-vendor counts computed in outer_list.
# The colour-bar label names what the cells contain (numbers of common vendors).
region_labels = ['EAST', 'WEST', 'NORTH', 'SOUTH', 'CENTRAL']
fig = px.imshow(outer_list, text_auto=True, aspect="auto",
                labels=dict(x="Demographics", y="Demographics", color="Common vendors"),
                x=region_labels,
                y=region_labels
               )

fig.show('iframe')

# Loading Processed data

In [10]:
# Rebind the regional names to the "processed" versions of the ad tables.
# low_memory=False makes pandas parse each file in a single chunk.
east_df, west_df, north_df, south_df, central_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "../data/processed/TEXT/east.csv",
        "../data/processed/TEXT/west.csv",
        "../data/processed/TEXT/north.csv",
        "../data/processed/TEXT/south.csv",
        "../data/processed/TEXT/central.csv",
    )
)

In [4]:
def find_pos_dep(data_df):
    """
    Group the token texts of every ad by part-of-speech tag and by dependency label.

    param data_df : Dataframe with a TEXT column; each entry is run through the global `nlp` pipeline
    returns       : (pos, dep) - two plain dicts mapping a tag / label to the list of token texts seen with it
    """
    tokens_by_pos = defaultdict(list)
    tokens_by_dep = defaultdict(list)
    texts = data_df.TEXT.to_list()

    # tqdm shows one progress tick per processed ad
    for text in tqdm(texts, total=len(texts)):
        for token in nlp(text):
            tokens_by_pos[token.pos_].append(token.text)
            tokens_by_dep[token.dep_].append(token.text)

    return dict(tokens_by_pos), dict(tokens_by_dep)

In [None]:
# pos_dict_north, dep_dict_north = find_pos_dep(north_df)
# pos_dict_east, dep_dict_east = find_pos_dep(east_df)
# pos_dict_west, dep_dict_west = find_pos_dep(west_df)
# pos_dict_south, dep_dict_south = find_pos_dep(south_df)
# pos_dict_central, dep_dict_central = find_pos_dep(central_df)

 40%|████      | 16311/40534 [21:53<44:24,  9.09it/s]  

In [None]:
# Disabled export: the pickle.dump calls below are wrapped in a string literal, so this
# cell writes nothing. The '../pickled/*.pickle' files it names are read back further down.
"""with open('../pickled/pos_dict_north.pickle', 'wb') as handle:
    pickle.dump(pos_dict_north, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/dep_dict_north.pickle', 'wb') as handle:
    pickle.dump(dep_dict_north, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/pos_dict_east.pickle', 'wb') as handle:
    pickle.dump(pos_dict_east, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../pickled/dep_dict_east.pickle', 'wb') as handle:
    pickle.dump(dep_dict_east, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/pos_dict_west.pickle', 'wb') as handle:
    pickle.dump(pos_dict_west, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/dep_dict_west.pickle', 'wb') as handle:
    pickle.dump(dep_dict_west, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/pos_dict_south.pickle', 'wb') as handle:
    pickle.dump(pos_dict_south, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/dep_dict_south.pickle', 'wb') as handle:
    pickle.dump(dep_dict_south, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/pos_dict_central.pickle', 'wb') as handle:
    pickle.dump(pos_dict_central, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/dep_dict_central.pickle', 'wb') as handle:
    pickle.dump(dep_dict_central, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

In [7]:
def find_ner(data_df):
    """
    Group the named-entity spans found in every ad by entity label.

    param data_df : Dataframe with a TEXT column; each entry is run through the global `nlp` pipeline
    returns       : Plain dict mapping an entity label to the list of entity texts seen with it
    """
    entities_by_label = defaultdict(list)
    texts = data_df.TEXT.to_list()

    # tqdm shows one progress tick per processed ad
    for text in tqdm(texts, total=len(texts)):
        for entity in nlp(text).ents:
            entities_by_label[entity.label_].append(entity.text)

    return dict(entities_by_label)

In [None]:
# Disabled step: the find_ner calls and pickle.dump calls below are wrapped in a string
# literal, so this cell computes and writes nothing.
"""ner_dict_north = find_ner(north_df)
ner_dict_east = find_ner(east_df)
ner_dict_west = find_ner(west_df)
ner_dict_south = find_ner(south_df)
ner_dict_central = find_ner(central_df)

with open('../pickled/ner_dict_north.pickle', 'wb') as handle:
    pickle.dump(ner_dict_north, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/ner_dict_east.pickle', 'wb') as handle:
    pickle.dump(ner_dict_east, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/ner_dict_west.pickle', 'wb') as handle:
    pickle.dump(ner_dict_west, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/ner_dict_south.pickle', 'wb') as handle:
    pickle.dump(ner_dict_south, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../pickled/ner_dict_central.pickle', 'wb') as handle:
    pickle.dump(ner_dict_central, handle, protocol=pickle.HIGHEST_PROTOCOL)"""

# Loading pickled POS, dependency, and NER dictionaries

In [None]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # pickle.load can execute arbitrary code; only use it on files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Part-of-speech dictionaries
pos_dict_north = _load_pickle('../pickled/pos_dict_north.pickle')
pos_dict_south = _load_pickle('../pickled/pos_dict_south.pickle')
pos_dict_east = _load_pickle('../pickled/pos_dict_east.pickle')
pos_dict_west = _load_pickle('../pickled/pos_dict_west.pickle')
pos_dict_central = _load_pickle('../pickled/pos_dict_central.pickle')

# Dependency-label dictionaries
dep_dict_north = _load_pickle('../pickled/dep_dict_north.pickle')
dep_dict_south = _load_pickle('../pickled/dep_dict_south.pickle')
dep_dict_east = _load_pickle('../pickled/dep_dict_east.pickle')
dep_dict_west = _load_pickle('../pickled/dep_dict_west.pickle')
dep_dict_central = _load_pickle('../pickled/dep_dict_central.pickle')

# Named-entity dictionaries
ner_dict_north = _load_pickle('../pickled/ner_dict_north.pickle')
ner_dict_south = _load_pickle('../pickled/ner_dict_south.pickle')
ner_dict_east = _load_pickle('../pickled/ner_dict_east.pickle')
ner_dict_west = _load_pickle('../pickled/ner_dict_west.pickle')
ner_dict_central = _load_pickle('../pickled/ner_dict_central.pickle')

In [None]:
def get_dict_density(tag_dict):
    """
    Convert a tag -> list-of-items mapping into a tag -> relative-frequency mapping.

    param tag_dict : Dictionary whose values are sized collections (e.g. lists of token texts)
    returns        : New dictionary with the same keys; each value is that key's item count
                   : divided by the total item count over all keys. When the total is zero
                   : (every collection is empty) every density is 0.0. An empty input gives {}.
    """
    counts = {tag: len(items) for tag, items in tag_dict.items()}
    # Compute the total once instead of once per key.
    total = sum(counts.values())
    if total == 0:
        # Avoid dividing by zero when there is nothing to count.
        return {tag: 0.0 for tag in counts}
    return {tag: count / total for tag, count in counts.items()}

In [None]:
# Replace every tag -> items dictionary with its tag -> relative-frequency version.
# NOTE: the names are rebound in place, so re-running this cell without first reloading
# the pickles fails (get_dict_density calls len() on each value).
(pos_dict_west, pos_dict_east, pos_dict_north,
 pos_dict_south, pos_dict_central) = map(
    get_dict_density,
    (pos_dict_west, pos_dict_east, pos_dict_north, pos_dict_south, pos_dict_central),
)

(dep_dict_west, dep_dict_east, dep_dict_north,
 dep_dict_south, dep_dict_central) = map(
    get_dict_density,
    (dep_dict_west, dep_dict_east, dep_dict_north, dep_dict_south, dep_dict_central),
)

(ner_dict_west, ner_dict_east, ner_dict_north,
 ner_dict_south, ner_dict_central) = map(
    get_dict_density,
    (ner_dict_west, ner_dict_east, ner_dict_north, ner_dict_south, ner_dict_central),
)

In [None]:
# Bar chart of part-of-speech tag densities, one trace per region.
pos_traces = [
    # (legend name, tag -> density dictionary, bar colour)
    ("East", pos_dict_east, "red"),
    ("West", pos_dict_west, "blue"),
    ("North", pos_dict_north, "green"),
    ("South", pos_dict_south, "orange"),
    ("Central", pos_dict_central, "black"),
]

fig = go.Figure()
for region_name, density, colour in pos_traces:
    # x and y come from the same dictionary so every tag is paired with its own value.
    fig.add_trace(go.Bar(x=list(density.keys()), y=list(density.values()),
                         name=region_name, marker_color=colour, opacity=0.3))
fig.show()

In [None]:
# Bar chart of dependency-label densities, one trace per region.
dep_traces = [
    # (legend name, label -> density dictionary, bar colour)
    ("East", dep_dict_east, "red"),
    ("West", dep_dict_west, "blue"),
    ("North", dep_dict_north, "green"),
    ("South", dep_dict_south, "orange"),
    ("Central", dep_dict_central, "black"),
]

fig = go.Figure()
for region_name, density, colour in dep_traces:
    # x and y come from the same dictionary so every label is paired with its own value.
    fig.add_trace(go.Bar(x=list(density.keys()), y=list(density.values()),
                         name=region_name, marker_color=colour, opacity=0.3))
fig.show()

In [None]:
# Bar chart of named-entity label densities, one trace per region.
ner_traces = [
    # (legend name, entity label -> density dictionary, bar colour)
    ("East", ner_dict_east, "red"),
    ("West", ner_dict_west, "blue"),
    ("North", ner_dict_north, "green"),
    ("South", ner_dict_south, "orange"),
    ("Central", ner_dict_central, "black"),
]

fig = go.Figure()
for region_name, density, colour in ner_traces:
    # x and y come from the same dictionary so every label is paired with its own value.
    fig.add_trace(go.Bar(x=list(density.keys()), y=list(density.values()),
                         name=region_name, marker_color=colour, opacity=0.3))
fig.show()