# Importing libraries

In [2]:
import ast
import os
import re
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
# import cv2

# import torch
# from torch.utils.data import DataLoader
# from torchvision import datasets
# from facenet_pytorch import MTCNN, InceptionResnetV1

import warnings
warnings.filterwarnings('ignore')

# workers = 0 if os.name == 'nt' else 4

# Loading data

In [2]:
# Read the provided per-city ad dumps, one CSV per city, each into its own frame.
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/provided/chicago_data.csv",
        "../data/provided/atlanta_data.csv",
        "../data/provided/dallas_data.csv",
        "../data/provided/detroit_data.csv",
        "../data/provided/sanfransisco_data.csv",
        "../data/provided/newyork_data.csv",
        "../data/provided/houston_data.csv",
    )
)

In [3]:
# The NY frame gets its IMAGES column from a separate mapping file that lists
# an "Image Title" for each "Post ID".
provided_ny_id2imgmappings = pd.read_csv("../data/provided/ImgtoCSVmappings/NYC_ID2Image.csv")

# Build {post id: "img1|img2|..."} in a single groupby pass. The previous version
# re-filtered the whole mapping frame once per post id, which is quadratic in the
# size of the mapping. sort=False and .unique() keep first-appearance order.
id_dict = (
    provided_ny_id2imgmappings
    .groupby("Post ID", sort=False)["Image Title"]
    .agg(lambda titles: "|".join(titles.unique()))
    .to_dict()
)

# Attach the image list to each NY ad; ads without an entry in the mapping get NaN
# and are removed by the "no images" filter in the next cell.
provided_ny['IMAGES'] = provided_ny['ID'].map(id_dict)

In [4]:
# Keep only the ads that reference at least one image, restricted to the
# four columns the rest of the notebook works with.
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    city_ads.loc[city_ads['IMAGES'].notnull(), ['ID', 'TEXT', 'IMAGES', 'PHONES']]
    for city_ads in (
        provided_chicago,
        provided_atlanta,
        provided_dallas,
        provided_detroit,
        provided_sf,
        provided_ny,
        provided_houston,
    )
)

In [5]:
# PHONES arrives from the CSVs as text holding a Python literal (presumably a list of
# phone strings — confirm against the raw files). ast.literal_eval only parses
# literals, so a crafted cell value cannot execute arbitrary code the way eval() could.
for city_ads in (
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
):
    city_ads['PHONES'] = city_ads['PHONES'].apply(ast.literal_eval)

# Creating Vendor Communities

In [6]:
# Normalising raw phone strings
def clean_phones(phones):
    """Strip punctuation and whitespace from each phone string and drop short ones.

    Every character that is neither a word character nor whitespace (symbols,
    punctuation, emojis) is removed first, then all whitespace is removed.
    Letters, digits and underscores are kept as-is.

    Args:
        phones: iterable of phone strings.

    Returns:
        list of the normalised strings that are at least 7 characters long,
        in their original order.
    """
    normalised = (
        ''.join(re.sub(r'[^\w\s]', '', raw_phone).split())
        for raw_phone in phones
    )
    # Anything shorter than 7 characters cannot be a full US phone number
    return [number for number in normalised if len(number) >= 7]

In [7]:
# Add a CLEANED_PHONES column (normalised phone strings) to every city frame.
for city_ads in (
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
):
    city_ads["CLEANED_PHONES"] = city_ads["PHONES"].apply(clean_phones)

In [8]:
# Grouping phone numbers that co-occur in an ad into vendor communities
def connect_numbers(input_numbers):
    """Return the connected components of the phone co-occurrence graph.

    Consecutive numbers within one ad are joined by an edge, which is enough to
    put all numbers of that ad into the same component. Ads with fewer than two
    numbers contribute no edge, so their number does not appear in the graph
    unless another ad links it.

    Args:
        input_numbers: iterable of per-ad phone-number lists.

    Returns:
        list of sets, one set of phone numbers per connected component.
    """
    graph = nx.Graph()
    for ad_numbers in input_numbers:
        # zip() is empty for lists of length 0 or 1, so those add nothing
        graph.add_edges_from(zip(ad_numbers, ad_numbers[1:]))
    return list(nx.connected_components(graph))

In [9]:
# Giving every community a vendor id
def assign_ids2communities(communities):
    """Map each phone number to the position of its community in ``communities``.

    Args:
        communities: sequence of collections of phone numbers.

    Returns:
        dict of phone number -> community index (0-based). If a number occurs in
        several communities, the last one wins.
    """
    number2vendor = {}
    for vendor_id, community in enumerate(communities):
        number2vendor.update(dict.fromkeys(community, vendor_id))
    return number2vendor

In [10]:
# Translating the phone numbers of one ad into vendor ids
def map_ads2ids(phone_numbers, community2id_dict):
    """Look up the vendor id of every phone number of an ad.

    A number that is not yet in ``community2id_dict`` is registered on the fly
    with the current size of the dict as its id. The dict is therefore mutated
    in place and must be shared between calls to keep ids consistent.

    Args:
        phone_numbers: iterable of phone numbers belonging to one ad.
        community2id_dict: dict of phone number -> vendor id (updated in place).

    Returns:
        list of vendor ids, one per entry of ``phone_numbers``.
    """
    return [
        community2id_dict.setdefault(number, len(community2id_dict))
        for number in phone_numbers
    ]

In [11]:
# Labelling every ad with a vendor id
def generate_vendor_labels(df):
    """Add a VENDOR column derived from the CLEANED_PHONES column.

    Phone numbers are grouped into communities, each community gets an id, and
    every ad takes the id of its numbers. Ads without any cleaned phone number
    get NaN. ``df`` is modified in place and also returned.

    Args:
        df: frame with a CLEANED_PHONES column holding lists of phone numbers.

    Returns:
        the same frame with the VENDOR column set.
    """
    phone_lists = df["CLEANED_PHONES"].to_list()
    number2vendor = assign_ids2communities(connect_numbers(phone_lists))

    labels = []
    for ad_numbers in phone_lists:
        # map_ads2ids also registers numbers that were not part of any community
        vendor_ids = set(map_ads2ids(ad_numbers, number2vendor))
        labels.append(int(list(vendor_ids)[0]) if len(vendor_ids) > 0 else np.nan)

    df['VENDOR'] = labels
    return df

In [12]:
# Label every city frame with vendor ids and drop the phone columns afterwards.
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    generate_vendor_labels(city_ads)[['ID', 'TEXT', 'IMAGES', 'VENDOR']]
    for city_ads in (
        provided_chicago,
        provided_atlanta,
        provided_dallas,
        provided_detroit,
        provided_sf,
        provided_ny,
        provided_houston,
    )
)

In [13]:
# Keep only the ads that received a vendor id (VENDOR is NaN when an ad had no usable phone).
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    city_ads[city_ads['VENDOR'].notna()]
    for city_ads in (
        provided_chicago,
        provided_atlanta,
        provided_dallas,
        provided_detroit,
        provided_sf,
        provided_ny,
        provided_houston,
    )
)

In [14]:
# Remove rows that are exact duplicates across all remaining columns.
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    city_ads.drop_duplicates()
    for city_ads in (
        provided_chicago,
        provided_atlanta,
        provided_dallas,
        provided_detroit,
        provided_sf,
        provided_ny,
        provided_houston,
    )
)

# Removing vendors that don't have at least 2 ads

In [15]:
def remove_vendors_with_one_ad(df):
    """Keep only the ads of vendors that have at least two ads.

    The result is a new frame; ``df`` itself is left untouched. The previous
    version assigned a column on a filtered slice, which is a chained assignment
    that pandas flags with SettingWithCopyWarning.

    Args:
        df: frame with a VENDOR column.

    Returns:
        frame restricted to vendors with two or more rows, VENDOR cast to int.
    """
    # Number of ads of the row's own vendor, aligned to the rows of df
    ads_per_vendor = df['VENDOR'].map(df['VENDOR'].value_counts())
    kept = df[ads_per_vendor >= 2]
    # converting the vendor ids to int format
    return kept.assign(VENDOR=kept['VENDOR'].astype(int))

In [16]:
# Drop single-ad vendors from every city frame.
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    remove_vendors_with_one_ad(city_ads)
    for city_ads in (
        provided_chicago,
        provided_atlanta,
        provided_dallas,
        provided_detroit,
        provided_sf,
        provided_ny,
        provided_houston,
    )
)

# Detecting faces

In [32]:
# torch and facenet_pytorch are only needed for this face-detection stage. Their
# top-of-notebook imports are commented out, so without these two lines the cell
# raises NameError on a fresh kernel.
import torch
from facenet_pytorch import MTCNN

# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

# Loading the model
# NOTE(review): keep_all=True should make the detector report every face in an image
# rather than a single one — confirm against the facenet-pytorch documentation.
mtcnn = MTCNN(
    keep_all=True,
    device=device)

Running on device: cuda:0


In [35]:
# Flag, per ad image, whether a face is detected
def if_images_have_faces(df, city="chicago", image_root="/workspace/persistent/HTClipper/data/IMAGES"):
    """Add a FACES column telling, for every image of every ad, whether it shows a face.

    The IMAGES cell of a row is a "|"-separated list of file names. FACES is a
    "|"-separated list of the same length holding, per image:
      * "yes" - the detector returned at least one bounding box,
      * "no"  - the detector returned no boxes,
      * "-"   - the file is not in the city's image directory, or it could not be
                opened or processed (the error is printed).

    Relies on the module-level ``mtcnn`` detector. ``df`` is modified in place and
    also returned.

    Args:
        df: frame with an IMAGES column of "|"-separated file names.
        city: city name; lower-cased and used as the sub-directory of ``image_root``.
        image_root: directory holding one ``<city>/image`` folder per city.

    Returns:
        the same frame with the FACES column set.

    Raises:
        FileNotFoundError: if ``<image_root>/<city>/image`` does not exist.
    """
    # Directory names are lower-case, so normalise the city first
    city = city.lower()
    image_dir = os.path.join(image_root, city, "image")

    # One directory listing up front makes the per-image existence check a set lookup
    filenames = set(os.listdir(image_dir))

    def detect_face(image_name):
        """Return "yes", "no" or "-" for a single image file name."""
        if image_name not in filenames:
            return "-"
        image_path = os.path.join(image_dir, image_name)
        try:
            # Opening is inside the try as well: a corrupt or non-image file must
            # yield "-" for that image instead of aborting the whole run.
            with Image.open(image_path) as image:
                boxes, _ = mtcnn.detect(image)
        except Exception as e:
            print(f"Error processing image {image_name}: {e}")
            return "-"
        return "yes" if boxes is not None else "no"

    df["FACES"] = df["IMAGES"].apply(
        lambda image_names: "|".join(detect_face(name) for name in image_names.split('|'))
    )
    return df

In [None]:
# Run face detection for every city; the city name selects the image sub-directory.
(
    provided_chicago,
    provided_atlanta,
    provided_dallas,
    provided_detroit,
    provided_sf,
    provided_ny,
    provided_houston,
) = (
    if_images_have_faces(city_ads, city=city_name)
    for city_ads, city_name in (
        (provided_chicago, "chicago"),
        (provided_atlanta, "atlanta"),
        (provided_dallas, "dallas"),
        (provided_detroit, "detroit"),
        (provided_sf, "sf"),
        (provided_ny, "ny"),
        (provided_houston, "houston"),
    )
)

# Save data

In [None]:
# Write one processed CSV per city (the frame index is written as the first column).
for processed_ads, processed_csv in (
    (provided_chicago, "/workspace/persistent/HTClipper/data/processed/chicago.csv"),
    (provided_atlanta, "/workspace/persistent/HTClipper/data/processed/atlanta.csv"),
    (provided_dallas, "/workspace/persistent/HTClipper/data/processed/dallas.csv"),
    (provided_detroit, "/workspace/persistent/HTClipper/data/processed/detroit.csv"),
    (provided_sf, "/workspace/persistent/HTClipper/data/processed/sf.csv"),
    (provided_ny, "/workspace/persistent/HTClipper/data/processed/ny.csv"),
    (provided_houston, "/workspace/persistent/HTClipper/data/processed/houston.csv"),
):
    processed_ads.to_csv(processed_csv)

# Loading the Canada Data

In [129]:
# Load the raw Canadian ads from the provided Excel workbook.
# NOTE(review): the workbook is named "Colarado.xlsx" although it is used as the
# Canada dataset from here on — confirm this is the intended file.
canada_data = pd.read_excel("../data/provided/Colarado.xlsx")

In [130]:
# Drop the leftover index column from the export. Re-assigning instead of using
# inplace=True, together with errors="ignore", keeps the cell safe to re-run:
# the in-place version raised KeyError on the second execution.
canada_data = canada_data.drop(columns=["Unnamed: 0"], errors="ignore")

In [131]:
# Lower-case the city names so they match the keys of the province lookup.
# .str.lower() leaves missing cities as NaN, whereas calling x.lower() on each
# value raised AttributeError on the first NaN.
canada_data["city"] = canada_data["city"].str.lower()

In [132]:
# City / region labels (lower-case) grouped by the province or territory they are mapped to.
_cities_by_province = {
    'alberta': [
        'calgary', 'edmonton', 'lethbridge', 'banff/canmore', 'airdrie',
        'medicine hat', 'red deer',
        'lloydminster',  # Note: Lloydminster is split between Alberta and Saskatchewan
        'leduc', 'grande prairie', 'fort mcmurray', 'peace river country', 'edson',
    ],
    'ontario': [
        'mississauga', 'city of toronto', 'burlington', 'north york', 'greater-toronto',
        'richmond hill', 'vaughan', 'brampton', 'durham region', 'kitchener',
        'barrie', 'kingston', 'cornwall', 'pembroke', 'belleville',
        'peterborough', 'orillia', 'cambridge', 'london', 'guelph',
        'windsor', 'thunder bay', 'sault ste marie', 'ottawa', 'north bay',
        'sudbury', 'hamilton', 'niagara region', 'hamilton-niagara', 'brantford-woodstock',
        'scarborough',
    ],
    'british columbia': [
        'vancouver', 'burnaby/newwest', 'richmond', 'delta/surrey/langley', 'fraser valley',
        'north shore', 'tricities/pitt/maple', 'whistler', 'prince george', 'fort st. john',
        'skeena-bulkley', 'kamloops', 'vernon', 'kelowna', 'penticton',
        'kootenays', 'victoria', 'nanaimo', 'comox valley', 'sunshine coast',
        'metro-vancouver', 'vancouver-island', 'interior',
    ],
    'quebec': [
        'quebec city', 'montreal', 'laval', 'trois rivieres', 'sherbrooke',
    ],
    'saskatchewan': [
        'saskatoon', 'prince albert', 'north battleford',
    ],
    'manitoba': [
        'winnipeg', 'brandon',
    ],
    'new brunswick': [
        'dieppe', 'saint john', 'moncton', 'fredericton', 'nb - other areas',
        'new-brunswick', 'miramichi',
    ],
    'newfoundland and labrador': [
        "st. john's", 'labrador city', 'goose bay', 'lab - other areas',
        'newfoundland - other areas', 'labrador', 'newfoundland',
    ],
    'yukon': [
        'whitehorse', 'yukon - other areas', 'yukon',
    ],
    'northwest territories': [
        'yellowknife',
    ],
    'prince edward island': [
        'summerside', 'charlottetown', 'prince-edward',
    ],
    'nova scotia': [
        'halifax', 'cape breton - sydney', 'truro', 'halifax - other areas',
    ],
}

# Flat lookup: city / region label -> province
province_dict = {
    city_label: province_name
    for province_name, city_labels in _cities_by_province.items()
    for city_label in city_labels
}

In [133]:
# Look up the province of every ad from its city; cities missing from province_dict become NaN.
canada_data['province'] = canada_data['city'].map(province_dict)

In [134]:
# Ad text used downstream: title and body joined by an explicit separator token.
separator = ' [SEP] '
canada_data['TEXT'] = [
    f"{ad_title}{separator}{ad_body}"
    for ad_title, ad_body in zip(canada_data['title'], canada_data['text'])
]

In [135]:
# Keep only the columns used below and drop exact repeats of the same (TEXT, phone, province) row.
canada_data = canada_data[["TEXT", "phone", "province"]].drop_duplicates()

In [136]:
# Number of (de-duplicated) ads per phone number; the phone acts as the vendor key here.
vendor_freq_dict = dict(Counter(canada_data.phone))
# Phones that appear on fewer than two ads
non_relevant_vendor = [k for k, v in vendor_freq_dict.items() if v < 2]
# Drop every ad whose phone is in that list.
# NOTE(review): how rows with a missing phone are counted depends on how Counter hashes NaN — verify they are handled as intended.
canada_data = canada_data[~canada_data['phone'].isin(non_relevant_vendor)]

In [155]:
phone_nr = canada_data["phone"].to_list()

# Number the distinct phones 0, 1, 2, ... in order of first appearance.
# dict.fromkeys de-duplicates while preserving that order.
phone_dict = {
    distinct_phone: vendor_id
    for vendor_id, distinct_phone in enumerate(dict.fromkeys(phone_nr))
}

In [157]:
# Replace each phone by its integer vendor id from phone_dict.
canada_data['VENDOR'] = canada_data['phone'].map(phone_dict)

In [159]:
# Persist the Canadian ads; only TEXT, VENDOR and province are written (plus the frame index, which to_csv writes by default).
canada_data[["TEXT", "VENDOR", "province"]].to_csv("../data/processed/canada.csv")

# Fetching data for vendors with at least 3 ads

In [28]:
# Reload the processed per-city files and the Canadian file written by the cells above.
(
    chicago_df,
    atlanta_df,
    dallas_df,
    detroit_df,
    houston_df,
    ny_df,
    sf_df,
    canada_df,
) = (
    pd.read_csv(processed_csv)
    for processed_csv in (
        "../data/processed/chicago.csv",
        "../data/processed/atlanta.csv",
        "../data/processed/dallas.csv",
        "../data/processed/detroit.csv",
        "../data/processed/houston.csv",
        "../data/processed/ny.csv",
        "../data/processed/sf.csv",
        "../data/processed/canada.csv",
    )
)

In [29]:
# Canadian vendor ids start at 0 as well, so shift them past the largest id
# found in the seven US city frames.
us_city_frames = [chicago_df, atlanta_df, dallas_df, detroit_df, houston_df, ny_df, sf_df]
max_id_present = pd.concat(us_city_frames)["VENDOR"].unique().max() + 1
canada_df["VENDOR"] = canada_df["VENDOR"] + max_id_present

In [30]:
# Stack all city frames and the Canadian frame into one dataset with a common set of columns.
# canada.csv was written without IMAGES or FACES, so those two columns are NaN for the Canadian rows.
df = pd.concat([chicago_df, atlanta_df, dallas_df, detroit_df, houston_df, ny_df, sf_df, canada_df])[["TEXT", "VENDOR", "IMAGES", "FACES"]]

In [32]:
# Shuffle, renumber, de-duplicate and save the combined dataset.
# The shuffle is seeded so that all.csv comes out identical on every Restart & Run All;
# without random_state every run produced a different row order.
df.sample(frac=1, random_state=42).reset_index(drop=True).drop_duplicates().to_csv("../data/processed/all.csv")

# Creating the image dataset

In [16]:
# Start the image section from the processed files on disk, so this part of the
# notebook does not depend on frames modified in the previous section.
(
    chicago_df,
    atlanta_df,
    dallas_df,
    detroit_df,
    houston_df,
    ny_df,
    sf_df,
    canada_df,
) = (
    pd.read_csv(processed_csv)
    for processed_csv in (
        "../data/processed/chicago.csv",
        "../data/processed/atlanta.csv",
        "../data/processed/dallas.csv",
        "../data/processed/detroit.csv",
        "../data/processed/houston.csv",
        "../data/processed/ny.csv",
        "../data/processed/sf.csv",
        "../data/processed/canada.csv",
    )
)

In [17]:
# Tag every ad with the city it came from (the scalar is broadcast to all rows).
for tagged_ads, city_tag in (
    (chicago_df, "chicago"),
    (atlanta_df, "atlanta"),
    (dallas_df, "dallas"),
    (detroit_df, "detroit"),
    (houston_df, "houston"),
    (ny_df, "ny"),
    (sf_df, "sf"),
):
    tagged_ads["CITY"] = city_tag

In [18]:
# Group the city frames into four US regions.
# Midwest: Chicago + Detroit
midwest_df = pd.concat([chicago_df, detroit_df])
# Northeast: New York only — this is the same object as ny_df, not a copy
northeast_df = ny_df
# South: Atlanta + Houston + Dallas
south_df = pd.concat([atlanta_df, houston_df, dallas_df])
# West: San Francisco only — this is the same object as sf_df, not a copy
west_df = sf_df

In [24]:
# Write one CSV per region.
for region_ads, region_csv in (
    (midwest_df, "../data/processed/midwest.csv"),
    (northeast_df, "../data/processed/northeast.csv"),
    (south_df, "../data/processed/south.csv"),
    (west_df, "../data/processed/west.csv"),
):
    region_ads.to_csv(region_csv)

In [20]:
def load_image_dataset(df, city="chicago", image_root="/workspace/persistent/HTClipper/data/IMAGES"):
    """Expand an ad-level frame into one row per image file that exists on disk.

    Each ad carries "|"-separated IMAGES (file names) and FACES (one flag per
    image). Every image whose file is present in ``<image_root>/<city>/image``
    becomes one output row; images without a file are skipped.

    Args:
        df: frame with ID, VENDOR, IMAGES and FACES columns.
        city: sub-directory of ``image_root`` for this frame (used as given).
        image_root: directory holding one ``<city>/image`` folder per city.

    Returns:
        DataFrame with columns ID, IMAGE (full path), VENDOR and IF_FACE.

    Raises:
        AssertionError: if an ad has a different number of images and face flags.
    """
    # Define the directory containing the images
    image_dir = os.path.join(image_root, city, "image")

    # Initialize lists to store results
    id_list, image_list, vendor_list, face_list = [], [], [], []

    # Iterate over the dataframe with a progress bar
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Rows"):
        image_names = row["IMAGES"].split("|")
        face_flags = row["FACES"].split("|")
        assert len(image_names) == len(face_flags), "Mismatch in the number of images and faces"

        # Walk images and flags together so each flag stays attached to its image
        for image_name, face_flag in zip(image_names, face_flags):
            image_path = os.path.join(image_dir, image_name)
            # Skip images whose file is not on disk
            if not os.path.isfile(image_path):
                continue
            id_list.append(row["ID"])
            image_list.append(image_path)
            vendor_list.append(row["VENDOR"])
            face_list.append(face_flag)

    # Create the resulting dataframe
    return pd.DataFrame({
        'ID': id_list,
        'IMAGE': image_list,
        'VENDOR': vendor_list,
        'IF_FACE': face_list
    })

In [21]:
# Build the per-image frame of every city.
(
    df_chicago_image,
    df_atlanta_image,
    df_dallas_image,
    df_detroit_image,
    df_houston_image,
    df_ny_image,
    df_sf_image,
) = (
    load_image_dataset(city_ads, city_name)
    for city_ads, city_name in (
        (chicago_df, "chicago"),
        (atlanta_df, "atlanta"),
        (dallas_df, "dallas"),
        (detroit_df, "detroit"),
        (houston_df, "houston"),
        (ny_df, "ny"),
        (sf_df, "sf"),
    )
)

Processing Rows: 100%|██████████| 7011/7011 [00:17<00:00, 410.96it/s]
Processing Rows: 100%|██████████| 4951/4951 [00:10<00:00, 462.03it/s]
Processing Rows: 100%|██████████| 4043/4043 [00:10<00:00, 395.76it/s]
Processing Rows: 100%|██████████| 1553/1553 [00:03<00:00, 449.42it/s]
Processing Rows: 100%|██████████| 5094/5094 [00:12<00:00, 400.87it/s]
Processing Rows: 100%|██████████| 2599/2599 [00:07<00:00, 362.18it/s]
Processing Rows: 100%|██████████| 3262/3262 [00:07<00:00, 465.12it/s]


In [25]:
# Write the per-image frames grouped by region.
for region_images_csv, region_image_frames in (
    ("/workspace/persistent/HTClipper/data/processed/midwest_images.csv", [df_chicago_image, df_detroit_image]),
    ("/workspace/persistent/HTClipper/data/processed/northeast_images.csv", [df_ny_image]),
    ("/workspace/persistent/HTClipper/data/processed/south_images.csv", [df_atlanta_image, df_houston_image, df_dallas_image]),
    ("/workspace/persistent/HTClipper/data/processed/west_images.csv", [df_sf_image]),
):
    pd.concat(region_image_frames).to_csv(region_images_csv)

In [6]:
# Write the per-image frame of every city.
for city_images, city_images_csv in (
    (df_chicago_image, "chicago_images.csv"),
    (df_atlanta_image, "atlanta_images.csv"),
    (df_dallas_image, "dallas_images.csv"),
    (df_detroit_image, "detroit_images.csv"),
    (df_houston_image, "houston_images.csv"),
    (df_ny_image, "ny_images.csv"),
    (df_sf_image, "sf_images.csv"),
):
    city_images.to_csv(os.path.join("/workspace/persistent/HTClipper/data/processed", city_images_csv))

In [11]:
# Split every city's images into "face detected" and "no face detected" files.
# Images flagged "-" (missing or unreadable) end up in neither file.
for city_images, faces_csv, nofaces_csv in (
    (df_chicago_image, "chicago_faces.csv", "chicago_nofaces.csv"),
    (df_atlanta_image, "atlanta_faces.csv", "atlanta_nofaces.csv"),
    (df_dallas_image, "dallas_faces.csv", "dallas_nofaces.csv"),
    (df_detroit_image, "detroit_faces.csv", "detroit_nofaces.csv"),
    (df_houston_image, "houston_faces.csv", "houston_nofaces.csv"),
    (df_ny_image, "ny_faces.csv", "ny_nofaces.csv"),
    (df_sf_image, "sf_faces.csv", "sf_nofaces.csv"),
):
    with_face = city_images[city_images["IF_FACE"] == "yes"]
    with_face.to_csv(os.path.join("/workspace/persistent/HTClipper/data/processed", faces_csv))

    without_face = city_images[city_images["IF_FACE"] == "no"]
    without_face.to_csv(os.path.join("/workspace/persistent/HTClipper/data/processed", nofaces_csv))

In [12]:
# Stack the per-image frames of all seven cities into one frame.
all_df = pd.concat([df_chicago_image, df_atlanta_image, df_dallas_image, df_detroit_image, df_houston_image, df_ny_image, df_sf_image])

In [14]:
# Shuffling the dataset. The seed is fixed so the row order, and therefore the
# CSVs written from all_df, can be reproduced; the unseeded shuffle differed on every run.
all_df = all_df.sample(frac=1, random_state=42)

In [16]:
# Write the combined image set three ways: faces only, no-faces only, and everything.
has_face_mask = all_df["IF_FACE"] == "yes"
no_face_mask = all_df["IF_FACE"] == "no"

all_df[has_face_mask].to_csv(os.path.join("/workspace/persistent/HTClipper/data/processed", "all_faces.csv"))
all_df[no_face_mask].to_csv(os.path.join("/workspace/persistent/HTClipper/data/processed", "all_nofaces.csv"))
all_df.to_csv(os.path.join("/workspace/persistent/HTClipper/data/processed", "all_images.csv"))