In [1]:
import pandas as pd
import altair as alt
from vega_datasets import data

In [2]:
# Per-state centre coordinates, used later to place one marker per state.
states_geo = pd.read_csv("data/state_centers.csv")

# Index once by state code, then pull each coordinate column out as a
# plain {state: value} dict suitable for Series.map.
_centers_by_state = states_geo.set_index("state")
lat_dict = _centers_by_state["lat"].to_dict()
long_dict = _centers_by_state["long"].to_dict()

In [3]:
# State postal abbreviation (STUSAB) -> FIPS code (STATE_FIPS) lookup table.
states = pd.read_csv("data/StatesFIPSCodes.csv")
fips_dict = states.set_index("STUSAB")["STATE_FIPS"].to_dict()

In [4]:
# Ordered dtype for the search-count buckets, smallest bucket first.
cat_type = pd.CategoricalDtype(
    categories=[
        "1-5",
        "6-10",
        "11-50",
        "51-100",
        "101-500",
        "501-1000",
        "1001-5000",
        "5001+",
    ],
    ordered=True,
)

# Load the scraped table and enrich it in one chain:
#   * the CSV header uses positional names "0".."3", so give them real names
#   * drop the row labelled 1528
#     NOTE(review): the reason this row is excluded is not recorded here --
#     presumably a malformed record; confirm against the scraper output.
#   * cast the bucket label to the ordered categorical dtype
#   * attach the FIPS id and centre coordinates looked up from the state code
#     (states missing from a lookup dict become NaN)
df = (
    pd.read_csv('../web_scraper/data/buzzfeed_cvai.csv')
    .rename(columns={"0": 'org', "1": 'state', "2": 'search_count', "3": 'comment'})
    .drop(index=1528)
    .astype({'search_count': cat_type})
    .assign(
        id=lambda frame: frame["state"].map(fips_dict),
        lat=lambda frame: frame["state"].map(lat_dict),
        long=lambda frame: frame["state"].map(long_dict),
    )
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1803 entries, 0 to 1803
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   org           1803 non-null   object  
 1   state         1803 non-null   object  
 2   search_count  1803 non-null   category
 3   comment       1803 non-null   object  
 4   id            1721 non-null   float64 
 5   lat           1720 non-null   float64 
 6   long          1720 non-null   float64 
dtypes: category(1), float64(3), object(3)
memory usage: 100.7+ KB


In [5]:
# Bucket labels are strings ("1-5", "11-50", ...). Without an explicit sort
# the axis order depends on how Altair infers the field type and can come out
# lexicographic ("1-5", "1001-5000", "101-500", ...), so take the intended
# order from the ordered categorical dtype assigned when df was built.
bucket_order = df["search_count"].cat.categories.tolist()

# Number of organisations in each search-count bucket.
search_count = alt.Chart(df).mark_bar().encode(
    x=alt.X("search_count:O", sort=bucket_order),
    y="count(search_count)"
).properties(
    title="Organisations per search-count bucket"
)

# Number of organisations per state.
state_count = alt.Chart(df).mark_bar().encode(
    x="state",
    y="count(state)"
).properties(
    title="Organisations per state"
)

# Show both bar charts side by side.
search_count | state_count

In [6]:
# State outlines from the vega_datasets US TopoJSON file.
state_bounds = alt.topo_feature(data.us_10m.url, "states")

# One row per FIPS id with the number of rows (organisations) in that state.
source = df.groupby('id').size().reset_index(name="cnt")

# Choropleth: join the per-state counts onto the shapes by "id" and colour
# each state by its count.
alt.Chart(state_bounds).mark_geoshape(stroke="grey").transform_lookup(
    lookup="id",
    from_=alt.LookupData(data=source, key="id", fields=["cnt"]),
).encode(
    color=alt.Color('cnt:Q'),
).properties(
    width=500,
    height=300,
).project(
    type='albersUsa'
)

In [14]:
# Base layer: flat dark-grey states with white borders.
background = (
    alt.Chart(state_bounds)
    .mark_geoshape(fill='#4C5454', stroke='white')
    .project('albersUsa')
    .properties(width=500, height=300)
)

# Overlay: one circle per state, sized by the number of rows for that state.
# lat/long were mapped from the state code, so the per-state mean is simply
# that state's centre point.
points = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        longitude=alt.Longitude("longitude:Q"),
        latitude=alt.Latitude("latitude:Q"),
        size=alt.Size('count:Q', title='Number of Orgs.'),
        color=alt.value('#FF715B'),
        tooltip=[alt.Tooltip("state:N"), alt.Tooltip("count:Q")],
    )
    .transform_aggregate(
        latitude='mean(lat)',
        longitude='mean(long)',
        count='count()',
        groupby=['state'],
    )
    .properties(title="Number of Orgs with Clearview AI Trials")
)

alt.layer(background, points)

In [35]:
# Horizontal bar chart of search-count buckets for Alabama only.
# The bucket axis is the y channel, so it must be built with alt.Y (the
# original used alt.X here). The explicit sort keeps the string labels in
# numeric bucket order rather than alphabetical order.
alt.Chart(df[df.state == "AL"]).mark_bar().encode(
    x="count(search_count)",
    y=alt.Y(
        "search_count:O",
        sort=["1-5", "6-10", "11-50", "51-100", "101-500", "501-1000", "1001-5000", "5001+"],
    ),
)

In [19]:
# Distinct bucket labels present in the data.
df["search_count"].unique()

array(['11-50', '51-100', '6-10', '101-500', '1-5', '1001-5000',
       '501-1000', '5001+'], dtype=object)

In [23]:
# Ordered categorical dtype for the search-count buckets, smallest first.
cat_type = pd.CategoricalDtype(
    categories=[
        "1-5",
        "6-10",
        "11-50",
        "51-100",
        "101-500",
        "501-1000",
        "1001-5000",
        "5001+",
    ],
    ordered=True,
)
# Preview the cast only; the result is displayed, not assigned back to df.
df["search_count"].astype(cat_type)

In [16]:
# Comments recorded for Illinois organisations.
df.loc[df["state"] == "IL", "comment"]

638    This organization did not respond to a request...
639    This organization did not respond to a request...
640    This organization did not respond to a request...
641    This organization did not respond to a request...
642    “We tried out the free version trial. We stopp...
                             ...                        
732    This organization did not respond to a request...
733    This organization did not respond to a request...
734    “We do not have any facial recognition tools o...
735    “We used Clearview AI on a trial basis as an i...
736    This organization did not respond to a request...
Name: comment, Length: 99, dtype: object

In [5]:
# Display the enriched frame (org, state, search_count, comment, id, lat, long)
# as this cell's output; pandas abbreviates the middle rows when rendering.
df

Unnamed: 0,org,state,search_count,comment,id,lat,long
0,Alaska Scientific Crime Detection Laboratory,AK,11-50,“The DPS is only aware of one employee that re...,2.0,63.588753,-154.493062
1,Anchorage Police Department,AK,11-50,This organization did not respond to a request...,2.0,63.588753,-154.493062
2,City of Anchorage,AK,51-100,This organization did not respond to a request...,2.0,63.588753,-154.493062
3,State of Alaska,AK,11-50,"“To the best of our knowledge, we cannot verif...",2.0,63.588753,-154.493062
4,Ted Stevens Anchorage International Airport,AK,51-100,"“I’ve consulted all of staff at ANC, and there...",2.0,63.588753,-154.493062
...,...,...,...,...,...,...,...
1799,West Virginia University,WV,11-50,This organization did not respond to a request...,54.0,38.597626,-80.454903
1800,Albany County Sheriff’s Office,WY,1-5,This organization did not respond to a request...,56.0,43.075968,-107.290284
1801,Rock Springs Police Department,WY,51-100,This organization did not respond to a request...,56.0,43.075968,-107.290284
1802,Wyoming Division of Criminal Investigation,WY,11-50,“We accepted a free trial and used it to see w...,56.0,43.075968,-107.290284


In [10]:
# Write the enriched table to disk as a JSON array with one object per row.
df.to_json(
    path_or_buf="data/clearview_ai_data.json",
    orient="records",
)