# Measuring Opioid Stigma Using the Twitter API and NLP

More than 100 Americans die each day from [opioid overdoses](https://www.cdc.gov/drugoverdose/epidemic/index.html). Expanding access to [medication-assisted treatment (MAT)](http://adai.uw.edu/pubs/infobriefs/MAT.pdf) has the potential to help reverse the epidemic. However, [efforts to expand MAT have been blocked due to a stigmatized view of opioid addiction](https://www.vox.com/science-and-health/2017/7/20/15937896/medication-assisted-treatment-methadone-buprenorphine-naltrexone) as a moral failing rather than a medical condition.

Despite its importance, very little data is available on opioid-related stigma. This is in part because stigma is difficult to measure with traditional tools such as surveys, which may underestimate the pervasiveness of stigma due to [social desirability bias](https://en.wikipedia.org/wiki/Social_desirability_bias).

A potentially valuable source of data to fill this gap is unstructured text data from Twitter, which is [less likely to suffer from social desirability bias](http://journals.sagepub.com/doi/abs/10.1177/0049124115605339) than traditional surveys.

## Scraping Twitter

In order to tap into this data source, we can set up a listener that uses Twitter's [Streaming API](https://developer.twitter.com/en/docs/tweets/filter-realtime/overview) to track opioid-related tweets.

In [None]:
"""Twitter scraper

This module scrapes tweets from the Twitter Streaming API and saves them to a
PostgreSQL database.

Hat tip: https://www.dataquest.io/blog/streaming-data-python/

"""

import logging
import tweepy
import dataset
from sqlalchemy.exc import ProgrammingError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from textblob import TextBlob
import settings

# Route log records at DEBUG level and above to a file named after this
# module's __name__. filemode='w' truncates that file on every start, so each
# run begins with a fresh log.
# NOTE(review): when this file is run as a script __name__ is "__main__", so
# the file is logs/__main__.log; the logs/ directory presumably has to exist
# already -- confirm.
logging.basicConfig(
    filename=f"logs/{__name__}.log",
    level=logging.DEBUG,
    format="%(name)s - %(asctime)s - %(levelname)s - %(message)s",
    filemode='w')
# getLogger() without a name returns the root logger.
logger = logging.getLogger()
logger.info('Starting log...')

# Database handle shared by the stream listener; the connection string is
# read from settings.py.
db = dataset.connect(settings.CONNECTION_STRING)


class StreamListener(tweepy.StreamListener):
    """Listener that stores every non-retweet status it receives in the database."""

    def on_status(self, status):
        """Score one status with TextBlob and insert it into the tweets table.

        Statuses that carry a ``retweeted_status`` attribute are ignored.
        A ``ProgrammingError`` raised by the insert is logged as a warning
        and otherwise swallowed, so the stream keeps running.
        """
        # Guard clause: retweets are not stored.
        if hasattr(status, 'retweeted_status'):
            return

        # Use the extended payload's full text when the status has one,
        # otherwise fall back to the plain text attribute.
        try:
            text = status.extended_tweet['full_text']
        except AttributeError:
            text = status.text

        author = status.user
        row = dict(
            tweet_id=status.id_str,
            user_id=author.id_str,
            handle=author.screen_name,
            user_location=author.location,
            followers=author.followers_count,
            text=text,
            created_utc=status.created_at,
            favorites=status.favorite_count,
            retweets=status.retweet_count,
        )

        # Sentiment scores go last so the column order matches the table.
        sentiment = TextBlob(text).sentiment
        row['polarity'] = sentiment.polarity
        row['subjectivity'] = sentiment.subjectivity

        table = db[settings.TABLE_NAME]
        try:
            table.insert(row)
        except ProgrammingError as err:
            logging.warning(err)

    def on_error(self, status_code):
        """Return False for status code 420 (rate limiting); otherwise None.

        NOTE(review): tweepy is expected to treat a False return as a request
        to stop the stream -- confirm against the installed tweepy version.
        """
        rate_limited = status_code == 420
        if rate_limited:
            return False


if __name__ == '__main__':
    import time

    # Pause before reconnecting after the stream ends without an exception.
    # The listener's on_error returns False on HTTP 420 (rate limiting), which
    # ends stream.filter(); reconnecting immediately in a tight loop would keep
    # hitting the rate limit, so wait a minute first.
    RECONNECT_DELAY_SECONDS = 60

    # Authenticate using tokens defined in settings.py
    auth = tweepy.OAuthHandler(settings.TWITTER_APP_KEY,
                               settings.TWITTER_APP_SECRET)
    auth.set_access_token(settings.TWITTER_KEY, settings.TWITTER_SECRET)
    api = tweepy.API(auth)

    # Run forever: every pass opens a fresh stream filtered on the tracked
    # terms (English only).
    while True:
        try:
            stream_listener = StreamListener()
            stream = tweepy.Stream(auth=api.auth, listener=stream_listener,
                                   tweet_mode='extended')
            stream.filter(track=settings.TRACK_TERMS, languages=['en'],
                          stall_warnings=True)
        except ReadTimeoutError as err:
            # A read timeout is a transient network problem: log it and
            # reconnect right away.
            logging.warning(err)
            continue

        # filter() returned normally, i.e. the stream was closed on purpose
        # (for example after a 420). Back off before opening a new one.
        logging.warning('Stream disconnected; reconnecting in %s seconds',
                        RECONNECT_DELAY_SECONDS)
        time.sleep(RECONNECT_DELAY_SECONDS)


## Analyzing Twitter Data

Once we have compiled a dataset, we can begin our analysis.

In [16]:
"""Analysis

This module analyzes opioid-related tweets.

"""

import pandas as pd
import numpy as np
import psycopg2
import settings

# Read the whole tweets table into a DataFrame over a psycopg2 connection;
# both the connection settings and the table name come from settings.py.
conn = psycopg2.connect(settings.PSYCOPG2_CONNECTION)
df_all = pd.read_sql(f'select * from {settings.TABLE_NAME}', con=conn)

# favorites and retweets are always 0 in this data, so discard both columns
df_all = df_all.drop(['favorites', 'retweets'], axis=1)

df_all.shape

(127327, 10)

In [2]:
# Remove tweets without location
def clean_location(df):
    """Keep only tweets whose ``user_location`` is set and contains a comma.

    Prints how many rows have a location and how many of those contain a
    comma, each with a percentage. Percentages are reported as 0 when the
    denominator is 0, so empty input no longer raises ZeroDivisionError.

    Args:
        df: DataFrame with a ``user_location`` column.

    Returns:
        The rows of ``df`` whose ``user_location`` is non-null and contains
        a comma.
    """
    def _percent(part, whole):
        # Guard against an empty denominator (empty frame / no locations).
        return part / whole * 100 if whole else 0.0

    tweets_total = len(df)

    # Keep only tweets with user_locations
    located = df.loc[df['user_location'].notnull()]
    with_location = len(located)

    # Compute the comma mask once and reuse it for both the count and the
    # final filter. na=False treats non-string values as "no comma".
    if with_location:
        has_comma = located['user_location'].str.contains(",", na=False)
    else:
        # Nothing left to inspect; avoid the .str accessor on an empty,
        # possibly non-string column.
        has_comma = pd.Series(False, index=located.index, dtype=bool)
    with_location_comma = int(has_comma.sum())

    print(
        "Tweets with user location: {0} ({1:.0f}% of all tweets)\n"
        "Tweets with comma in user location: {2} ({3:.0f}% of tweets with location)".format(
            with_location, _percent(with_location, tweets_total),
            with_location_comma, _percent(with_location_comma, with_location))
    )

    # Keep only tweets where user_location contains a comma
    return located.loc[has_comma]

df_geo = clean_location(df_all)

Tweets with user location: 101828 (80% of all tweets)
Tweets with comma in user location: 65752 (65% of tweets with location)


In [3]:
# Lookup table from full U.S. state name to its two-letter postal
# abbreviation. It lists the 50 states only: there is no entry for the
# District of Columbia or for U.S. territories.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

In [43]:
# Split user location on the first two commas into up to three parts.
# expand=True yields one column per part; reindex guarantees that all three
# columns exist even when no location contains a second comma (missing parts
# are null). This replaces tuple-unpacking of the .str accessor and the
# positional `n` argument, neither of which works on current pandas.
location_parts = (
    df_geo['user_location']
    .str.split(',', n=2, expand=True)
    .reindex(columns=range(3))
)

# assign() returns a new frame, so the columns are not written into a
# filtered slice of the original data (avoids SettingWithCopyWarning).
df_geo = df_geo.assign(
    user_location1=location_parts[0],
    user_location2=location_parts[1],
    user_location3=location_parts[2],
)

# Alternative that creates single column with list of values
# Unable to figure out how to write replace so that it iterates through list
# df_geo['user_location_split'] = df_geo['user_location'].str.split(',')

# Remove leading and trailing whitespace
df_geo.user_location1 = df_geo.user_location1.str.strip()
df_geo.user_location2 = df_geo.user_location2.str.strip()
# Capitalize proper nouns and convert abbreviations to uppercase
def capitalize_location(location):
    """Normalise the capitalisation of one location fragment.

    Strings of three characters or fewer are treated as abbreviations and
    upper-cased; longer strings are title-cased. Anything that is not a
    ``str`` (None, NaN, ``pd.NA``, ...) is returned unchanged.

    The type check alone covers None, so there is no ``== None`` comparison;
    that comparison raised for values such as ``pd.NA`` or arrays, whose
    equality result has no unambiguous truth value.
    """
    if not isinstance(location, str):
        return location
    if len(location) <= 3:
        return location.upper()
    return location.title()
        
# Normalise capitalisation of the first two location parts
df_geo['user_location1'] = df_geo['user_location1'].apply(capitalize_location)
df_geo['user_location2'] = df_geo['user_location2'].apply(capitalize_location)

# The state is normally the second part. When the second part is "USA" the
# first part is usually a state name, so take that instead.
df_geo['state'] = df_geo['user_location2'].where(
    df_geo['user_location2'] != 'USA', df_geo['user_location1'])

# Translate full state names to postal abbreviations, then give the column
# its final name
df_geo = (
    df_geo
    .replace({'state': us_state_abbrev})
    .rename(columns={'state': 'state_abbrev'})
)

df_geo[['user_location', 'user_location1', 'user_location2', 'state_abbrev']].head()

Unnamed: 0,user_location,user_location1,user_location2,state_abbrev
1,"Toronto, Ontario",Toronto,Ontario,Ontario
3,"Detroit, MI",Detroit,MI,MI
5,"Gering, NE ➡️ Lincoln, NE",Gering,Ne ➡️ Lincoln,Ne ➡️ Lincoln
9,"New York, USA",New York,USA,NY
10,"Peterborough, Ontario",Peterborough,Ontario,Ontario


In [44]:
# Keep only rows where state is a U.S. postal abbreviation
def keep_usa(df):
    """Keep only rows whose ``state_abbrev`` is a U.S. postal abbreviation.

    Prints the number of matching rows and their share of the input. The
    share is reported as 0 for an empty frame, so empty input no longer
    raises ZeroDivisionError.

    Args:
        df: DataFrame with a ``state_abbrev`` column.

    Returns:
        The rows of ``df`` whose ``state_abbrev`` is one of the values of
        ``us_state_abbrev``.
    """
    tweets_initial = len(df)

    # Build the membership mask once and reuse it for the count and the filter.
    is_usa = df['state_abbrev'].isin(list(us_state_abbrev.values()))
    tweets_usa = int(is_usa.sum())
    pct_usa = tweets_usa / tweets_initial * 100 if tweets_initial else 0.0

    print(
        "Tweets with clean USA location: {0} ({1:.0f}% of tweets with comma-separated location)".format(
            tweets_usa, pct_usa)
    )

    return df.loc[is_usa]

df_usa = keep_usa(df_geo)
df_usa[['user_location', 'user_location1', 'user_location2', 'state_abbrev']].head()

Tweets with clean USA location: 51952 (79% of tweets with comma-separated location)


Unnamed: 0,user_location,user_location1,user_location2,state_abbrev
3,"Detroit, MI",Detroit,MI,MI
9,"New York, USA",New York,USA,NY
15,"Detroit, MI",Detroit,MI,MI
16,"Minnesota, USA",Minnesota,USA,MN
21,"Forest Lake, MN",Forest Lake,MN,MN


In [58]:
# Number of tweets per state, as a two-column frame (state_abbrev, count)
count_by_state = (
    df_usa
    .groupby('state_abbrev')
    .size()
    .reset_index(name='count')
)
count_by_state.head()

Unnamed: 0,state_abbrev,count
0,AK,80
1,AL,903
2,AR,219
3,AZ,540
4,CA,3433


In [65]:
# Load state population estimates published by the Census Bureau
# https://www2.census.gov/programs-surveys/popest/datasets/2010-2017/national/totals/nst-est2017-alldata.csv
# Only columns 1-4 and 14 are read; data rows 1-5 and the last 3 lines of the
# file are skipped.
state_pop = pd.read_csv(
    'data/state_population/nst-est2017-alldata.csv',
    usecols=[1, 2, 3, 4, 14],
    skiprows=[1, 2, 3, 4, 5],
    skipfooter=3,
    engine='python',
)

# Lower-case the headers, then rename the two that need clearer names
state_pop.columns = [header.lower() for header in state_pop.columns]
state_pop = state_pop.rename(columns={'state': 'FIPS', 'name': 'state_name'})

# Add the postal abbreviation and move it to sit right after the FIPS column
state_pop['state_abbrev'] = state_pop['state_name'].map(us_state_abbrev)
cols = list(state_pop.columns)
cols = [*cols[:3], cols[-1], *cols[3:-1]]
state_pop = state_pop[cols]

# Join with count_by_state (one row per state on each side) and express the
# tweet count per 100,000 residents
count_by_state_pop = pd.merge(state_pop, count_by_state,
                              on='state_abbrev', validate='1:1')
count_by_state_pop['count_per_100k'] = (
    count_by_state_pop['count']
    .div(count_by_state_pop['popestimate2017'])
    .mul(100000)
)
count_by_state_pop.head()

Unnamed: 0,region,division,FIPS,state_abbrev,state_name,popestimate2017,count,count_per_100k
0,3,6,1,AL,Alabama,4874747,903,18.524038
1,4,9,2,AK,Alaska,739795,80,10.813807
2,4,8,4,AZ,Arizona,7016270,540,7.696397
3,3,7,5,AR,Arkansas,3004279,219,7.289603
4,4,9,6,CA,California,39536653,3433,8.683082


In [69]:
import plotly.plotly as py
# Six-stop colour scale used for the per-state values
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]

# Single choropleth trace: one location per state, shaded by tweets per 100k
data = [{
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': count_by_state_pop['state_abbrev'],
    'z': count_by_state_pop['count_per_100k'],
    'locationmode': 'USA-states',
    'text': count_by_state_pop['state_name'],
    'marker': {
        'line': {
            'color': 'rgb(255,255,255)',
            'width': 2,
        },
    },
    'colorbar': {
        'title': "Tweets per 100,000",
    },
}]

# Map layout: USA scope with an Albers USA projection and white lakes
layout = {
    'title': 'Sample of Opioid-Related Tweets: Count by State<br>(Hover for Detail)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='state-cloropleth-tweets')

In [24]:
# Toy frame for trying out the groupby / size / reset_index counting pattern
test = pd.DataFrame({'state': ['KY'] * 3 + ['WV']})

test
tweets_by_state = (
    test
    .groupby('state')
    .size()
    .reset_index(name='counts')
)
print(tweets_by_state)

  state  counts
0    KY       3
1    WV       1
