In [None]:
categories = {
    "aa": ["aa", "alcoholic anonymous"],
    "acquisition": ["acquisition"],
    "addiction": ["addiction"],
    "alcohol" : ["alcohol", "henny", "henessy", "vodka"],
    "antidepressants": ["antidepressants"],
    "anxiolytics": ["anxiolytics"],
    "cannabis": ["cannabis"],
    "chem_and_bio": ["chem_and_bio"],
    "comfort": ["comfort"],
    "counseling": ["counseling"],
    "darknet": ["darknet"],
    "delivery": ["delivery"],
    "depressants": ["depressants"],
    "discomfort": ["discomfort"],
    "drug": ["drug"],
    "drug_paraphrenalia": ["paraphrenalia"],
    "drug_quantity": ["drug quantity"],
    "drug_users": ["drug users"],
    "drunk": ["drunk"],
    "effects": ["effects"],
    "energetic": ["energetic"],
    "euphoria": ["euphoria", "euphoric"],
    "finance": ["finance"],
    "hallucinogens": ["hallucinogens", "mushrooms", "lsd"],
    "health": ["health"],
    "hospital": ["hospital"],
    "idu": ["idu"],
    "increased": ["increased"],
    "legal": ["legal"],
    "locations": ["locations", "street corner"],
    "meditation": ["meditation"],
    "mental": ["mental"],
    "nooptropics": ["nooptropics", "nootropics"],
    "numbness": ["numbness"],
    "opioids": ["opioids", "oxycodone", "oxy", "heroin", "morphine"],
    "oral": ["oral", "popping"],
    "overdose": ["overdose", "OD"],
    "physical": ["physical"],
    "physical_withdrawal_symptoms": ["physical withdrawal symptoms"],
    "prescription": ["prescription"],
    "psychedelic": ["psychedelic"],
    "pschological_withdrawal_symptoms": ["pschological withdrawal symptoms"],
    "quitting": ["quitting"],
    "recovery": ["recovery"],
    "recovery_support": ["recovery support"],
    "rehab" : ["rehab", "rehabilitation"],
    "relapse": ["relapse"],
    "seizure": ["seizure"],
    "smoking": ["smoking", "cigarette", "cigs", "squares"],
    "stimulants": ["stimulants", "stims"],
    "street": ["street"],
    "supplements": ["supplements"],
    "therapy": ["therapy"],
    "tobacco": ["tobacco"],
    "tolerance": ["tolerance"],
    "using": ["using", "shooting"],
    "withdrawal": ["withdrawal"]
}


In [2]:
import Stemmer
import pandas as pd

# Use english stemmer as all keywords are in english
stemmer = Stemmer.Stemmer('en')

dfs = []
for key, values in categories.items():
    words = pd.DataFrame({'category': key, 'term': stemmer.stemWords(values)})
    dfs.append(words)
    
terms_df = pd.concat(dfs)
terms_df

Unnamed: 0,category,term
0,aa,aa
1,aa,alcoholic anonym
0,acquisition,acquisit
0,addiction,addict
0,alcohol,alcohol
...,...,...
0,tobacco,tobacco
0,tolerance,toler
0,using,use
1,using,shoot


In [3]:
tweet_db_path = "../database.json"

df = pd.read_json(tweet_db_path)

# filtering out "at's" and http links
for index, row in df.iterrows():
    filtered_tweet_words = []

    for word in row["text"].split(' '):
        if word.startswith("@") and len(word) > 1:
            word = ""
        elif word.startswith("http"):
            word = ""
        filtered_tweet_words.append(word)
    tweet = " ".join(filtered_tweet_words)
    df.at[index, "text"] = tweet

df


Unnamed: 0,id,text,search_term
0,1500998000101011456,an oxycodone prescription is a dangerous thing...,oxycodone
1,1500995581342216192,trust me they are not bankrupt they claim ban...,oxycodone
2,1500995007909535744,Large quantity of oxycodone seized during Kaml...,oxycodone
3,1500995004856107008,Large quantity of oxycodone seized during Kaml...,oxycodone
4,1500995002985373696,Large quantity of oxycodone seized during Kaml...,oxycodone
...,...,...,...
2633,1514640367592914944,"It will only cost you 2 Vicodin, an 8 ball an...",Vicodin
2634,1514632415427321856,,Vicodin
2635,1514630579651764224,I have done two brilliant things i...,Vicodin
2636,1514625270568869888,“Please do not discuss other pills outside of ...,Vicodin


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

stemmer = Stemmer.Stemmer('en')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: stemmer.stemWords([w for w in analyzer(doc)])

term_list = list(terms_df.term)

vectorizer = StemmedCountVectorizer(binary = True, vocabulary = term_list)
fitted = vectorizer.fit_transform(df.text)
words_df = pd.DataFrame(fitted.toarray(), columns = vectorizer.get_feature_names())
words_df.head()

Unnamed: 0,aa,alcoholic anonym,acquisit,addict,alcohol,henni,henessi,vodka,antidepress,anxiolyt,...,stimul,stim,street,supplement,therapi,tobacco,toler,use,shoot,withdraw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
for category_name, rows in terms_df.groupby('category'):
    terms = list(rows['term'])
    print(f"category: {category_name}\nstems: {terms}\n")

    df[category_name] = words_df[terms].any(axis=1).astype(int)

category: aa
stems: ['aa', 'alcoholic anonym']

category: acquisition
stems: ['acquisit']

category: addiction
stems: ['addict']

category: alcohol
stems: ['alcohol', 'henni', 'henessi', 'vodka']

category: antidepressants
stems: ['antidepress']

category: anxiolytics
stems: ['anxiolyt']

category: cannabis
stems: ['cannabi']

category: chem_and_bio
stems: ['chem_and_bio']

category: comfort
stems: ['comfort']

category: counseling
stems: ['counsel']

category: darknet
stems: ['darknet']

category: delivery
stems: ['deliveri']

category: depressants
stems: ['depress']

category: discomfort
stems: ['discomfort']

category: drug
stems: ['drug']

category: drug_paraphrenalia
stems: ['paraphrenalia']

category: drug_quantity
stems: ['drug quant']

category: drug_users
stems: ['drug us']

category: drunk
stems: ['drunk']

category: effects
stems: ['effect']

category: energetic
stems: ['energet']

category: euphoria
stems: ['euphoria', 'euphor']

category: finance
stems: ['financ']

categor

In [6]:
# 501st tweet in the list contains one instance of the word "Cannabis", capitalized
df['cannabis'][501]

1

In [7]:
df['text'][501]

'At the very least reschedule!\n\nAccording to the DEAs "science" the following drugs are all LESS dangerous than Cannabis &amp; Peyote:\n\nCocaine\nFentanyl\nMeth\nMethadone\nOxycodone  '

In [12]:
# tweet we're using as an example
test_tweet = df.at[123, "text"]
test_tweet

'Im too drunk rn but I need 5 more shots with oxycodone'

In [None]:
# Then, perform sentiment analysis on the tweets to figure out if there is a
# correlation between these keywords (from DUI) and a positive vs negative
# sentiment

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

pretrained_model = open("pretrained_model").read()

model = AutoModelForSequenceClassification.from_pretrained(pretrained_model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

labels = ["negative", "neutral", "positive"]

for index, row in df.iterrows():
#     print(row["text"])
    print(row)
    encoded_tweet = tokenizer(row["text"], return_tensors='pt')

    output = model(**encoded_tweet)

    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    for i in range(len(scores)):
        l = labels[i]
        s = scores[i]
        print(l, s)
        
    print("\n")

an oxycodone prescription is a dangerous thing for a woman like me 2 have…but i have it
negative 0.79084826
neutral 0.19534484
positive 0.013806904


 trust me they are not bankrupt they claim bankruptcy they’ll just produce their drugs under another name there are still making oxycodone
negative 0.44478217
neutral 0.50071573
positive 0.054502007


Large quantity of oxycodone seized during Kamloops traffic stop #kamloops  
negative 0.6585352
neutral 0.31950897
positive 0.021955902


Large quantity of oxycodone seized during Kamloops traffic stop #kamloops  
negative 0.6585352
neutral 0.31950897
positive 0.021955902


Large quantity of oxycodone seized during Kamloops traffic stop #kamloops  
negative 0.6585352
neutral 0.31950897
positive 0.021955902


Large quantity of oxycodone seized during Kamloops traffic stop #kamloops  
negative 0.6585352
neutral 0.31950897
positive 0.021955902


 This is just norco in liquid form . So oxycodone is technically stronger . But the tuss is still stu

negative 0.81766015
neutral 0.17053328
positive 0.011806534


A police investigation resulted in the seizure of suspected cocaine and oxycodone, among other evidence. 
negative 0.2686894
neutral 0.6848403
positive 0.046470243


$35,654 cash, 5,100 oxycodone pills, 3 pounds of methamphetamine, 1½ pounds of black tar heroin, a quarter pound of cocaine and a hangun found on man during arrest. Shoot, a feller could have a pretty good time passed out in car with all that…   [Fark]
negative 0.14757247
neutral 0.48916087
positive 0.3632666


"Telling anyone in an ER that you can take hydromorphone but not hydrocodone or oxycodone is guaranteed to get you an eyeroll and an assessment as a junkie. The fact that this can be easily verified through my medical records (or google) does not help my cause.
negative 0.8457497
neutral 0.13898107
positive 0.015269293


A 37 year old cyclist with HTN sustains a mid shaft humerus fracture and undergoes surgical fixation. He is managed inpatient with oxyco

negative 0.74877733
neutral 0.19941384
positive 0.05180875


A ella le gusta mezclar codeína y oxycodone
negative 0.13757667
neutral 0.8002232
positive 0.062200017


 I've seen fentanyl pressed into counterfeit Xanax bars and oxycodone pills that looked less sketchy than your pills there.
negative 0.6774968
neutral 0.30093658
positive 0.021566601


 I regularly hit my allotments for hydrocodone and oxycodone so I can’t  take any new pain mgt pts until one drops off .
negative 0.67195725
neutral 0.30248064
positive 0.025562074


You don't take someone with osteoporosis &amp; 2 fractures in his back off oxycodone and put him on tramadol &amp; muscle  still hurts and the pain &amp; stress got me smoking with COPD.thanks 🖕
negative 0.93372065
neutral 0.060191963
positive 0.006087419


summrs could've made changes but tupac could've never made oxycodone 2. and yeat could've made harambe but he couldn't have made you said or feel it or daddy’s birthday or relationship, at least rn, until his