In [7]:
categories = {
    "aa": ["aa", "alcoholic anonymous"],
    "acquisition": ["acquisition"],
    "addiction": ["addiction"],
    "alcohol" : ["alcohol", "henny", "henessy", "vodka"],
    "antidepressants": ["antidepressants"],
    "anxiolytics": ["anxiolytics"],
    "cannabis": ["cannabis"],
    "chem_and_bio": ["chem_and_bio"],
    "comfort": ["comfort"],
    "counseling": ["counseling"],
    "darknet": ["darknet"],
    "delivery": ["delivery"],
    "depressants": ["depressants"],
    "discomfort": ["discomfort"],
    "drug": ["drug"],
    "drug_paraphrenalia": ["paraphrenalia"],
    "drug_quantity": ["drug quantity"],
    "drug_users": ["drug users"],
    "drunk": ["drunk"],
    "effects": ["effects"],
    "energetic": ["energetic"],
    "euphoria": ["euphoria", "euphoric"],
    "finance": ["finance"],
    "hallucinogens": ["hallucinogens", "mushrooms", "lsd"],
    "health": ["health"],
    "hospital": ["hospital"],
    "idu": ["idu"],
    "increased": ["increased"],
    "legal": ["legal"],
    "locations": ["locations", "street corner"],
    "meditation": ["meditation"],
    "mental": ["mental"],
    "nooptropics": ["nooptropics", "nootropics"],
    "numbness": ["numbness"],
    "opioids": ["opioids", "oxycodone", "oxy", "heroin", "morphine"],
    "oral": ["oral", "popping"],
    "overdose": ["overdose", "OD"],
    "physical": ["physical"],
    "physical_withdrawal_symptoms": ["physical withdrawal symptoms"],
    "prescription": ["prescription"],
    "psychedelic": ["psychedelic"],
    "pschological_withdrawal_symptoms": ["pschological withdrawal symptoms"],
    "quitting": ["quitting"],
    "recovery": ["recovery"],
    "recovery_support": ["recovery support"],
    "rehab" : ["rehab", "rehabilitation"],
    "relapse": ["relapse"],
    "seizure": ["seizure"],
    "smoking": ["smoking", "cigarette", "cigs", "squares"],
    "stimulants": ["stimulants", "stims"],
    "street": ["street"],
    "supplements": ["supplements"],
    "therapy": ["therapy"],
    "tobacco": ["tobacco"],
    "tolerance": ["tolerance"],
    "using": ["using", "shooting"],
    "withdrawal": ["withdrawal"]
}


In [8]:
import Stemmer
import pandas as pd

# Use english stemmer as all keywords are in english
stemmer = Stemmer.Stemmer('en')

dfs = []
for key, values in categories.items():
    words = pd.DataFrame({'category': key, 'term': stemmer.stemWords(values)})
    dfs.append(words)
    
terms_df = pd.concat(dfs)
terms_df

Unnamed: 0,category,term
0,aa,aa
1,aa,alcoholic anonym
0,acquisition,acquisit
0,addiction,addict
0,alcohol,alcohol
...,...,...
0,tobacco,tobacco
0,tolerance,toler
0,using,use
1,using,shoot


In [9]:
tweet_db_path = "../database.json"

df = pd.read_json(tweet_db_path)

# filtering out "at's" and http links
for index, row in df.iterrows():
    filtered_tweet_words = []

    for word in row["text"].split(' '):
        if word.startswith("@") and len(word) > 1:
            word = ""
        elif word.startswith("http"):
            word = ""
        filtered_tweet_words.append(word)
    tweet = " ".join(filtered_tweet_words)
    df.at[index, "text"] = tweet

df


Unnamed: 0,id,text,search_term
0,1500998000101011456,an oxycodone prescription is a dangerous thing...,oxycodone
1,1500995581342216192,trust me they are not bankrupt they claim ban...,oxycodone
2,1500995007909535744,Large quantity of oxycodone seized during Kaml...,oxycodone
3,1500995004856107008,Large quantity of oxycodone seized during Kaml...,oxycodone
4,1500995002985373696,Large quantity of oxycodone seized during Kaml...,oxycodone
...,...,...,...
2633,1514640367592914944,"It will only cost you 2 Vicodin, an 8 ball an...",Vicodin
2634,1514632415427321856,,Vicodin
2635,1514630579651764224,I have done two brilliant things i...,Vicodin
2636,1514625270568869888,“Please do not discuss other pills outside of ...,Vicodin


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

stemmer = Stemmer.Stemmer('en')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: stemmer.stemWords([w for w in analyzer(doc)])

term_list = list(terms_df.term)

vectorizer = StemmedCountVectorizer(binary = True, vocabulary = term_list)
fitted = vectorizer.fit_transform(df.text)
words_df = pd.DataFrame(fitted.toarray(), columns = vectorizer.get_feature_names())
words_df.head()



Unnamed: 0,aa,alcoholic anonym,acquisit,addict,alcohol,henni,henessi,vodka,antidepress,anxiolyt,...,stimul,stim,street,supplement,therapi,tobacco,toler,use,shoot,withdraw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
for category_name, rows in terms_df.groupby('category'):
    terms = list(rows['term'])
    print(f"category: {category_name}\nstems: {terms}\n")

    df[category_name] = words_df[terms].any(axis=1).astype(int)

category: aa
stems: ['aa', 'alcoholic anonym']

category: acquisition
stems: ['acquisit']

category: addiction
stems: ['addict']

category: alcohol
stems: ['alcohol', 'henni', 'henessi', 'vodka']

category: antidepressants
stems: ['antidepress']

category: anxiolytics
stems: ['anxiolyt']

category: cannabis
stems: ['cannabi']

category: chem_and_bio
stems: ['chem_and_bio']

category: comfort
stems: ['comfort']

category: counseling
stems: ['counsel']

category: darknet
stems: ['darknet']

category: delivery
stems: ['deliveri']

category: depressants
stems: ['depress']

category: discomfort
stems: ['discomfort']

category: drug
stems: ['drug']

category: drug_paraphrenalia
stems: ['paraphrenalia']

category: drug_quantity
stems: ['drug quant']

category: drug_users
stems: ['drug us']

category: drunk
stems: ['drunk']

category: effects
stems: ['effect']

category: energetic
stems: ['energet']

category: euphoria
stems: ['euphoria', 'euphor']

category: finance
stems: ['financ']

categor

In [14]:
# 501st tweet in the list contains one instance of the word "Cannabis", capitalized
df['cannabis'][501]

1

In [15]:
df['text'][501]

'At the very least reschedule!\n\nAccording to the DEAs "science" the following drugs are all LESS dangerous than Cannabis &amp; Peyote:\n\nCocaine\nFentanyl\nMeth\nMethadone\nOxycodone  '

In [16]:
# tweet we're using as an example
test_tweet = df.at[123, "text"]
test_tweet

'Im too drunk rn but I need 5 more shots with oxycodone'

In [34]:
# Then, perform sentiment analysis on the tweets to figure out if there is a
# correlation between these keywords (from DUI) and a positive vs negative
# sentiment

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

pretrained_model = open("pretrained_model").read()

model = AutoModelForSequenceClassification.from_pretrained(pretrained_model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

labels = ["negative", "neutral", "positive"]

data = {}
for index, row in df.head(5).iterrows():
#     print(row["text"])
    # print(row[3:])
    encoded_tweet = tokenizer(row["text"], return_tensors='pt')

    output = model(**encoded_tweet)

    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    for i in range(len(scores)):
        l = labels[i]
        s = scores[i]

    for i in row[3:]:
        print(index)
        # if i > 0:
        #     if data[i]:
        #         data[i] = scores[2]
        #     else:
        #         data[i] += scores[2]
            
        # print(l, s)
        
    # print("\n")

print(data)

TypeError: cannot unpack non-iterable int object