In [1]:
import numpy as np 
from sklearn.externals import joblib
import pandas as pd
import scipy as sp
import re 



In [2]:
tfidf_matrix = joblib.load("../clustering/tfidf_matrix.pkl")

In [3]:
tfidf_matrix = sp.sparse.csr_matrix.toarray(tfidf_matrix) # In order to keep the integrity of the data this must be used before
#converting it to a data frame 

In [4]:
df = pd.read_csv("../Cleaning/opioid_tweets_clean.csv")

In the data frame is a link told us the page was suspended we input true in the "to_twitter" column. Since they have been removed and analytically, we observed they to have links that pointed outside of twitter before they have been removed we if "suspended" is the "unshortened_url" then we will change its value in "to_twitter" as false

In [5]:
def twitter(url):
    to_twitter = re.compile(r"https?://twitter.com[\.A-Za-z0-9/]*\s*")
    
    if type(url) != bool:
        if url == "Suspended":
           return False
        elif url == "N/A":
           return True 
        elif url == "Redo":
           return True
        r = to_twitter.findall(url)   
        if len(r) != 0:        
           return True
        else:
           return False
    else:
         return url

In [6]:
full_url = df["unshortened_url"].fillna(True)
full_url.value_counts()

True                                                                                                             17274
False                                                                                                             2602
Suspended                                                                                                          691
https://twitter.com/                                                                                               235
https://www.nytimes.com/2019/10/16/magazine/china-fentanyl-drug-ring.html                                          135
                                                                                                                 ...  
https://twitter.com/i/web/status/1180538837996638208                                                                 1
https://www.edmondsbeacon.com/story/2019/10/10/schools/school-district-issues-advisory-on-fentanyl/22143.html        1
https://twitter.com/i/web/status/118493874669904

In [7]:
to_twitter = []
for i in full_url:
    to_twitter.append( twitter(i) )

In [8]:
to_twitter = pd.Series(to_twitter)

In [9]:
to_twitter.value_counts()

True     36234
False     6702
dtype: int64

Notice that True indicate that it lead to twitter and False means it does not lead to twitter. Since it was observed that tweets selling illicit drugs had links that lead out of twitter... we should swap the boolean values

In [10]:
for i in range(len(to_twitter)):
    if to_twitter[i] == False:
        to_twitter[i] = 1
    else:
        to_twitter[i] = 0

In [11]:
to_twitter.value_counts()

False    36234
True      6702
dtype: int64

Now they have been switched, we can go ahead and use it in a classifier

In [12]:
to_twitter = to_twitter*1

In [13]:
df["to_twitter"] = to_twitter

In [14]:
with open("../bad_tweets.txt", "r") as f:
    bad_ids = [i.strip() for i in f.read().split("\n")]

In [15]:
def is_bad(id):
    if str(id) in bad_ids:
        return True
    else:
        return False

In [16]:
df["labels"] = df["id"].apply(lambda x: is_bad(x))


In [17]:
df["labels"].value_counts()

False    42874
True        62
Name: labels, dtype: int64

In [18]:
y = df["labels"]
y = y*1

In [19]:
y.value_counts()

0    42874
1       62
Name: labels, dtype: int64

Now we have the extra attributes accounted for we go ahead and do the . In order to combine the two with numpy hstack they both need to be numpy array, thus we need to convert the to_twitter series to an array

In [20]:
type(tfidf_matrix)

numpy.ndarray

In [21]:
to_twitter = to_twitter.to_numpy()

In [22]:
something = []
for i in to_twitter:
    something.append(i)

In [23]:
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
x = pd.DataFrame(tfidf_matrix)

In [25]:
x["to_twitter"] = to_twitter

In [26]:
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20082,20083,20084,20085,20086,20087,20088,20089,20090,to_twitter
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
42932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
42933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
42934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [31]:
from sklearn.ensemble.forest import RandomForestClassifier #Be caresful with the  delcartion. 
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

In [35]:
%timeit classifier.fit(X_train, y_train) # In order to proceed we need something to classify our text with, in the exapmle used they
#use the use y which contains 0 or 1. 
#It is possible to create script and flag the tweets we manully identified to have illicit drugs. 
#What else would we compare this against 

17min 27s ± 4.63 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
y_pred = classifier.predict(X_test)

In [37]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[8579    0]
 [   4    5]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8579
           1       1.00      0.56      0.71         9

    accuracy                           1.00      8588
   macro avg       1.00      0.78      0.86      8588
weighted avg       1.00      1.00      1.00      8588

0.999534233814625


In the end, the extra column lead to a decrease in the recall score. 