In [17]:
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Image

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction import text
from tqdm import tqdm, tqdm_notebook

import json

In [13]:
df = pd.read_csv('../data/opioid_tweets_label.csv').drop(columns = ["Unnamed: 0"])
bad_tweets = pd.read_csv('../bad_tweets.txt', header = None)

In [14]:
df

Unnamed: 0,id,tweet_id,content,created_at,fav_count,url_present,user_name,followers_count,friends_count,user_description,label
0,1,1.182370e+18,"Boston Police, Public Health Officials To Trea...",10/10/19,662.0,True,SaraCarterDC,836793.0,4656.0,"@FoxNews Contributor, award winning National S...",0
1,2,1.182120e+18,Three #Chinese nationals were charged last wee...,10/10/19,302.0,True,EpochTimes,134594.0,102.0,"An independent, award-winning voice in print &...",0
2,3,1.182360e+18,Three #Chinese nationals were charged with imp...,10/10/19,164.0,True,EpochTimes,134594.0,102.0,"An independent, award-winning voice in print &...",0
3,4,1.182720e+18,Boston is using a chemical warfare device to h...,10/11/19,0.0,True,BUSPH,27642.0,2202.0,The official Twitter of Boston University Scho...,0
4,5,1.182720e+18,This makes no sense given what President Trump...,10/11/19,0.0,True,FlagHiApp,1893.0,4547.0,"FlagHi™ calculates how temperature, elevation ...",0
...,...,...,...,...,...,...,...,...,...,...,...
42933,588727,1.187910e+18,Thanks....haven't got Motrin PM..trying Naprox...,10/26/19,2.0,True,CarolMc29382003,3729.0,4996.0,#Trump2020#MAGA#KAGA#NRA.No DM NO Dating.. don...,0
42934,588728,1.187910e+18,@thistallawkgirl One year my husband dressed u...,10/26/19,6.0,True,beatalley,1738.0,4035.0,Beat Alley-Denver's Music Webzine - Vintage M...,0
42935,588729,1.187910e+18,I fractured my growth plate when I was 12 and ...,10/26/19,1.0,True,depressedloc,349.0,341.0,Scientist and minor Prophet #FreeSanchez #Free...,0
42936,588730,1.187910e+18,@_Daks_ Vicodin messed me the fuck up. Like fo...,10/26/19,0.0,False,Road_Block,910.0,365.0,"Gamer, podcaster, JMM on DungeonDrunks! RT Si...",0


In [19]:
# Creating a list of stopwords
stops = stopwords.words('english')
# Add stop variants without single quotes
no_quotes = [re.sub(r'\'','',word) for word in stops if "'" in word]
my_stop_words = ["codeine", "hydrocodone", "morphine", "oxycodone", "hydromorphone", "fentanyl", "oxycontin", "vicodin", "percocet"]
stops.extend(no_quotes)
stops.extend(my_stop_words)
def clean_string(string):
    # remove HTML entities
    temp = re.sub(r'\&\w*;','', string)
    # remove @user
    temp = re.sub(r'@(\w+)','', temp)
    # remove links
    temp = re.sub(r'(http|https|ftp)://[a-zA-Z0-9\\./]+','', temp)
    # lowercase
    temp = temp.lower()
    # remove hashtags
#     temp = re.sub(r'#(\w+)','', temp)
    # remove repeating characters
    temp = re.sub(r'(.)\1{1,}',r'\1\1', temp)
    # remove non-letters
    temp = re.sub("[^a-zA-Z]"," ", temp)
    # remove anything that is less than two characters
    temp = re.sub(r'\b\w{1,2}\b','',temp)
    # remove multiple spaces
    temp = re.sub(r'\s\s+', ' ', temp)
    return temp

def str_preprocess(string):
    stemmer = PorterStemmer()
    # removing punctuation
    removed_punc = ''.join([char for char in string if char not in punctuation])
    # removing stopwords
    cleaned = [stemmer.stem(word.lower()) for word in removed_punc.split(' ') if word not in stops]
    return ' '.join(cleaned)

In [20]:
#my_stop_words = text.ENGLISH_STOP_WORDS.union(["codeine", "hydrocodone", "morphine", "oxycodone", "hydromorphone", "fentanyl", "oxycontin", "vicodin", "percocet"])
docs = df.content.astype(str)
cleaned_frame = docs.apply(clean_string).apply(str_preprocess)
td_idf_vec = TfidfVectorizer(stop_words=my_stop_words, max_features = 20000)
X = td_idf_vec.fit_transform(cleaned_frame)
X_norm = normalize(X)
X_arr = X_norm.toarray()

In [28]:
df_x = df.drop(columns = ["id","tweet_id","created_at", "user_name", "user_description", "url_present", "content"])
df_x.append(pd.DataFrame(X_arr))

Unnamed: 0,fav_count,followers_count,friends_count,label,0,1,2,3,4,5,...,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,662.0,836793.0,4656.0,0.0,,,,,,,...,,,,,,,,,,
1,302.0,134594.0,102.0,0.0,,,,,,,...,,,,,,,,,,
2,164.0,134594.0,102.0,0.0,,,,,,,...,,,,,,,,,,
3,0.0,27642.0,2202.0,0.0,,,,,,,...,,,,,,,,,,
4,0.0,1893.0,4547.0,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42933,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42934,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42935,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42936,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
df_x_sparse = pd.DataFrame(X_arr)
y = df.label

In [2]:
!pip install xgboost

Collecting xgboost
  Using cached https://files.pythonhosted.org/packages/96/84/4e2cae6247f397f83d8adc5c2a2a0c5d7d790a14a4c7400ff6574586f589/xgboost-0.90.tar.gz
[31m    ERROR: Command errored out with exit status 1:
     command: /opt/anaconda3/bin/python -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/7l/52r8lbh106ngzkc_bp3w5gvh0000gn/T/pip-install-elqdiziw/xgboost/setup.py'"'"'; __file__='"'"'/private/var/folders/7l/52r8lbh106ngzkc_bp3w5gvh0000gn/T/pip-install-elqdiziw/xgboost/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base pip-egg-info
         cwd: /private/var/folders/7l/52r8lbh106ngzkc_bp3w5gvh0000gn/T/pip-install-elqdiziw/xgboost/
    Complete output (721 lines):
    ++ pwd
    + oldpath=/private/var/folders/7l/52r8lbh106ngzkc_bp3w5gvh0000gn/T/pip-install-elqdiziw/xgboost
    + cd ./xgboost/
    + echo d

                 ^
    In file included from src/c_api/c_api.cc:3:
    In file included from include/xgboost/data.h:11:
    In file included from dmlc-core/include/dmlc/data.h:14:
    In file included from dmlc-core/include/dmlc/./io.h:12:
    In file included from /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/istream:164:
    In file included from /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/ostream:138:
    In file included from /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/ios:216:
    In file included from /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/__locale:18:
    In file included from /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/mutex:191:
    In file included from /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/__mutex_base:17:
    /Library/Developer/CommandLineTools/usr/bin/../include/c++/v1/__threading_support:360:13: error: variable has incomplete type 'timespec'
   

In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_x_sparse, y, test_size=0.2, random_state=123)

ModuleNotFoundError: No module named 'xgboost'