In [70]:
import pandas as pd
import numpy as np
import re
import nltk 
import sklearn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes.BernoulliNB import BernoulliNB

## Load data

In [8]:
data = pd.read_excel('../data/database/hcc_tweets_train_label.xlsx')
data.columns

Index(['tweet_id', 'created_at', 'tweet', 'relevant to liver disease/cancer',
       'anecdote/experiential',
       'cites academic study or credible organization or doctor',
       'claim about liver cancer cure/cause', 'truth of claim',
       'contains link', 'retweet', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'in_reply_to_screen_name', 'geo',
       'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'user_id',
       'username', 'screen_name', 'user_location', 'user_description',
       'user_url', 'followers_count', 'friends_count', 'listed_count',
       'user_favorites_count', 'statuses_count', 'user_lang', 'location',
       'place'],
      dtype='object')

In [68]:
# Split into training and test sets
X = data['tweet']
X.dropna(inplace=True)
y = data['relevant to liver disease/cancer']
y = y[:2009]
y.fillna(0, inplace=True)

In [60]:
# Check for non-string values
X[X.apply(lambda x: type(x)) != str]

Series([], Name: tweet, dtype: object)

## Text pre-processing

In [61]:

def preprocess_text(row, stemmer, stopwords):
    row = row.split()
    result = [stemmer.stem(word) for word in row if word not in stopwords]
    result = ' '.join(result).lower() # Convert to lowercase string
    return result

preprocess_text(X[0], stemmer, words)

'rt @undarkmag: the esophag line can’t toler repeat exposur stomach acid begin change, cell cell. division…'

In [62]:
stemmer = PorterStemmer()
words = stopwords.words("english")

X = X.apply(lambda x: preprocess_text(x, stemmer, words))
X


0       rt @undarkmag: the esophag line can’t toler re...
1       rt @lqstrengthcoach: pleas keep mom prayers. h...
2       et1402l1-artemis™2 t cell afp express hepatoce...
3       rt @earthjustice: each day chemic permit job e...
4       doe aspirin reduc risk liver cancer patient ch...
5       @cash__9 becaus liver made hepatocyt cell endo...
6       rt @earthjustice: each day chemic permit job e...
7       eat more okra: it help control diabetes, rever...
8       @buffa82 cancer suck indeed. lost dear friend ...
9       @theellenshow @cheerio 2 2 abd addit want live...
10      eat more okra: it help control diabetes, rever...
11      rt @jennycohn1: 2/ example: "a studi done show...
12      *sighs* so liver level high could xome oral ch...
13      rt @dr_novchinsky: honor present behalf americ...
14      a video new invention, 20 twentea:cancer, diab...
15      my stepfath put hospice. he liver cancer stage...
16      rt @essonews: @essonew support 13th intern liv...
17      have e

## Create vectorizer

In [63]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(X).toarray()
final_features.shape

(2009, 3089)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25)

# instead of doing these steps one at a time, we can use a pipeline to complete them all at once
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', RandomForestClassifier())])

# fitting our model and save it in a pickle for later use
model = pipeline.fit(X_train, y_train)
# with open('RandomForest.pickle', 'wb') as f:
#     pickle.dump(model, f)
ytest = np.array(y_test)

# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, model.predict(X_test)))
print(confusion_matrix(ytest, model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97       415
         1.0       0.89      0.83      0.86        88

   micro avg       0.95      0.95      0.95       503
   macro avg       0.93      0.90      0.92       503
weighted avg       0.95      0.95      0.95       503

[[406   9]
 [ 15  73]]




In [None]:
## Identify features