In [18]:
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec
from tqdm import tqdm
from sklearn import utils
from sklearn.metrics import accuracy_score, f1_score
tqdm.pandas(desc="progress-bar")
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

### Display Options for DFs

In [2]:
pd.set_option('display.max_colwidth', -1)

# Functions

In [3]:
# clean tweets from URLs and Mentions
def clean_text(text):
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)
    text = text.lower()
    return text

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

def add_sentiment(text, text2):
    if text2 == 4:
        return text + " positive"
    else:
        return text + " negative"
    
def feature_eng(tweet):
    step1 = re.sub(r'@([^:\s]+)', r'__MENTION__', tweet)
    output = re.sub(r'(https?://[^\s]+)', r'__URL__', step1)
    return output

# Research Data

In [4]:
import glob 

path = 'refilteredtweets/*.csv'
colnames = ['tweets', 'FP']

xlists = []
ylists = []

for fname in glob.glob(path):
    tweets = pd.read_csv(fname, skipinitialspace=True, usecols=colnames)
#     tweets['tweets'] = tweets['tweets'].apply(clean_text)
    xlists.append(tweets['tweets'].tolist())
    ylists.append(tweets['FP'].tolist())
    
    
X = [x for xlist in xlists for x in xlist] # the element, then the outer loop first, then inner loop
y = [y if y == 1.0 else 0.0 for ylist in ylists for y in ylist]

X = [feature_eng(x) for x in X]

print(len(X), len(y))

643 643


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [6]:
X_train[:10]

['Protect yourself from Zika! Wear protective clothing &amp; use approved repellents when outside. #Zika #ProtectYourself __URL__',
 'Got bit by a mosquito. Now all I can think of is #zika.  My brain.  sigh...',
 'Take some simple steps 2 protect yourself from the #Zika virus. __URL__ __MENTION__ __MENTION__ __URL__',
 'Take just 2 teaspoons of 100% natural RepelZika and your entire body is protected from mosquitoes.   #RepelZika… __URL__',
 'Fighting Zika in the US: The Battle Over GMO Mosquitoes - ABC News - __URL__ via __MENTION__',
 "Keep spraying all those toxic pesticides &amp; you'll kill more people than Zika will. __MENTION__ __MENTION__ __URL__",
 'Protect your family and pets from diseases like West Nile Virus, Zika Virus, and Malaria by getting your yard sprayâ\x80¦ __URL__',
 'Get the real scoop on #zika and what action you can take to protect yourself and others: __URL__',
 'See: Spike in sales of mosquito repellent products amid Zika concerns __URL__ __URL__',
 'Take a l

# Pipeline Model

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn.pipeline import FeatureUnion

In [8]:
pipeline = Pipeline([('vect',CountVectorizer(max_df=0.5, ngram_range = (1,2), lowercase = True)),
#                     ('tfidf',TfidfTransformer(norm='l1', use_idf=True)),
                    ('MNB',MultinomialNB(fit_prior=False))])
#                     ('LogReg', LogisticRegression(penalty='l1'))])

In [9]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

scoring = ['precision_macro', 'recall_macro']

scores = cross_validate(pipeline, X, y, scoring=scoring, cv=5, return_train_score=True)

for k, v in scores.items():
    print(k,":", v)
    print()

fit_time : [0.03010821 0.02455068 0.06723499 0.02423477 0.02455854]

score_time : [0.01297736 0.00976539 0.00967407 0.00972581 0.00927305]

test_precision_macro : [0.74634503 0.77284946 0.79069767 0.79282151 0.79175258]

train_precision_macro : [0.98846913 0.99402985 0.9942407  0.99255952 0.99552239]

test_recall_macro : [0.76479832 0.73926139 0.78156103 0.77671556 0.7300813 ]

train_recall_macro : [0.98604989 0.98907104 0.99302495 0.9863388  0.99184783]



In [10]:
parameters = {
    'vect__lowercase': ('True', 'False'),   
}
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
parameters:
{'vect__lowercase': ('True', 'False')}
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


done in 1.158s
Best score: 0.813
Best parameters set:
	vect__lowercase: 'True'


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.0s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished


In [11]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.5,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('MNB',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))],
         verbose=False)

In [12]:
y_predicted = pipeline.predict(X_test)

In [13]:
print(classification_report(y_predicted, y_test))

              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87        88
         1.0       0.69      0.88      0.77        41

    accuracy                           0.84       129
   macro avg       0.81      0.85      0.82       129
weighted avg       0.86      0.84      0.84       129



# Running the model

In [19]:
import glob

In [34]:
for file in glob.glob("all_tweets/*"):
    tweets = list(pd.read_csv(file, lineterminator='\n')['tweet'])
    labels = pipeline.predict(tweets)
    
    
    df = pd.DataFrame({"tweet": tweets, "label": labels})
    filename = file.split("/")[1].split(".")[0] + "_labeled.csv"
    df.to_csv("classified_tweets/{}".format(filename), index=False)

In [54]:
for file in glob.glob("classified_tweets/*"):
    print(file)
    
    df = pd.read_csv(file, lineterminator='\n')
    num_tweets = len(df)
    num_fp_tweets = sum(df['label'] == 1)
    
    print("Num tweets = {}".format(num_tweets))
    print("Num fp tweets = {}".format(num_fp_tweets))
    print("Percentage = {:.1f}".format(num_fp_tweets/num_tweets*100))
    print()

classified_tweets/tweets_pronouns_labeled.csv
Num tweets = 33533
Num fp tweets = 20625
Percentage = 61.5

classified_tweets/tweets_VBG_labeled.csv
Num tweets = 11415
Num fp tweets = 2084
Percentage = 18.3

classified_tweets/tweets_VB_labeled.csv
Num tweets = 4122
Num fp tweets = 616
Percentage = 14.9

classified_tweets/tweets_VBD_labeled.csv
Num tweets = 4299
Num fp tweets = 760
Percentage = 17.7

classified_tweets/tweets_VBN_labeled.csv
Num tweets = 6589
Num fp tweets = 1069
Percentage = 16.2

classified_tweets/tweets_VBZ_labeled.csv
Num tweets = 5185
Num fp tweets = 1049
Percentage = 20.2

classified_tweets/tweets_VBP_labeled.csv
Num tweets = 4781
Num fp tweets = 866
Percentage = 18.1

