### NLP - Bag of Words approach

It is called a “bag” of words, because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document.

In [2]:
import pandas as pd
import numpy as np
import re
import os

from nltk.stem import WordNetLemmatizer
from scipy import stats

In [3]:
path = 'C:/Users/malgo_000/Desktop/Web_scraping/twitter_scraping/tweet_texts_pharma/'
#path = os.path.join(os.getcwd(), 'tweet_texts_pharma/')

def prepare_dataset(company):
    df = pd.read_csv(path + '%s_tweets.txt' % company, sep='|')
    
    df['company'] = company
    df['id'] = df['id'].apply(str)
    
    df['hashtags'] = df['text'].apply(lambda s: re.findall(r'#(\w+)', s))
    df['num_hash'] = df['hashtags'].apply(len)
    
    df['tagged'] = df['text'].apply(lambda s: re.findall(r'@(\w+)', s))
    
    def clean_tweet(tweet):
        check = '(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)'
        return ' '.join(re.sub(check, ' ', tweet).split())
        
    df['clean_tweet'] = [clean_tweet(tweet) for tweet in df['text']] 
    df['len'] = df['clean_tweet'].apply(len)
    df['text_as_list'] = df['clean_tweet'].apply(lambda x: x.lower()).str.split(' ')
    
    df['datetime'] = pd.to_datetime(df['created_at'])
    df['date'] = df['datetime'].apply(lambda x: x.strftime("%Y-%m-%d"))
    df['hour'] = df['datetime'].apply(lambda x: x.hour)
    df['month'] = df['datetime'].apply(lambda x: x.month)
    df['day'] = df['datetime'].apply(lambda x: x.day)
    df['year'] = df['datetime'].apply(lambda x: x.year)
    df = df.drop(columns=['created_at'])
        
    df['z'] = np.abs(stats.zscore(df['fav']))
    
    return df

df = prepare_dataset('AstraZeneca')

def combine_tweets(company_names):
    df_all = pd.DataFrame()
    for company in company_names:
        df_all = df_all.append(prepare_dataset(company), ignore_index = True)        
    
    df_all = df_all[df_all['z'] < 3] 
    df_all = df_all.append(df, ignore_index = True)
    
    return df_all

company_list = ['JNJCares', 'Roche', 'Pfizer','Novartis', 'BayerPharma',
                'Merck','GSK','Sanofi', 'abbvie', 'AbbottGlobal',
                'LillyPad', 'Amgen', 'bmsnews', 'GileadSciences']

df_all = combine_tweets(company_list)

In [4]:
# counting company averages to tell which tweets were more popular
df_all['percentile'] = df_all.groupby(['company'])['fav'].transform(lambda x: np.percentile(x,80))

df_all['popular_tf'] = df_all['percentile'] <= df_all['fav']

maping = {True: 1, False: 0}
df_all['popular'] = df_all['popular_tf'].map(maping)

df_analysis = df_all[['id','popular','text_as_list']]

print(df_analysis[['popular','text_as_list']].head(5))

   popular                                       text_as_list
0        0  [thanks, for, sharing, please, send, us, a, di...
1        0  [thanks, for, reaching, out, our, team, that, ...
2        0  [thank, you, for, taking, the, time, to, reach...
3        1  [dyk, j, amp, j, offers, transgender, inclusiv...
4        0  [to, get, the, best, answers, to, this, questi...


In [5]:
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
hashtag_list = []
for hashtag in df_all['hashtags']:
        for h in hashtag:
            if h.lower() not in hashtag_list:
                hashtag_list.append(h.lower())
                
print(hashtag_list[:5])

['pridemonth', 'lgbtq', 'jnj', '5bfilm', 'makehivhistory']


In [6]:
lemmatizer = WordNetLemmatizer()  

# feature extractor function
def bag_of_words(tweet):
    words_dictionary = dict([lemmatizer.lemmatize(word), True] for word in tweet if word not in hashtag_list)    
    return words_dictionary
 
pop_tweets = df_analysis['text_as_list'][df_analysis['popular']==1]
unpop_tweets = df_analysis['text_as_list'][df_analysis['popular']==0]

# popular tweets feature set
pop_tweets_set = []
for tweet in pop_tweets:
    pop_tweets_set.append((bag_of_words(tweet), 'pop'))    
 
 # less popular tweets feature set
unpop_tweets_set = []
for tweet in unpop_tweets:
    unpop_tweets_set.append((bag_of_words(tweet), 'unpop'))
 
print (len(pop_tweets_set), len(unpop_tweets_set))

9955 38011


In [None]:
from random import shuffle 
shuffle(pop_tweets_set)
shuffle(unpop_tweets_set)
 
test_set = pop_tweets_set[:1000] + unpop_tweets_set[:1000]
train_set = pop_tweets_set[1000:] + unpop_tweets_set[1000:]

print(len(test_set),  len(train_set))

from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print("accuracy: ", accuracy) # Output: 0.765
 

In [9]:
print (classifier.show_most_informative_features(600))

2000 45966
accuracy:  0.6415
Most Informative Features
                      dm = True            unpop : pop    =    144.8 : 1.0
                    send = True            unpop : pop    =    137.8 : 1.0
                      hi = True            unpop : pop    =    127.2 : 1.0
                     com = True            unpop : pop    =     65.8 : 1.0
                     saw = True            unpop : pop    =     54.9 : 1.0
                  assist = True            unpop : pop    =     46.1 : 1.0
                 contact = True            unpop : pop    =     45.2 : 1.0
                  please = True            unpop : pop    =     35.5 : 1.0
                feedback = True            unpop : pop    =     30.1 : 1.0
                    note = True            unpop : pop    =     24.3 : 1.0
                   ifato = True              pop : unpop  =     23.4 : 1.0
                 nearest = True            unpop : pop    =     19.6 : 1.0
                 crystal = True            un

In [8]:


# List of most liked hashtags

df_hashtag = df_all[['hashtags', 'fav']][df_all['num_hash']>0]

hashtag_dict = {}

for row, f in zip(df_hashtag['hashtags'], df_hashtag['fav']):
    for h in row:
        if h in hashtag_dict: 
            hashtag_dict[h][0] += f
            hashtag_dict[h][1] += 1
        else:
            hashtag_dict[h] = [f,1]
            
# Finding 10 highest values 
most_pop_hash = collections.Counter(hashtag_dict).most_common(10)  
 
print("Hashtags that were the most popular overall:") 
print(" ")
for k in most_pop_hash:    
    print(k[0],":",k[1][0])     
    
fav_per_post = {}    
    
for j in hashtag_dict:
    if hashtag_dict[j][0] != 0:
        fav_per_post[j] = round(hashtag_dict[j][0]/hashtag_dict[j][1],2)

# Finding 10 highest values 
most_eff_hash = collections.Counter(fav_per_post).most_common(10) 
print("------------------------------------------------------ ")
print("Hashtags that received the most likes per use in total:")     
print(" ")
for k in most_eff_hash:    
    print(k[0],":",int(k[1]))     

NameError: name 'collections' is not defined

In [None]:
company_list_full =  sorted(df_all['company'].unique())

daily_count_pd = pd.DataFrame(df_all['date'].unique(), columns = ['date_tweeted'])
daily_count_pd = daily_count_pd.set_index('date_tweeted')

for company in company_list_full:
    dates = df_all['date'][df_all['company'] == company].value_counts()
    dates = pd.DataFrame(dates, columns = ['date'])
    dates.set_index('date')
    daily_count_pd = pd.concat([daily_count_pd, dates], axis = 1, sort = True)
    
daily_count_pd.columns = company_list_full
daily_count_pd = daily_count_pd.fillna(0)
daily_count_pd.sort_index(ascending = 0)[:5]