## Authentication

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
def load_file_from_drive(id, name):
    downloaded = drive.CreateFile({'id':id})
    downloaded.GetContentFile(name)

In [None]:
!ls

adc.json  sample_data  tweets.csv


## importing modules

In [None]:
!pip install ipython-autotime

%load_ext autotime

time: 165 µs (started: 2021-01-27 08:24:32 +00:00)


In [None]:
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

import pickle
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
time: 644 ms (started: 2021-01-27 08:24:32 +00:00)


## Preprocessing dataset

In [None]:
df=pd.read_csv('tweets.csv')

time: 1.08 s (started: 2021-01-27 08:29:04 +00:00)


In [None]:
df = df.drop_duplicates()

time: 337 ms (started: 2021-01-27 08:29:06 +00:00)


In [None]:
df.head()

Unnamed: 0,Time,Text,Location,RetweetCount,LikeCount,Author
0,2020-12-11 17:23:14,b'@ ERobbPrincipal RT @bbray27: Qs for 12/14/2...,,0,0,User(_api=<tweepy.api.API object at 0x7fa7712d...
1,2020-12-11 17:23:14,b'RT @RitheeshVfc: B/W love \xf0\x9f\x96\xa4\x...,,7,0,User(_api=<tweepy.api.API object at 0x7fa7712d...
2,2020-12-11 17:23:14,"b""Watch @pinkun's broadcast: Daniel Farke Pres...",,0,0,User(_api=<tweepy.api.API object at 0x7fa7712d...
3,2020-12-11 17:23:13,b'RT @Prada: The holidays are for intrigue. Di...,,5,0,User(_api=<tweepy.api.API object at 0x7fa7712d...
4,2020-12-11 17:23:11,b'RT @ArkadyRzegocki: #OTD in 1994 died Gen St...,,15,0,User(_api=<tweepy.api.API object at 0x7fa7712d...


time: 23.9 ms (started: 2021-01-27 08:29:06 +00:00)


In [None]:
df.columns[df.isnull().any()]

Index(['Location'], dtype='object')

time: 21.7 ms (started: 2021-01-27 08:29:06 +00:00)


**PreProcessing**

In [None]:
def preprocess(tweet):
    hashtags = []
    tokenizer=nltk.RegexpTokenizer(r"\w+")
    stopword_list=nltk.corpus.stopwords.words('english')+ ['u','im','rt','ummm','b','dont', 'arent','ya','yall','isnt'
                                                          ,'cant','couldnt','wouldnt','wont', 'yr','aint','gonna','ur',
                                                          'didnt','r','wasnt','werent','might','maybe','doesnt','would','shes'
                                                          ,'hes','youre', 'omg','us', 'wow'] + stopwords.words('english')

    preposition = ['in', 'at', 'by', 'from', 'on', 'for', 'with', 'about', 'into', 'through', 'between', 'under',
                   'against', 'during', 'without', 'upon', 'toward', 'among', 'within', 'along', 'across', 'behind',
                   'near', 'beyond', 'using', 'throughout', 'despite', 'to', 'beside', 'plus', 'towards', 'concerning',
                   'onto', 'beneath', 'via']
    stopword_list += preposition
    ps=PorterStemmer()

    #Convert to lower case
    tweet = tweet['Text'].lower()
    # remove unwanted characters
    tweet=re.sub(r'(\\x[^\s][^\s])',"", tweet)
    # remove \n
    tweet=re.sub(r'\\n',' ',tweet)
    #remove url and mentions
    tweet = re.sub(r"(?:\@|rt @|https?\://)\S+", " ", tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #extract hashtags
    hashtags = re.findall(r"#(\w+)", tweet)
    # remove hashtags from tweet
    tweet = re.sub(r'#([^\s]+)'," ", tweet)
    # tokenize tweets and remove punctuantions
    tokens=tokenizer.tokenize(tweet)
    # remove stopwords and stemming
    tokens=[ps.stem(word) for word in tokens if word not in stopword_list]
    # remove short words and numbers
    twt=' '.join([token for token in tokens if token.isalpha() and len(token)>2 ])
    hashtags=' '.join(hashtags)
    return twt, hashtags

time: 37.5 ms (started: 2021-01-27 08:29:06 +00:00)


In [None]:
df[['Text','hashtags']]=df.apply(preprocess, axis=1, result_type="expand")

time: 17.3 s (started: 2021-01-27 08:29:06 +00:00)


In [None]:
df = df[df.hashtags !='']

time: 19.1 ms (started: 2021-01-27 08:29:24 +00:00)


In [None]:
df=df[df.Text !='']

time: 14.4 ms (started: 2021-01-27 08:29:24 +00:00)


In [None]:
df.head()

Unnamed: 0,Time,Text,Location,RetweetCount,LikeCount,Author,hashtags
0,2020-12-11 17:23:14,erobbprincip chat guest host buildin,,0,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,rethink_learning
1,2020-12-11 17:23:14,love,,7,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,master
2,2020-12-11 17:23:14,watch broadcast daniel fark press confer bulletin,,0,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,ncfc rovers football sport norwich
3,2020-12-11 17:23:13,holiday intrigu discov new stranger call campa...,,5,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,pradagifts pradaholiday20
4,2020-12-11 17:23:11,die gen stanisaw play key role battl normandi ...,,15,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,otd maczek


time: 21.2 ms (started: 2021-01-27 08:29:24 +00:00)


In [None]:
len(df)

27126

time: 2.73 ms (started: 2021-01-27 08:29:24 +00:00)


split train and text data

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['Text'],df['hashtags'],test_size=0.1,random_state=42)

time: 13.3 ms (started: 2021-01-27 08:29:24 +00:00)


In [None]:
len(x_test)

2713

time: 4.89 ms (started: 2021-01-27 08:29:24 +00:00)


In [None]:
df.head()

Unnamed: 0,Time,Text,Location,RetweetCount,LikeCount,Author,hashtags
0,2020-12-11 17:23:14,erobbprincip chat guest host buildin,,0,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,rethink_learning
1,2020-12-11 17:23:14,love,,7,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,master
2,2020-12-11 17:23:14,watch broadcast daniel fark press confer bulletin,,0,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,ncfc rovers football sport norwich
3,2020-12-11 17:23:13,holiday intrigu discov new stranger call campa...,,5,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,pradagifts pradaholiday20
4,2020-12-11 17:23:11,die gen stanisaw play key role battl normandi ...,,15,0,User(_api=<tweepy.api.API object at 0x7fa7712d...,otd maczek


time: 26.3 ms (started: 2021-01-27 08:29:24 +00:00)


In [None]:
y_test.head()

25009    vaccine sdoh covid19 cova
6937                       chicago
25655             monbebe monsta_x
27772                       amazon
26486             monbebe monsta_x
Name: hashtags, dtype: object

time: 5.87 ms (started: 2021-01-27 08:29:24 +00:00)


## HF_IHU

In [None]:
corpus=[]
hashtagTweets = {}
wordHashtags = {}
hashtagWithWholeTweets = {}
for tweet, hashtag in zip(x_train,y_train):
  corpus+=tweet.split()
  for h in hashtag.split():
    if h not in hashtagTweets:
        hashtagTweets[h] = tweet
    else:
        hashtagTweets[h] += tweet
  for word in tweet.split():
    if word not in wordHashtags:
        wordHashtags[word] = hashtag
    else:
        wordHashtags[word] += (hashtag+ " ")

time: 254 ms (started: 2021-01-27 08:29:37 +00:00)


In [None]:
for word in wordHashtags:
  # print(len(wordHashtags[word].split()))
  wordHashtags[word].split()

time: 26.8 ms (started: 2021-01-27 08:29:45 +00:00)


### training

In [None]:
from collections import defaultdict 
import math
hashtagFrequency = defaultdict(dict)

def hfIcf():
    V = len(corpus)
    for word in wordHashtags:
        allHashtagNum = len(wordHashtags[word].split())
        for hashtag in wordHashtags[word].split():
          n=0
          for h in wordHashtags[word].split():
            if h == hashtag:
              n+=1
          hashtagFrequency[word][hashtag] = float(n)/allHashtagNum

    inverseCorpusFrequency = {}
    for hashtag in hashtagTweets:
        allWordSum=len(hashtagTweets[hashtag].split())
        inverseLog = math.log(V) - math.log(allWordSum)
        inverseCorpusFrequency[hashtag] = inverseLog

    return hashtagFrequency,inverseCorpusFrequency



time: 14.2 ms (started: 2021-01-27 08:30:21 +00:00)


In [None]:
hashtag_frequency,inverse_corpus_frequency = hfIcf()

time: 22.2 s (started: 2021-01-27 08:30:58 +00:00)


In [None]:
save_inverseCorpusFrequencys = open('inverseCorpusFrequency.pickle','wb')
pickle.dump(inverse_corpus_frequency, save_inverseCorpusFrequencys)
save_inverseCorpusFrequencys.close()

time: 12 ms (started: 2021-01-27 08:31:26 +00:00)


In [None]:
save_hashtagFrequency = open('hashtagFrequency.pickle','wb')
pickle.dump(hashtag_frequency, save_hashtagFrequency)
save_hashtagFrequency.close()

time: 54.2 ms (started: 2021-01-27 08:31:32 +00:00)


In [None]:
# sort hashtag according to the value of it
def sortedHashtag(hashtagFrequency, inverseCorpusFrequency):
    sortedWordHashtags = defaultdict(dict)
    for word in hashtagFrequency:
        hashtagWithValue = []
        for hashtag in hashtagFrequency[word]:
          if hashtag in inverseCorpusFrequency:
            # print('mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm')
            value = hashtagFrequency[word][hashtag] * inverseCorpusFrequency[hashtag]
          else:
            value = hashtagFrequency[word][hashtag]
          hashtagWithValue.append((hashtag, value))
        hashtagWithValue = sorted(hashtagWithValue, key= lambda hashtagWithValue: hashtagWithValue[1], reverse = True)
        sortedWordHashtags[word] = hashtagWithValue
    return sortedWordHashtags


time: 9.78 ms (started: 2021-01-27 08:31:38 +00:00)


In [None]:
sorted_word_hashtags = sortedHashtag(hashtag_frequency,inverse_corpus_frequency)

time: 101 ms (started: 2021-01-27 08:32:00 +00:00)


In [None]:
save_sortedWordHashtags = open('sortedWordHashtags.pickle','wb')
pickle.dump(sorted_word_hashtags, save_sortedWordHashtags)
save_sortedWordHashtags.close()

time: 56.4 ms (started: 2021-01-27 08:32:02 +00:00)


### evaluation

In [None]:
def hashtagRecommend(tweet,hashtags,hashtagFrequency,inverseCorpusFrequency):
    tweetScore = []
    for word in tweet.split():
        score =0
        if word in hashtagFrequency:
            for hashtag in hashtagFrequency[word]:
              if hashtag in inverseCorpusFrequency:
                score += hashtagFrequency[word][hashtag] * inverseCorpusFrequency[hashtag]
              else:
                score += hashtagFrequency[word][hashtag]
        tweetScore.append((word,score))

    tweetScore = sorted(tweetScore, key= lambda tweetScore: tweetScore[1], reverse= True)
    return tweetScore

time: 5.79 ms (started: 2021-01-27 08:31:46 +00:00)


In [None]:
def accuracy(x_test, y_test, hashtagFrequency,inverseCorpusFrequency,sortedWordHashtags):
    correctNum = 0
    totalTweetsNum = 0

    tp=0
    fn=0
    fp=0
    tn=0
    for tweet, hashtags in zip(x_test,y_test):
        original_hashtags=hashtags.split()
        rankedWords = hashtagRecommend(tweet,hashtags,hashtagFrequency,inverseCorpusFrequency)
        hashtagRecommended = []
        num = 0
        less = 0
        for i in range(0, len(rankedWords)):
            num_per_word = 0
            if num == 5:
                break
            hashtagNum = 0
            if i < 4:
                hashtagNum = 4 -i + less
            else:
                hashtagNum = 2
            if rankedWords[i][1] !=0:
                for j in range(0, hashtagNum):
                    if j == len(sortedWordHashtags[rankedWords[i][0]]):
                        break
                    hashtagRecommended.append(sortedWordHashtags[rankedWords[i][0]][j][0])
                    num += 1
                    num_per_word += 1
                less = hashtagNum - num_per_word

            else:
                break
        # print(hashtagRecommended)
        if len(hashtagRecommended) != 0:
            
            totalTweetsNum += 1
            correct = False
            for h in original_hashtags:
            # print(h)
                if h in hashtagRecommended:
                    correctNum += 1
                    correct = True
                    break

        if hashtagRecommended:
            for recom in hashtagRecommended:
                if recom in original_hashtags:
                    tp+=1
                else:
                    fp+=1
            for org in original_hashtags:
                if org not in hashtagRecommended:
                    fn+=1
    if tp==0:
        precision=0
        recall=0
        f1=0
    else:
        precision= (tp/(tp+fp))*100
        recall=(tp/(tp+fn))*100
        f1 = 2 * (precision * recall) / (precision + recall)
        accuracy=(float(correctNum)/(totalTweetsNum+1))*100
    
        

    return precision, recall, f1, accuracy

time: 63.4 ms (started: 2021-01-27 08:31:50 +00:00)


In [None]:
precision, recall, f1, accuracy_score = accuracy(x_test, y_test, hashtag_frequency, inverse_corpus_frequency, sorted_word_hashtags)

time: 601 ms (started: 2021-01-27 08:32:10 +00:00)


In [None]:
print(precision)
print(recall)
print(f1)
print(accuracy_score)

32.58020164986251
92.13920031106215
48.13868057083848
81.01732399557685
time: 1.8 ms (started: 2021-01-27 08:32:12 +00:00)


In [None]:
from IPython.display import HTML, display
import tabulate
table = [["Precision",precision],
         ["recall",recall],
         ["f1 score",f1],
         ["accuracy",accuracy_score]]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1
Precision,32.5802
recall,92.1392
f1 score,48.1387
accuracy,81.0173


time: 13.3 ms (started: 2021-01-27 08:32:17 +00:00)


In [None]:
print(hashtagRecommend('Today is warm', '', hashtag_frequency, inverse_corpus_frequency))

[('warm', 8.773226223831113), ('Today', 0), ('is', 0)]
time: 2.11 ms (started: 2021-01-27 08:10:01 +00:00)


predict new tweet

In [None]:
def new_hashtag_recommendation(new_tweet_str):
    inverseCorpusFrequency=pickle.load(open('inverseCorpusFrequency.pickle','rb'))
    hashtagFrequency=pickle.load(open('hashtagFrequency.pickle','rb'))
    sortedWordHashtags=pickle.load(open('sortedWordHashtags.pickle','rb'))
    # inverseCorpusFrequency=inverse_corpus_frequency
    # hashtagFrequency=hashtag_frequency
    # sortedWordHashtags=sorted_word_hashtags
    rankedWords=hashtagRecommend(new_tweet_str, '', hashtagFrequency, inverseCorpusFrequency)
    hashtagRecommended = []
    num = 0
    less = 0
    for i in range(0, len(rankedWords)):
        num_per_word = 0
        if num == 5 or len(hashtagRecommended)>=5:
            break
        hashtagNum = 0
        if i < 4:
            hashtagNum = 4 -i + less
        else:
            hashtagNum = 2
        if rankedWords[i][1] !=0:
            for j in range(0, hashtagNum):
                if len(hashtagRecommended)>=5:
                    break
                if j == len(sortedWordHashtags[rankedWords[i][0]]):
                    break
                hashtagRecommended.append(sortedWordHashtags[rankedWords[i][0]][j][0])
                num += 1
                num_per_word += 1
            less = hashtagNum - num_per_word

        else:
            break
    return hashtagRecommended 


time: 20.5 ms (started: 2021-01-27 08:32:21 +00:00)


In [None]:
print(new_hashtag_recommendation('today is warm'))

['coatsforkidsatx', 'sjd', 'mdzs', 'seaice', 'wordoftheday']
time: 172 ms (started: 2021-01-27 08:32:26 +00:00)
