# Social Media Hashtag Prediction

### <span style="color:#FF00FF">Import Libraries</span>

In [190]:

import pandas as pd
import numpy as np
import re
import html
import string
import unicodedata
  
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import random
import ast
import pickle

import spacy
from spacy import displacy
from spacy.training import Example
from spacy.util import minibatch, compounding

from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report



In [191]:
df = pd.read_excel("SocialHashTagdata.xlsx")
print(df)

                                  leadersocialhashtags       SpecialTags
0    ['#academy', '#training', '#education', '#foot...              None
1    ['#initiative', '#love', '#together', '#motiva...              None
2    ['#echo', '#the', '#season', '#overwatch', '#a...              None
3    ['#mosaic', '#art', '#mosaicart', '#mosaico', ...              None
4    ['#lgbtq', '#lgbt', '#gay', '#pride', '#loveis...             LGBTQ
..                                                 ...               ...
124  ['#winner', '#win', '#love', '#giveaway', '#bl...              None
125  ['#lgbtq', '#lgbt', '#gay', '#pride', '#loveis...             LGBTQ
126  ['#differentlyabled', '#disabilityawareness', ...  DifferentlyAbled
127  ['#sport', '#fitness', '#training', '#gym', '#...              None
128  ['#echo', '#the', '#season', '#overwatch', '#a...              None

[129 rows x 2 columns]


In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   leadersocialhashtags  129 non-null    object
 1   SpecialTags           129 non-null    object
dtypes: object(2)
memory usage: 2.1+ KB


### <span style="color:#FF00FF">Data Preprocessing</span>

In [193]:
# This is a class te get text. 
class preprocessing:
    
    def __init__(self):
        pass

    #Cleaning and stripping HTML
    def remove_html_tags(self,text):
        clean = re.compile('<.*?>')
        cleantext = re.sub(clean, '', text)
        return cleantext

    #Removing Escaping characters &lt
    def escaping_html_char(self,doc):
        cleandoc = html.unescape(doc)
        return cleandoc
    
    #Removing newline & extra spaces
    def textcleaning(self,doc):
        # remove extra newlines
        a = doc.replace("\\n",".").strip()
        a = a.replace("\\r",".").strip()
        #a = re.sub(r'\d+','',a)# remove numbers
        cleandoc = re.sub("\s+"," ", a)
        return cleandoc
    
    def text_norm(self,doc):
        cleandoc = doc.lower()
        return cleandoc
    
    
    abbr_dict={
        "what's":"what is",
        "what're":"what are",
        "who's":"who is",
        "who're":"who are",
        "where's":"where is",
        "where're":"where are",
        "when's":"when is",
        "when're":"when are",
        "how's":"how is",
        "how're":"how are",

        "i'm":"i am",
        "we're":"we are",
        "you're":"you are",
        "they're":"they are",
        "it's":"it is",
        "he's":"he is",
        "she's":"she is",
        "that's":"that is",
        "there's":"there is",
        "there're":"there are",

        "i've":"i have",
        "we've":"we have",
        "you've":"you have",
        "they've":"they have",
        "who've":"who have",
        "would've":"would have",
        "not've":"not have",

        "i'll":"i will",
        "we'll":"we will",
        "you'll":"you will",
        "he'll":"he will",
        "she'll":"she will",
        "it'll":"it will",
        "they'll":"they will",

        "isn't":"is not",
        "wasn't":"was not",
        "aren't":"are not",
        "weren't":"were not",
        "can't":"can not",
        "couldn't":"could not",
        "don't":"do not",
        "didn't":"did not",
        "shouldn't":"should not",
        "wouldn't":"would not",
        "doesn't":"does not",
        "haven't":"have not",
        "hasn't":"has not",
        "hadn't":"had not",
        "won't":"will not"
    }

    def process_data(self,doc):
        for key, value in self.abbr_dict.items():
            doc = doc.replace(key,value)
        return doc

    # Removing accented characters
    # A simple example — converting é to e.
    def decode_text(self,doc):
        cleandoc = unicodedata.normalize('NFKD', doc).encode('ascii','ignore').decode("utf8")
        return cleandoc
    
    def text_tokenize(self,doc):
        return word_tokenize(doc)
    
    def remove_stopwords(self,words):
        # set of stop words
        stop_words = set(stopwords.words('english')) 
        stext = [] 
        for w in words:
            if w not in stop_words:
                stext.append(w)
        return stext
    
    def remove_punctuation(self,doc):
        #chars = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
        #table = str.maketrans(chars, ' '*len(chars))
        table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        ##str.maketrans('', '', string.punctuation)
        cleandoc = doc.translate(table)
        return cleandoc
    
    def data_preprocessing(self,doc,w_stop=True):

        doc = str(doc)

        step1 = self.remove_html_tags(doc)            # Cleaning and stripping HTML
        step2 = self.escaping_html_char(step1)        # Removing Escaping characters &lt
        step3 = self.textcleaning(step2)              # Removing newline & extra spaces
        step4 = self.text_norm(step3)                 # Case Normalization
        step5 = self.process_data(step4)              # Transforming abbreviations
        step6 = self.remove_punctuation(step5)        # Remove punctuation
        step7 = self.decode_text(step6)               # Text encoding - Removing accented characters
        step8 = self.text_tokenize(step7)             # Tokenization

        if w_stop:
            step11 = self.remove_stopwords(step8)
            cleandoc = " ".join(step11)
        else:
            cleandoc = " ".join(step8)

        return cleandoc

In [194]:
pre = preprocessing()
df['clean_hashtags'] = df['leadersocialhashtags'].apply(pre.data_preprocessing,w_stop=True)

In [195]:
df[['clean_hashtags','leadersocialhashtags','SpecialTags']]

Unnamed: 0,clean_hashtags,leadersocialhashtags,SpecialTags
0,academy training education football school fit...,"['#academy', '#training', '#education', '#foot...",
1,initiative love together motivation covid insp...,"['#initiative', '#love', '#together', '#motiva...",
2,echo season overwatch alexa stihl lawncare lov...,"['#echo', '#the', '#season', '#overwatch', '#a...",
3,mosaic art mosaicart mosaico interiordesign de...,"['#mosaic', '#art', '#mosaicart', '#mosaico', ...",
4,lgbtq lgbt gay pride loveislove queer lesbian ...,"['#lgbtq', '#lgbt', '#gay', '#pride', '#loveis...",LGBTQ
...,...,...,...
124,winner win love giveaway blackpink motivation ...,"['#winner', '#win', '#love', '#giveaway', '#bl...",
125,lgbtq lgbt gay pride loveislove queer lesbian ...,"['#lgbtq', '#lgbt', '#gay', '#pride', '#loveis...",LGBTQ
126,differentlyabled disabilityawareness disabilit...,"['#differentlyabled', '#disabilityawareness', ...",DifferentlyAbled
127,sport fitness training gym motivation workout ...,"['#sport', '#fitness', '#training', '#gym', '#...",


### <span style="color:#FF00FF">Training spaCy model</span>

In [196]:
# Create an empty model
nlp = spacy.blank("en")

# Add the TextCategorizer to the empty model
text = nlp.add_pipe("textcat")

# Adding labels to the `ner`
text.add_label('LGBTQ')
text.add_label('DifferentlyAbled')
text.add_label('None')


1

In [197]:
train_texts = df['clean_hashtags'].values
train_labels = [{'cats': {'LGBTQ': SpecialTags == 'LGBTQ',
                          'DifferentlyAbled': SpecialTags == 'DifferentlyAbled',
                          'None' : SpecialTags == 'None'}} 
                for SpecialTags in df['SpecialTags']]

In [198]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('academy training education football school fitness beauty makeup academia phoenix art love dance sports joaquin soccer learning online motivation sport instagram instagood coaching music students oscar fashion fun',
  {'cats': {'LGBTQ': False, 'DifferentlyAbled': False, 'None': True}}),
 ('initiative love together motivation covid inspiration change india education success changemakers peace social unicef slapcollective youths spread unesco changebelievers indianyouths support changeseekers harmony youngindia globalchangemakers slapwale lifeskills unyouths socialventure rurban',
  {'cats': {'LGBTQ': False, 'DifferentlyAbled': False, 'None': True}}),
 ('echo season overwatch alexa stihl lawncare love amazon echodot mercy landscapers instagram widowmaker dva art mccree clarkegriffin genji octaviablake amazonecho reaper echocardiography lucio music bellamyblake zenyatta tech tracer blizzard',
  {'cats': {'LGBTQ': False, 'DifferentlyAbled': False, 'None': True}})]

In [199]:
def train_spacy(data,iterations):
    train_data=data
    random.seed(1)
    spacy.util.fix_random_seed(1)
    optimizer = nlp.begin_training()

    losses = {}
    for epoch in range(iterations):
        random.shuffle(train_data)
        # Create the batch generator with batch size = 8
        batches = minibatch(train_data, size=8)
        # Iterate through minibatches
        for batch in batches:
            for text, labels in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, labels)
                nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)

In [200]:
hashtaganalytics = train_spacy(train_data, iterations = 20)

{'textcat': 12.958285689553069}
{'textcat': 14.97433874851104}
{'textcat': 16.972505053846113}
{'textcat': 19.349783979412464}
{'textcat': 21.374505578138407}
{'textcat': 26.069019476842033}
{'textcat': 28.07512961535921}
{'textcat': 30.110771911050456}
{'textcat': 32.17944538065903}
{'textcat': 35.00512049565695}
{'textcat': 37.02922410251419}
{'textcat': 39.04329633680619}
{'textcat': 41.051109849925574}
{'textcat': 43.59608092927129}
{'textcat': 45.636809481919556}
{'textcat': 47.644617303532264}
{'textcat': 49.65290326647114}
{'textcat': 51.665213018849116}
{'textcat': 53.66556861308786}
{'textcat': 55.67372149601794}


### <span style="color:#FF00FF">Predicting Hashtags based on spaCy model</span>

In [214]:
TEST_DATA = df[['clean_hashtags']].sample(frac=1).reset_index(drop=True)

def test_spacy(data):
    test_texts = data['clean_hashtags'].values
        #hashtaganalytics = SPACY_obj.get("spacy_model")
    texts = test_texts
    docs = [nlp.tokenizer(text) for text in texts]

    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores = textcat.predict(docs)

    return scores


In [230]:
# From the scores, find the label with the highest score/probability
def test_spacy_execute():
    predicted_labels = test_spacy(TEST_DATA).argmax(axis=1)
    predicted_tags = [textcat.labels[label] for label in predicted_labels]

    labels = np.array(predicted_tags)
    predictedlabel = pd.DataFrame({'Label': labels}, columns=['Label'])

    result = pd.concat([TEST_DATA, predictedlabel], axis=1, join='inner')
    return (result)

In [232]:
df_social = pd.read_csv("file_name.csv", sep='|')

In [233]:
df_social_test = df_social[['dunsNum','leadersocialhashtags']]

In [234]:
pre = preprocessing()
df_social_test['clean_hashtags'] = df_social_test['leadersocialhashtags'].apply(pre.data_preprocessing,w_stop=True)
df_social_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_social_test['clean_hashtags'] = df_social_test['leadersocialhashtags'].apply(pre.data_preprocessing,w_stop=True)


Unnamed: 0,dunsNum,leadersocialhashtags,clean_hashtags
0,58168600,"['#academy', '#training', '#education', '#foot...",academy training education football school fit...
1,605485861,"['#initiative', '#love', '#together', '#motiva...",initiative love together motivation covid insp...
2,783598485,"['#echo', '#the', '#season', '#overwatch', '#a...",echo season overwatch alexa stihl lawncare lov...
3,20825064,"['#mosaic', '#art', '#mosaicart', '#mosaico', ...",mosaic art mosaicart mosaico interiordesign de...
4,148015845,"['#lgbtq', '#lgbt', '#gay', '#pride', '#loveis...",lgbtq lgbt gay pride loveislove queer lesbian ...
...,...,...,...
242,79688478,"['#teacher', '#teachersofinstagram', '#educati...",teacher teachersofinstagram education school t...
243,619658904,"['#academy', '#training', '#education', '#foot...",academy training education football school fit...
244,787916092,"['#differentlyabled', '#disabilityawareness', ...",differentlyabled disabilityawareness disabilit...
245,73886782,"['#differentlyabled', '#disabilityawareness', ...",differentlyabled disabilityawareness disabilit...


In [237]:
TEST_DATA=df_social_test[['clean_hashtags']]
TEST_RESULT = test_spacy_execute()

In [238]:
result = pd.concat([df_social_test, TEST_RESULT], axis=1, join='inner')
result

Unnamed: 0,dunsNum,leadersocialhashtags,clean_hashtags,clean_hashtags.1,Label
0,58168600,"['#academy', '#training', '#education', '#foot...",academy training education football school fit...,academy training education football school fit...,
1,605485861,"['#initiative', '#love', '#together', '#motiva...",initiative love together motivation covid insp...,initiative love together motivation covid insp...,
2,783598485,"['#echo', '#the', '#season', '#overwatch', '#a...",echo season overwatch alexa stihl lawncare lov...,echo season overwatch alexa stihl lawncare lov...,
3,20825064,"['#mosaic', '#art', '#mosaicart', '#mosaico', ...",mosaic art mosaicart mosaico interiordesign de...,mosaic art mosaicart mosaico interiordesign de...,
4,148015845,"['#lgbtq', '#lgbt', '#gay', '#pride', '#loveis...",lgbtq lgbt gay pride loveislove queer lesbian ...,lgbtq lgbt gay pride loveislove queer lesbian ...,LGBTQ
...,...,...,...,...,...
242,79688478,"['#teacher', '#teachersofinstagram', '#educati...",teacher teachersofinstagram education school t...,teacher teachersofinstagram education school t...,
243,619658904,"['#academy', '#training', '#education', '#foot...",academy training education football school fit...,academy training education football school fit...,
244,787916092,"['#differentlyabled', '#disabilityawareness', ...",differentlyabled disabilityawareness disabilit...,differentlyabled disabilityawareness disabilit...,DifferentlyAbled
245,73886782,"['#differentlyabled', '#disabilityawareness', ...",differentlyabled disabilityawareness disabilit...,differentlyabled disabilityawareness disabilit...,DifferentlyAbled


In [239]:
result[['Label']]

Unnamed: 0,dunsNum,Label
0,58168600,
1,605485861,
2,783598485,
3,20825064,
4,148015845,LGBTQ
...,...,...
242,79688478,
243,619658904,
244,787916092,DifferentlyAbled
245,73886782,DifferentlyAbled


Unnamed: 0,dunsNum,leadergender,leader_race_expanded,isWomanOwned
0,58168600,female,Black or African-American,YES
1,605485861,male,White or European American,NO
2,783598485,female,White or European American,YES
3,20825064,male,White or European American,NO
4,148015845,male,White or European American,NO
...,...,...,...,...
242,79688478,male,Black or African-American,NO
243,619658904,female,Asian,YES
244,787916092,female,Asian,YES
245,73886782,male,White or European American,NO


In [245]:
finaldataset = pd.concat([df_social[['dunsNum','leadergender','leader_race_expanded','isWomanOwned']], result[['Label']]], axis=1, join='inner')
finaldataset

Unnamed: 0,dunsNum,leadergender,leader_race_expanded,isWomanOwned,Label
0,58168600,female,Black or African-American,YES,
1,605485861,male,White or European American,NO,
2,783598485,female,White or European American,YES,
3,20825064,male,White or European American,NO,
4,148015845,male,White or European American,NO,LGBTQ
...,...,...,...,...,...
242,79688478,male,Black or African-American,NO,
243,619658904,female,Asian,YES,
244,787916092,female,Asian,YES,DifferentlyAbled
245,73886782,male,White or European American,NO,DifferentlyAbled


In [246]:
finaldataset.to_csv('final_dataset.csv')