In [None]:
import collections
import logging  # Setting up the loggings to monitor processes
import math
import multiprocessing
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import wordcloud
from PIL import Image
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

<h1 style="background-color:powderblue;">General helper Classes</h1>

In [None]:
class CleaningHelper():
    """Small collection of helpers used while cleaning tweet data.

    The instance only carries a ``version`` label; none of the helper
    methods read or modify instance state.
    """

    # Patterns are compiled once at class-definition time instead of on every
    # call, because the helpers are invoked once per token.
    _URL_PATTERN = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    _TWITTER_USERNAME_PATTERN = re.compile(
        r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)"
    )

    def __init__(self, version):
        self.version = version

    def __str__(self):
        return f"Cleaning helper version {self.version}"

    def get_nulls_data(self, df):
        """Summarise how complete every column of ``df`` is.

        Returns a DataFrame indexed by the column names of ``df`` and sorted
        by null count (highest first), with two columns:

        * ``Total``       - number of null values in the column
        * ``PercNotNull`` - percentage of rows that are NOT null,
          computed as ``100 - round(nulls / len(df) * 100, 2)``
        """
        df_tot_nulls = df.isnull().sum().sort_values(ascending=False)
        df_tot_nulls_perc = 100 - round(df_tot_nulls / len(df) * 100, 2)
        # Both series are unnamed, so concat labels the columns 0 and 1.
        df_tot_perc_nulls = pd.concat([df_tot_nulls, df_tot_nulls_perc], axis=1)
        return df_tot_perc_nulls.rename(columns={0: "Total", 1: "PercNotNull"})

    def get_url_str(self, token_):
        """Return every URL found in the string ``token_`` (possibly empty list)."""
        # The pattern contains several capture groups, so findall yields one
        # tuple per match; group 1 (index 0) spans the complete URL.
        return [groups[0] for groups in self._URL_PATTERN.findall(token_)]

    def get_twitter_username_str(self, token_):
        """Return every Twitter handle found in ``token_``, without the ``@``.

        An ``@`` only counts as a mention at the start of the string or after
        a character outside ``[a-zA-Z0-9-_.]``, so e-mail addresses are not
        reported.
        """
        # The pattern has exactly ONE capture group, so findall already
        # returns the captured usernames as plain strings. Indexing each
        # result with [0] (as done previously) kept only the first character.
        return self._TWITTER_USERNAME_PATTERN.findall(token_)

    def get_custom_stop_words(self, spacy_):
        """Return spaCy's default stop words merged with those in ``stopwords.txt``.

        ``spacy_`` must expose ``Defaults.stop_words`` as a set. The file
        ``stopwords.txt`` is opened relative to the current working directory
        and is read one stop word per line (trailing whitespace stripped).
        """
        spacy_stopwords = spacy_.Defaults.stop_words

        with open("stopwords.txt") as file:
            more_stopwords = {line.rstrip() for line in file}

        return spacy_stopwords | more_stopwords

    def get_custom_rate(self, rate_):
        """Binarise a rate: ``1`` when ``rate_`` is strictly positive, else ``0``."""
        return 1 if rate_ > 0 else 0

In [None]:
class CleaningTweets():
    """Tokenise and clean tweets.

    Constructor arguments are stored unchanged on the instance:

    * ``version``          - label used by ``__str__``
    * ``spacy_``           - stored only; not used by the methods of this class
    * ``parser_``          - callable applied to the raw text; it must yield
      tokens exposing ``lemma_`` and ``lower_`` (spaCy token interface)
    * ``punctuation_str_`` - container tested with ``in`` to drop punctuation
    * ``stop_words_``      - container tested with ``in`` to drop stop words
    """
    def __init__(self, version, spacy_, parser_, punctuation_str_, stop_words_):
        self.version          = version
        self.spacy_           = spacy_
        self.parser_          = parser_
        self.punctuation_str_ = punctuation_str_
        self.stop_words_      = stop_words_

    def __str__(self):
        return f"Cleaning Tweets version {self.version}"

    def do_spacy_tokenizer(self, token_):
        """Return the cleaned, lemmatised, lower-cased tokens of the text ``token_``.

        Tokens are dropped when their lemma contains a Twitter username or a
        URL, or when the resulting word is a stop word or punctuation.
        """
        # https://spacy.io/api/annotation
        helper = CleaningHelper(version="1.0")

        cleaned_tokens = []
        for tok in self.parser_(token_):
            lemma = tok.lemma_.lower().strip()

            # Skip twitter usernames and URLs.
            if helper.get_twitter_username_str(token_=lemma):
                continue
            if helper.get_url_str(token_=lemma):
                continue

            # "-PRON-" is a placeholder lemma (used by spaCy 2.x for pronouns);
            # fall back to the lower-cased surface form in that case.
            word = lemma if tok.lemma_ != "-PRON-" else tok.lower_

            # Use the stop words / punctuation given to the constructor. The
            # previous code read the undefined globals `stop_words` and
            # `punctuation_str`, ignoring the configured instance values.
            if word in self.stop_words_ or word in self.punctuation_str_:
                continue

            cleaned_tokens.append(word)

        return cleaned_tokens

    def get_words_df(self, df_Tweets_):
        """Tokenise the ``tweet`` column of ``df_Tweets_``.

        Returns a DataFrame with a single string column ``token_`` holding one
        cleaned token per row, in tweet order, with a fresh 0..n-1 index.
        """
        all_tokens = []
        for raw_tweet in df_Tweets_["tweet"]:
            # str(value, encoding=...) is only valid for bytes; decode those
            # and convert everything else with plain str() so ordinary text
            # tweets no longer raise TypeError.
            if isinstance(raw_tweet, bytes):
                text = str(raw_tweet, encoding='UTF-8')
            else:
                text = str(raw_tweet)
            all_tokens.extend(self.do_spacy_tokenizer(token_=text))

        # Build the frame once: DataFrame.append per token was quadratic and
        # no longer exists in pandas 2.x.
        return pd.DataFrame({'token_': pd.Series(all_tokens, dtype='str')})

    def get_words_list(self, df_Tweets_):
        """Return the ``token_`` column of ``df_Tweets_`` as a list of strings."""
        return [str(value) for value in df_Tweets_["token_"]]

In [None]:
class SentimentAnalysis():
    """Helpers that map the words of a sentence to TF-IDF and sentiment scores."""
    def __init__(self, version):
        self.version = version

    def __str__(self):
        return f"Sentiment Analysis version {self.version}"

    def create_tfidf_dictionary(self, x, transformed_sentences, features):
        '''
        create dictionary for each input sentence x, where each word has assigned its tfidf score

        inspired  by function from this wonderful article: 
        https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

        x - row of dataframe, containing sentences, and their indexes;
            x.name is used as the row position in transformed_sentences
        transformed_sentences - all sentences transformed with TfidfVectorizer
        features - names of all words in corpus used in TfidfVectorizer
                   (must support positional lookup through .iloc)

        returns a dict {word: tfidf score} for the non-zero entries of the row
        '''
        vector_coo = transformed_sentences[x.name].tocoo()
        # Translate column positions into words WITHOUT assigning the words
        # back to vector_coo.col: that attribute is the matrix's integer index
        # array, and overwriting it with strings is not portable across SciPy
        # versions.
        words = features.iloc[vector_coo.col].values
        return dict(zip(words, vector_coo.data))

    def replace_tfidf_words(self, x, transformed_sentences, features):
        '''
        replacing each word with it's calculated tfidf dictionary with scores of each word
        x - row of dataframe, containing sentences, and their indexes;
            x.tweet_vector is the whitespace separated sentence
        transformed_sentences - all sentences transformed with TfidfVectorizer
        features - names of all words in corpus used in TfidfVectorizer

        returns the list of tfidf scores, one per word of x.tweet_vector;
        raises KeyError when a word has no tfidf score in this row
        '''
        dictionary = self.create_tfidf_dictionary(x, transformed_sentences, features)
        return [dictionary[f'{word}'] for word in x.tweet_vector.split()]

    def replace_sentiment_words(self, word, sentiment_dict):
        '''
        replacing each word with its associated sentiment score from sentiment dict;
        words missing from sentiment_dict get the neutral score 0
        '''
        try:
            return sentiment_dict[word]
        except KeyError:
            return 0