In [1]:
import collections
import math
import multiprocessing
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import wordcloud
from PIL import Image
from tqdm import tqdm
#from unidecode import unidecode

import little_mallet_wrapper as lmw

#Google colab
from google.colab import files

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary
#https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#15-removed-third-party-wrappers
#from gensim.models.wrappers import LdaMallet
import emoji
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

#from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
from IPython.display import display

import logging  # Setting up the loggings to monitor processes
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)



<h1 style="background-color:powderblue;">General helper Classes</h1>

In [None]:
class CleaningHelper():
    """Text-cleaning utilities: null statistics, URL/username/emoji detection, stop words.

    Attributes:
        version: Free-form version label; only used by ``__str__``.
    """

    def __init__(self, version):
        self.version = version

    def __str__(self):
        return f"Cleaning helper version {self.version}"

    def get_nulls_data(self, df):
        """Summarise data quality per column.

        Args:
            df: pandas DataFrame to inspect.

        Returns:
            DataFrame indexed by column name, ordered by descending null
            count, with two columns: ``Total`` (number of null values) and
            ``PercNotNull`` (100 minus the null percentage rounded to 2
            decimals).
        """
        null_counts = df.isnull().sum().sort_values(ascending=False)
        not_null_perc = 100 - round(null_counts / len(df) * 100, 2)
        # Both series are unnamed, so concat labels the columns 0 and 1.
        summary = pd.concat([null_counts, not_null_perc], axis=1)
        return summary.rename(columns={0: "Total", 1: "PercNotNull"})

    def get_url_str(self, token_):
        """Return every URL found in ``token_``.

        Args:
            token_: Text to scan.

        Returns:
            List of matched URL strings (empty when none is found).
        """
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        # The pattern has several groups, so findall yields tuples; group 1
        # (index 0) wraps the whole URL.
        return [groups[0] for groups in re.findall(regex, token_)]

    def get_twitter_username_str(self, token_):
        """Return every Twitter ``@username`` mentioned in ``token_``.

        Args:
            token_: Text to scan.

        Returns:
            List of usernames without the leading ``@`` (empty when none is
            found).
        """
        regex = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)"
        # The pattern has exactly one capture group, so findall already
        # returns the usernames as plain strings. Indexing each result with
        # [0] (as was done before) kept only the first character.
        return re.findall(regex, token_)

    def get_twitter_emoji_str(self, token_):
        """Drop every whitespace-separated word of ``token_`` that contains an emoji.

        Args:
            token_: Text to clean.

        Returns:
            The remaining words joined by single spaces (``""`` when every
            word contained an emoji).
        """
        # emoji < 2.0 exposes UNICODE_EMOJI (keyed by language); emoji >= 2.0
        # removed it in favour of EMOJI_DATA (keyed by emoji).
        legacy_table = getattr(emoji, "UNICODE_EMOJI", None)
        if legacy_table is not None:
            known_emoji = legacy_table.get("en")
        else:
            known_emoji = emoji.EMOJI_DATA

        emoji_chars = [char for char in token_ if char in known_emoji]
        kept_words = [
            word for word in token_.split()
            if not any(char in word for char in emoji_chars)
        ]
        return ' '.join(kept_words)

    def get_custom_stop_words(self, spacy_):
        """Merge spaCy's default stop words with the ones listed in ``stopwords.txt``.

        Args:
            spacy_: Object exposing ``Defaults.stop_words`` as a set.

        Returns:
            Set union of the spaCy stop words and the lines of
            ``stopwords.txt`` (trailing whitespace stripped), read from the
            current working directory.
        """
        spacy_stopwords = spacy_.Defaults.stop_words

        with open("stopwords.txt") as file:
            more_stopwords = {line.rstrip() for line in file}

        return spacy_stopwords | more_stopwords

    def get_custom_rate(self, rate_):
        """Binarise a rate: 1 when ``rate_`` is strictly positive, otherwise 0."""
        return 1 if rate_ > 0 else 0

In [1]:
class CleaningTweets():
    """Tweet cleaning and tokenisation built on a spaCy-style pipeline.

    Attributes:
        version: Free-form version label; only used by ``__str__``.
        spacy_: Callable turning a string into an iterable of tokens that
            expose ``lemma_`` and ``lower_`` (presumably a loaded spaCy
            pipeline; confirm against the caller).
        parser_: Stored at construction; not read by any method of this class.
        punctuation_str_: Punctuation to discard; tokens are tested against
            it with ``in``.
        stop_words_: Stop words to discard; tokens are tested against it
            with ``in``.
    """

    def __init__(self, version, spacy_, parser_, punctuation_str_, stop_words_):
        self.version          = version
        self.spacy_           = spacy_
        self.parser_          = parser_
        self.punctuation_str_ = punctuation_str_
        self.stop_words_      = stop_words_

    def __str__(self):
        return f"Cleaning Tweets version {self.version}"

    def do_spacy_tokenizer(self, token_):
        """Tokenise ``token_`` and return its cleaned, lemmatised tokens.

        Stages, in order: drop Twitter usernames, drop URLs, drop tokens
        made only of emoji, lemmatise + lowercase, drop stop words and
        punctuation. Each intermediate result is logged at DEBUG level.

        Args:
            token_: Raw text of one tweet.

        Returns:
            List of cleaned token strings.
        """
        # https://spacy.io/api/annotation
        cleaning_helper = CleaningHelper(version="1.0")

        def normalised(tok):
            # Form on which the username / URL / emoji detectors operate.
            return tok.lemma_.lower().strip()

        doc_tokens = self.spacy_(token_)

        # Removing twitter - username
        doc_tokens = [
            tok for tok in doc_tokens
            if len(cleaning_helper.get_twitter_username_str(token_=normalised(tok))) == 0
        ]
        logging.debug("Removing_Twitter-Username: %s", doc_tokens)

        # Removing token - URL
        doc_tokens = [
            tok for tok in doc_tokens
            if len(cleaning_helper.get_url_str(token_=normalised(tok))) == 0
        ]
        logging.debug("Removing_Token-URL: %s", doc_tokens)

        # Removing emoji: keep a token only if some text survives once the
        # emoji-bearing words are stripped from it.
        doc_tokens = [
            tok for tok in doc_tokens
            if len(cleaning_helper.get_twitter_emoji_str(token_=normalised(tok))) > 0
        ]
        logging.debug("Removing_emoji: %s", doc_tokens)

        # Lemmatizing each token and converting it into lowercase. The
        # "-PRON-" placeholder lemma is replaced by the lowercased surface form.
        doc_tokens = [
            normalised(tok) if tok.lemma_ != "-PRON-" else tok.lower_
            for tok in doc_tokens
        ]
        logging.debug("Lemmatizing: %s", doc_tokens)

        # Removing stop words and punctuation, using the collections handed
        # to the constructor (not module-level globals).
        doc_tokens = [
            word for word in doc_tokens
            if word not in self.stop_words_ and word not in self.punctuation_str_
        ]
        logging.debug("Stop-words: %s", doc_tokens)

        return doc_tokens

    def get_words_df(self, df_Tweets_):
        """Tokenise every tweet and return all tokens as a one-column DataFrame.

        Args:
            df_Tweets_: DataFrame with a ``tweet`` column holding ``bytes``
                (decoded as UTF-8) or any value convertible with ``str``.

        Returns:
            DataFrame with a single ``token_`` column, one row per token, in
            tweet order, with a fresh 0..n-1 index.
        """
        collected = []
        for raw_tweet in df_Tweets_["tweet"]:
            # str(obj, encoding=...) only accepts bytes-like input; plain
            # strings must go through str() without an encoding.
            if isinstance(raw_tweet, (bytes, bytearray)):
                text = str(raw_tweet, encoding='UTF-8')
            else:
                text = str(raw_tweet)
            collected.extend(self.do_spacy_tokenizer(token_=text))
        # Build the frame once instead of appending row by row (quadratic,
        # and DataFrame.append no longer exists in pandas 2.x).
        return pd.DataFrame({'token_': pd.Series(collected, dtype='str')})

    def get_token_list(self, token_):
        """Return the cleaned tokens of ``token_`` joined by single spaces."""
        return " ".join(self.do_spacy_tokenizer(token_))

    def selected_topics(self, model, vectorizer, top_n=10):
        """Print the ``top_n`` highest-weighted terms of every topic.

        Args:
            model: Fitted decomposition model exposing ``components_``
                (one weight array per topic).
            vectorizer: Fitted vectorizer exposing ``get_feature_names_out``
                or, on older scikit-learn, ``get_feature_names``.
            top_n: Number of terms printed per topic.
        """
        # Resolve the vocabulary once; get_feature_names() was removed in
        # scikit-learn 1.2 in favour of get_feature_names_out().
        if hasattr(vectorizer, "get_feature_names_out"):
            raw_names = vectorizer.get_feature_names_out()
        else:
            raw_names = vectorizer.get_feature_names()
        feature_names = [str(name) for name in raw_names]

        for idx, topic in enumerate(model.components_):
            print("Topic %d:" % (idx))
            # argsort is ascending, so the reversed tail holds the top terms.
            print([(feature_names[i], topic[i])
                            for i in topic.argsort()[:-top_n - 1:-1]])

    def get_words_list(self, df_Tweets_):
        """Return the ``token_`` column of ``df_Tweets_`` as a list of strings."""
        return [str(value) for value in df_Tweets_["token_"]]

    def get_tokens(self, token_):
        """Lowercase ``token_`` and split it on whitespace."""
        return token_.lower().split()

In [None]:
class GeneralProcess():
    """Generic DataFrame-building helpers.

    Attributes:
        version: Free-form version label; only surfaced through ``__str__``.
    """

    def __init__(self, version):
        self.version = version

    def __str__(self):
        description = f"General Process version {self.version}"
        return description

    def create_df_from_lists(self, lst_pf, lst_cols):
        """Assemble a DataFrame from parallel column lists.

        Args:
            lst_pf: Sequence of lists, one per column; they are zipped
                together, so rows stop at the shortest list.
            lst_cols: Column labels, one per list in ``lst_pf``.

        Returns:
            DataFrame whose i-th row holds the i-th element of every list.
        """
        # Transpose column-wise input into row tuples.
        records = [row for row in zip(*lst_pf)]
        frame = pd.DataFrame(records, columns=lst_cols)
        return frame