# Dashboard

## 1 - Libraries Import

In [None]:
%reset -f

# Import Os to get to the root directory
import os
import sys

# Resolve the project root (the parent of the current working directory)
# and append it to the import path before the project-local imports below.
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

# Training data: the dataset folder and the CSV file inside it.
TRAIN_DIR = os.path.join(ROOT_DIR, "dataset")
TRAIN_CSV = os.path.join(TRAIN_DIR, "train.csv")

# Folder for the models and folder with the white papers used for testing.
MODEL_DIR = os.path.join(ROOT_DIR, "model")
TEST_DIR = os.path.join(ROOT_DIR, "test-white-papers")
        
# Import own functions
from libraries import corpus
from libraries import pdf2text

##########################
# Import other libraries
##########################

# Data Processing and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 2 - Methods Application

In [None]:
class Data:
    """Load the train and test white papers and pre-process them.

    Attributes set by ``__init__``:
        dataset: result of ``pdf2text.get_dataset()`` (default location).
        corpora_sent: cleaned train corpus, ``makeSentences=True``.
        corpora_whole: cleaned train corpus, ``makeSentences=False``.
        testset: result of ``pdf2text.get_dataset(path=TEST_DIR)``.
        corpora_test_sent: cleaned test corpus, ``makeSentences=True``.
        corpora_test_whole: cleaned test corpus, ``makeSentences=False``.
    """

    def __init__(self):
        # Retrieve the PDF files of the train white papers, converted to text
        self.dataset = pdf2text.get_dataset()

        # The same raw texts are cleaned twice: once divided in sentences and
        # once as a whole document. The raw data must be read from ``self``;
        # a bare ``dataset`` name does not exist in this scope.
        self.corpora_sent = self._clean(self.dataset, make_sentences=True)
        self.corpora_whole = self._clean(self.dataset, make_sentences=False)

        # Retrieve the NEW white papers to test
        self.testset = pdf2text.get_dataset(path=TEST_DIR)

        self.corpora_test_sent = self._clean(self.testset, make_sentences=True)
        self.corpora_test_whole = self._clean(self.testset, make_sentences=False)

    @staticmethod
    def _clean(raw, make_sentences):
        """Run ``corpus.makeCleanCorpus`` with the cleaning options used everywhere.

        Args:
            raw: value returned by ``pdf2text.get_dataset``.
            make_sentences: forwarded as ``makeSentences``; per the original
                comments it selects a corpus divided in sentences (True) or
                kept whole (False).

        Returns:
            Whatever ``corpus.makeCleanCorpus`` returns (per the original
            comments, a dictionary mapping each white paper to its
            pre-processed text).
        """
        return corpus.makeCleanCorpus(raw, lemmatize=True, removePunct=True,
                                      removeNums=True,
                                      makeSentences=make_sentences,
                                      removeURL=True, removeChar=True,
                                      removeEnt=True)

In [None]:
class InformationRetrieval():
    """Exploratory views of a corpus: tokens, n-gram frequencies, word cloud."""

    def __init__(self, data):
        # Kept for the callers; the methods below receive their corpus
        # explicitly and do not read this attribute.
        self.data = data

    def prepare_data(self, corpus, protocol):
        """Return the whitespace-separated tokens of one protocol's text.

        Input:
            corpus: mapping of protocol name to text
            protocol: name of the protocol; must be a key of ``corpus``
        Output:
            List of tokens
        Raises:
            AssertionError if ``protocol`` is not in ``corpus``
        """
        assert protocol in corpus.keys()

        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        # With gaps=True the pattern describes the separators, not the tokens.
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\s+', gaps=True)

        return tokenizer.tokenize(corpus[protocol])

    def visualize_ngrams(self, tokens, n=2):
        """Plot the 30 most frequent n-grams of ``tokens``.

        Input:
            tokens: sequence of tokens
            n: size of the n-grams, must be greater than 1
        Output:
            The ``plt`` module, after the frequency plot has been drawn
        Raises:
            AssertionError if ``n`` is not greater than 1
        """
        assert n > 1

        # Every n is handled the same way; only the n-gram generator differs
        grams = bigrams(tokens) if n == 2 else ngrams(tokens, n)
        FreqDist(list(grams)).plot(30, cumulative=False)

        return plt

    def visualize_word_cloud(self, corpus, protocol):
        """Draw a word cloud of one protocol's text.

        Input:
            corpus: mapping of protocol name to text
            protocol: name of the protocol to draw
        Output:
            The ``plt`` module, after the image has been drawn
        """
        text = corpus[protocol]
        wordcloud = WordCloud(background_color="white",
                              stopwords=set(STOPWORDS)
                              ).generate(text)
        plt.imshow(wordcloud)
        plt.axis("off")

        return plt

In [None]:
class TextClassification():
    """Classify the test white papers with a logistic regression and a neural network.

    Expected call order: ``prepare_data``, then ``get_log_model`` /
    ``get_nn_model``, then ``visualize_log`` / ``visualize_nn``.
    """

    def __init__(self, data):
        # Object exposing corpora_test_whole, corpora_test_sent and corpora_sent
        self.data = data

    def prepare_data(self):
        """Build the two test DataFrames.

        ``df_test_1`` has one row per white paper (columns "Protocol", "Text").
        ``df_test_2`` has one row per sentence (columns "Text", "Protocol").
        """
        self.df_test_1 = pd.DataFrame.from_dict(self.data.corpora_test_whole,
                                                orient="index").reset_index()
        self.df_test_1.columns = ["Protocol", "Text"]

        # Collect every (sentence, protocol) pair first and build the frame
        # once; growing a DataFrame row by row with .loc is very slow.
        rows = []
        for protocol, sentences in self.data.corpora_test_sent.items():
            for sent in sentences:
                rows.append((sent, protocol))
        self.df_test_2 = pd.DataFrame(rows, columns=["Text", "Protocol"])

    def get_log_model(self):
        """Train the logistic regression pipeline on ``TRAIN_CSV`` and store it in ``self.logreg``."""
        # Read the csv and load it on a DataFrame
        df = pd.read_csv(TRAIN_CSV, sep=';', encoding="utf-8-sig")

        # Lower all the labels
        df["Label"] = df["Label"].str.lower()

        # We need to add another category, in case there is no topic mentioned
        # on the sentence. New Series are built instead of writing into views
        # of ``df``.
        X_train = pd.concat([df["Text"], pd.Series([""])], ignore_index=True)
        y_train = pd.concat([df["Label"], pd.Series(["none"])], ignore_index=True)

        # We train a really simple Logistic Regression
        self.logreg = Pipeline([('vect', CountVectorizer()),
                                ('tfidf', TfidfTransformer()),
                                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                                ])

        self.logreg.fit(X_train, y_train)

    def get_nn_model(self):
        """Load the neural network from previously saved weights into ``self.model``."""
        # NOTE(review): the path is relative to the working directory and
        # MODEL_DIR is not used - confirm where the .h5 file lives.
        self.model = load_model("model_neuralnetwork_classification.h5")

    def visualize_log(self, protocol):
        """Plot, for one test protocol, the share of sentences predicted per label.

        Input:
            protocol: value of the "Protocol" column to select
        Output:
            The ``plt`` module, after the bar chart has been drawn
        """
        # First, we define our test dataframe (stored on the instance)
        X_test = self.df_test_2["Text"]

        # Predict with the logistic regression and add it into the DataFrame
        self.df_test_2["Model_2"] = self.logreg.predict(X_test)

        selected = self.df_test_2[self.df_test_2.Protocol == protocol]
        protocol_test = selected.groupby(by="Model_2").count().sort_values(by="Text", ascending=False)
        protocol_test['percentage'] = protocol_test['Text'] / protocol_test['Text'].sum()

        # "none" is only present when at least one sentence was predicted as
        # such, so a missing label must not raise.
        protocol_test = protocol_test.drop("none", axis=0, errors="ignore").reset_index()

        # PLOT
        plt.bar(protocol_test.Model_2, protocol_test.percentage)
        plt.title(protocol)
        plt.xlabel("Train Protocols")
        plt.ylabel("% Likelihood")

        return plt

    def visualize_nn(self, protocol):
        """Plot the five highest network outputs for one test protocol.

        Input:
            protocol: value of the "Protocol" column to select
        Output:
            The ``plt`` module, after the bar chart has been drawn
        """
        # We define the max of words that our Tokenizer will have
        max_words = 15000
        # NOTE(review): this tokenizer is never fitted in this method, so it
        # has no vocabulary when texts_to_matrix is called. It presumably has
        # to be fitted on (or loaded from) the training texts - confirm.
        tokenize = text.Tokenizer(num_words=max_words, char_level=False)

        # For the first model, we need to tokenize the text
        X_test = tokenize.texts_to_matrix(self.df_test_1["Text"])

        # Predict and add into the DataFrame
        predict_nn = self.model.predict(X_test)
        self.df_test_1["Model_1"] = predict_nn.tolist()

        # Generate the Labels, with the name of each White Paper
        labels = list(self.data.corpora_sent.keys())

        # Side effect on the global pandas display options (kept from the original)
        pd.options.display.float_format = '{}'.format

        matching = self.df_test_1[self.df_test_1["Protocol"] == protocol]
        for _, row in matching.iterrows():
            frame = pd.DataFrame()
            frame["weights"] = row["Model_1"]
            frame["label"] = labels
            frame = frame.sort_values(by="weights", ascending=False).head(5)

            # PLOT
            plt.bar(frame.label, frame.weights)
            plt.title(row["Protocol"])
            plt.xlabel("Train Protocols")
            plt.ylabel("% Likelihood")

        return plt


In [None]:
class DocumentClustering():
    """Cluster white papers with TF-IDF features and averaged word vectors.

    Results of each step are stored on the instance so that later steps can
    use them. Expected call order: ``prepare_data`` and
    ``get_word2vec_model``, then ``get_ap_model``, then
    ``visualize_ap_clusters``.
    """

    def __init__(self, data):
        # NOTE(review): the original code read module-level ``corpora`` and
        # ``corpora_test`` variables that are not defined in this file; this
        # class assumes they are ``data.corpora_whole`` and
        # ``data.corpora_test_whole`` - confirm.
        self.data = data

    def prepare_data(self):
        """Build ``self.df`` (corpus as DataFrame) and ``self.matrix`` (TF-IDF weights)."""
        corpora = self.data.corpora_whole

        # Function that converts a dictionary into a Pandas Dataframe
        # (per the original comment, the indexes are the name of the files)
        self.df = corpus.dictionaryToPandas(corpora)

        # Instantiate TfidfVectorizer object with stopwords and tokenizer
        # parameters for efficient processing of text
        tfidf_vectorizer = TfidfVectorizer(max_features=20000,
                                           stop_words='english', lowercase=True,
                                           use_idf=True, tokenizer=tokenize_and_stem,
                                           ngram_range=(1, 3))

        # Fit and transform the tfidf_vectorizer with the text of each paper
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpora.values())

        # get_feature_names() was removed from recent scikit-learn releases in
        # favour of get_feature_names_out(); support both.
        if hasattr(tfidf_vectorizer, "get_feature_names_out"):
            feature_names = tfidf_vectorizer.get_feature_names_out()
        else:
            feature_names = tfidf_vectorizer.get_feature_names()

        self.matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names,
                                   index=list(corpora.keys()))

    @staticmethod
    def clustering_on_wordvecs(word_vectors, num_clusters):
        """
        Function that receives word vectors and a number of clusters, and
        returns the k-means cluster centers and the cluster index of each vector
        """
        # Initialize a k-means object and use it to extract centroids
        kmeans_clustering = KMeans(n_clusters=num_clusters, init='k-means++')
        idx = kmeans_clustering.fit_predict(word_vectors)

        return kmeans_clustering.cluster_centers_, idx

    @staticmethod
    def average_word_vectors(words, model, vocabulary, num_features):
        """
        Average the vectors of the words of one document.

        Input:
            words: iterable of words
            model: object indexable by word (``model[word]``) returning a vector
            vocabulary: container of the words that have a vector
            num_features: length of the vectors
        Output:
            Array of length ``num_features``; all zeros when no word of
            ``words`` is in ``vocabulary``
        """
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.

        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model[word])

        # Avoid dividing by zero when no word was found
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    @staticmethod
    def averaged_word_vectorizer(corpus, model, num_features):
        """Return one averaged vector per tokenized document of ``corpus`` as a 2-D array."""
        vocabulary = set(model.wv.index2word)
        features = [DocumentClustering.average_word_vectors(tokenized_sentence, model,
                                                            vocabulary, num_features)
                    for tokenized_sentence in corpus]
        return np.array(features)

    def get_word2vec_model(self):
        """Load the saved Word2Vec model into ``self.model``."""
        self.model = word2vec.Word2Vec.load("model_word2vec_clustering.model")

    def get_ap_model(self):
        """Fit Affinity Propagation on the train corpus.

        Stores the fitted estimator in ``self.ap`` and, in ``self.reference``,
        a DataFrame with the columns "protocol" and "ap_cluster" (plus
        "kmeans_cluster" when ``self.df`` has a "cluster" column).
        Requires ``prepare_data`` and ``get_word2vec_model`` to have run.
        """
        # Converting into list of words
        cleaned_text = clean_text(list(self.data.corpora_whole.values()))

        w2v_feature_array = self.averaged_word_vectorizer(corpus=cleaned_text,
                                                          model=self.model,
                                                          num_features=100)
        self.ap = AffinityPropagation()
        self.ap.fit(w2v_feature_array)

        cluster_labels = pd.DataFrame(self.ap.labels_, columns=['ClusterLabel'])

        rslt_df = pd.concat([self.df.reset_index(), cluster_labels], axis=1)

        # Create a Dataframe with the predicted clusters. The "cluster" column
        # is not created anywhere in this class, so it is only kept when the
        # DataFrame already carries it.
        renamed = {"index": "protocol", "cluster": "kmeans_cluster",
                   "ClusterLabel": "ap_cluster"}
        present = [column for column in renamed if column in rslt_df.columns]
        self.reference = rslt_df[present].rename(columns=renamed)

    def visualize_ap_clusters(self, protocol):
        """Plot the rank of the train protocols sharing a cluster with ``protocol``.

        Input:
            protocol: name of a test white paper (key of the test corpus)
        """
        # Application of Affinity Propagation Cluster to the requested white
        # paper (the original always used "budbo", ignoring the argument)
        cleaned_text = clean_text([self.data.corpora_test_whole[protocol]])

        w2v_feature_array = self.averaged_word_vectorizer(corpus=cleaned_text,
                                                          model=self.model,
                                                          num_features=100)

        predict = self.ap.predict(w2v_feature_array)

        # List of other protocols in the same cluster
        similars = self.reference[self.reference.ap_cluster == predict[0]].protocol

        stats = corpus.get_datastats()

        result = pd.merge(similars.to_frame(), stats, left_on="protocol",
                          right_on="name", how="inner")
        result = result.sort_values(by="rank")

        plt.plot(result.protocol, result["rank"])
        plt.xticks(rotation=90)
        plt.xlabel("Protocols in Cluster")
        plt.ylabel("Rank")
        plt.show()


In [None]:
class TextExtraction():
    """Entity and word-vector views of a single white paper."""

    def __init__(self, data):
        # NOTE(review): the original ``prepare_data`` read a module-level
        # ``corpora`` that is not defined in this file; ``data.corpora_whole``
        # is assumed here - confirm.
        self.data = data

    def prepare_data(self):
        """Tokenize every paper of the corpus and store the lists in ``self.tokens``."""
        self.tokens = [word_tokenize(text)
                       for text in self.data.corpora_whole.values()]

    def get_word2vec_model(self):
        """Load the saved Word2Vec model into ``self.model``."""
        self.model = Word2Vec.load("model_word2vec_textextraction.model")

    def visualization_ent(self, protocol, corpora):
        """
        Function that receives the name of a paper, and plots the most common entity labels found
        Input:
            protocol: name of a white paper in the corpus
            corpora: mapping of paper name to text
        Output:
            The ``plt`` module, after the plot has been drawn
        Raises:
            AssertionError if ``protocol`` is not in ``corpora``
        """
        assert protocol in corpora.keys()

        # Load the NLP object with pre-trained data
        nlp = spacy.load('en_core_web_sm')

        # Create an object of the paper chosen
        doc = nlp(corpora[protocol])

        # Count the entity labels and keep the 10 most common ones
        most_common = Counter(ent.label_ for ent in doc.ents).most_common(10)
        labels = [label for label, _ in most_common]
        counts = [count for _, count in most_common]

        # Create the visualization. The original called plt.subplots_adjust
        # before plt.figure, which adjusted a different figure than the one
        # drawn on; that call is dropped.
        plt.figure(figsize=(12, 5))

        plt.subplot(211)
        plt.plot(labels, counts)
        plt.title(protocol)

        return plt

    @staticmethod
    def visualize_wordvec(topic, protocol, corpora, model1):
        """
        Function that receives a word and the name of a paper, and shows two plots
        with similar words
        Input:
            topic: specific topic (a word)
            protocol: name of the protocol
            corpora: corpora with all the texts
            model1: Word2Vec model with all the vectors
        Output:
            None; the plots are shown with ``plt.show()``
        Raises:
            AssertionError if ``protocol`` is not in ``corpora``
        """
        assert protocol in corpora.keys()

        # Only the requested white paper is needed, so only it is tokenized
        doc_tokens = word_tokenize(corpora[protocol])

        # Create an empty Word2Vec object where we will put the weights for
        # the specific protocol, based on the whole corpus
        model_test = Word2Vec()

        # Copy the vector of every distinct word of the paper (first-seen order)
        for word in dict.fromkeys(doc_tokens):
            if word in model1.wv.vocab:
                model_test.wv[word] = model1.wv[word]

        # Get most similar words, based on the whole corpus
        similar = model_test.wv.most_similar([topic])
        similar_words = [word for word, _ in similar]
        similarities = [score for _, score in similar]

        plt.bar(similar_words, similarities)
        plt.xticks(rotation=90)
        plt.show()

        # Model with only words in the exact protocol. Word2Vec expects an
        # iterable of sentences (each a list of words); passing the flat token
        # list would treat every word as a sentence of single characters.
        model2 = Word2Vec([doc_tokens], size=100, window=5, min_count=1, workers=2)

        # Project the vectors of the paper's vocabulary to 2 dimensions
        words = list(model2.wv.vocab)
        result = PCA(n_components=2).fit_transform(model2.wv[words])

        plt.figure(figsize=(10, 10))
        plt.scatter(result[:, 0], result[:, 1], s=100)

        # Row i of ``result`` belongs to ``words[i]``, so the labels must
        # follow the vocabulary order rather than the token order.
        for (x_pos, y_pos), word in zip(result, words):
            plt.annotate(word, xy=(x_pos, y_pos), size=10)

        plt.show()

In [None]:
class TopicModeling():
    """Apply a saved LDA model to the test white papers and plot the topic weights.

    Expected call order: ``prepare_data`` and ``get_lda_model``, then
    ``visualize_topics``.
    """

    def __init__(self, data):
        # Object exposing ``corpora_test_sent``
        self.data = data

    def prepare_data(self):
        """Build the bag-of-words test corpus.

        Stores ``self.test_df`` (one row per paper, column "whitepapers"),
        ``self.test_data_ready`` and ``self.corpus_lda_test``.
        """
        # Combining them and putting into pandas dataframe
        test_combined = {key: [combine_text(value)]
                         for key, value in self.data.corpora_test_sent.items()}
        pd.set_option('max_colwidth', 150)
        test_df = pd.DataFrame.from_dict(test_combined).transpose()
        test_df.columns = ['whitepapers']
        self.test_df = test_df.sort_index()

        # Creating the list and tokenizing each word in the test corpus
        testdata = self.test_df.whitepapers.values.tolist()
        testdata_words = list(sent_to_words(testdata))

        self.test_data_ready = process_words(testdata_words)

        # Apply the Topic Model into unseen documents
        self.corpus_lda_test = [id2word.doc2bow(text) for text in self.test_data_ready]

    @staticmethod
    def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
        """Return the dominant topic of each document.

        Input:
            ldamodel: model supporting ``ldamodel[corpus]``, ``show_topic`` and
                ``per_word_topics``
            corpus: bag-of-words corpus; when omitted the module-level
                ``corpus_lda`` is used
            texts: original texts; when omitted the module-level ``data`` is used
        Output:
            DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution',
            'Topic_Keywords' and the texts as last column
        """
        # The fallbacks are resolved at call time. As default argument values
        # they were evaluated while the class was being defined, which fails
        # when those globals do not exist yet.
        if corpus is None:
            corpus = corpus_lda
        if texts is None:
            texts = data

        doc_ids = []
        records = []

        # Get main topic in each document
        for i, row_list in enumerate(ldamodel[corpus]):
            row = row_list[0] if ldamodel.per_word_topics else row_list
            if not row:
                # No topic reported for this document
                continue
            # Dominant topic = highest contribution (first one wins on ties)
            topic_num, prop_topic = max(row, key=lambda pair: pair[1])
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join(word for word, _ in wp)
            doc_ids.append(i)
            records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

        # Built in one go: DataFrame.append no longer exists in recent pandas.
        # Rows are indexed by document position so they line up with ``texts``.
        sent_topics_df = pd.DataFrame(
            records, index=doc_ids,
            columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

        # Add original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_lda_model(self):
        """Load the saved LDA model into ``self.model``."""
        self.model = LdaModel.load("model_lda_topicmodeling.model")

    def visualize_topics(self, doc="budbo"):
        """Plot the topic weights of one test document.

        Input:
            doc: document name forwarded to ``topics_per_document``; defaults
                to the value that was hard-coded before
        """
        lda_gensim = self.model

        # Format the topics from the unseen data into the LDA Model
        df_topic_sents_keywords = self.format_topics_sentences(
            ldamodel=lda_gensim, corpus=self.corpus_lda_test,
            texts=self.test_data_ready)

        # Format; kept on the instance since the plot below does not use it
        df_dominant_topic = df_topic_sents_keywords.reset_index()
        df_dominant_topic.columns = ['Document_No', 'Dominant_Topic',
                                     'Topic_Perc_Contrib', 'Keywords', 'Text']
        self.df_dominant_topic = df_dominant_topic

        # Use the function to get the percentages of topics in the document
        _, topic_percentages = topics_per_document(model=lda_gensim,
                                                   corpus=self.corpus_lda_test,
                                                   doc=doc, df=self.test_df)

        # Total Topic Distribution by actual weight
        topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
        df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

        # Top 3 Keywords for each Topic
        topic_top3words = [(i, topic) for i, topics in lda_gensim.show_topics(formatted=False)
                           for j, (topic, wt) in enumerate(topics) if j < 3]

        df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
        df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
        df_top3words.reset_index(level=0, inplace=True)

        def _tick_label(x, pos):
            # A tick without a matching topic gets the bare label
            label = 'Topic ' + str(x)
            words = df_top3words.loc[df_top3words.topic_id == x, 'words'].values
            if len(words):
                return label + '\n' + words[0]
            return label

        # Plot
        fig, ax = plt.subplots(1, 1, figsize=(15, 6), dpi=150, sharey=True)

        # Topic Distribution by Topic Weights
        ax.bar(x='index', height='count', data=df_topic_weightage_by_doc,
               width=.5, color='steelblue')
        ax.set_xticks(range(6))
        ax.xaxis.set_major_formatter(FuncFormatter(_tick_label))
        ax.set_title('Number of Documents by Topic Weightage', fontdict=dict(size=10))

        plt.show()
    