In [5]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib
import matplotlib.patches as mpatches
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from tqdm import tqdm # show progress bar for long running ops
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Assumes the data contains exactly two labels: 0 = normal tweet, 1 = disaster tweet.
def plot_LSA(test_data, test_labels, plot=True):
    """Project ``test_data`` onto two LSA components and scatter-plot them by label.

    A 2-component ``TruncatedSVD`` is fitted on ``test_data`` and the resulting
    scores are drawn on the current matplotlib axes, coloured by label.

    Args:
        test_data: Feature matrix accepted by ``TruncatedSVD.fit``.
        test_labels: One label per row of ``test_data``; 0 is drawn orange
            ("Normal") and 1 is drawn blue ("Disaster").
        plot: When false, nothing is drawn (the projection is still computed).

    Returns:
        None.
    """
    lsa = TruncatedSVD(n_components=2)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)
    if not plot:
        return

    normal_color, disaster_color = 'orange', 'blue'
    # Pin the colour scale to [0, 1]. Without vmin/vmax matplotlib rescales to
    # the labels actually present, so an input holding only label 1 would be
    # painted in the "Normal" colour.
    plt.scatter(
        lsa_scores[:, 0], lsa_scores[:, 1], s=8, alpha=.8, c=test_labels,
        cmap=matplotlib.colors.ListedColormap([normal_color, disaster_color]),
        vmin=0, vmax=1,
    )
    normal_patch = mpatches.Patch(color=normal_color, label='Normal')
    disaster_patch = mpatches.Patch(color=disaster_color, label='Disaster')
    plt.legend(handles=[normal_patch, disaster_patch], prop={'size': 30})
    

def get_pretrained_glove_vectors(file):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``the 0.418 0.24968 ...``. Blank lines are skipped.

    Args:
        file: Path of the embedding file to read (decoded as UTF-8).

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array. If a token
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embedding_dict = {}
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # the vocabulary may contain non-ASCII tokens.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            embedding_dict[values[0]] = np.asarray(values[1:], 'float32')

    return embedding_dict


def create_corpus_new(df):
    """Tokenize every entry of ``df['text']`` into lower-cased tokens.

    Returns a list with one item per row; each item is the list of tokens that
    ``word_tokenize`` produces for that text, lower-cased. Progress over the
    rows is reported through ``tqdm``.
    """
    return [
        [token.lower() for token in word_tokenize(text)]
        for text in tqdm(df['text'])
    ]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marttiylikoski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
#Usage:
# Showing Confusion Matrix for BERT model
#plot_confusion_matrixy(train['target'].values, train_pred_BERT_int, 'Confusion matrix for BERT model', figsize=(7,7))

# Showing Confusion Matrix
def plot_confusion_matrixy(y_true, y_pred, title, figsize=(5,5)):
    """Draw an annotated confusion-matrix heatmap on a new figure.

    Rows are true labels and columns are predicted labels, both restricted to
    the distinct values found in ``y_true``. Diagonal cells are annotated with
    the row percentage and ``count/row total``; non-zero off-diagonal cells
    with the row percentage and count; zero off-diagonal cells are left blank.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Title shown above the heatmap.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        None.
    """
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # Object dtype so each annotation is stored whole; the fixed-width string
    # array produced by ``astype(str)`` silently truncates longer strings.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Index the scalar explicitly: formatting a 1-element array
                # with %d relies on a deprecated array-to-scalar conversion.
                s = cm_sum[i, 0]
                annot[i, j] = '%.1f%%\n%d/%d\n\n\n\n' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d\n\n\n\n' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'True label'
    cm.columns.name = 'Predicted label'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title)
    # fmt='' because the annotations are already fully formatted strings.
    sns.heatmap(cm, cmap="YlGnBu", annot=annot, fmt='', ax=ax)

In [None]:
# Showing Confusion Matrix
# https://likegeeks.com/seaborn-heatmap-tutorial/

def plot_cm(y_true, y_pred, title, figsize=(5,5)):
    """Show a count-annotated confusion-matrix heatmap built with ``pd.crosstab``.

    Rows are actual labels and columns are predicted labels. Labels 0 and 1
    are displayed as 'normal' and 'disaster'; any other label is shown as-is.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Title shown above the heatmap.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    class_names = {0: 'normal', 1: 'disaster'}

    def display_name(label):
        return class_names.get(label, label)

    df = pd.DataFrame({'y_Actual': y_true, 'y_Predicted': y_pred})
    counts = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'], margins=False)
    # Name the classes on the table itself so each tick label follows its own
    # row/column; hard-coded tick lists can be mis-ordered and fail outright
    # when a class is missing from the predictions.
    counts = counts.rename(index=display_name, columns=display_name)

    # Apply the font scale before drawing so it also affects this figure
    # (this still changes seaborn's global settings, as before).
    sns.set(font_scale=1.5)
    fig, ax = plt.subplots(figsize=figsize)
    # fmt='d': the cells are integer counts; the default format switches to
    # scientific notation for larger values.
    sns.heatmap(counts, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
