In [None]:
# Import libraries and packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import copy, re
from gensim import corpora, models, similarities, matutils
import string
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize,RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import stopwords, wordnet, sentiwordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import time
import emoji
    
# import metrics to show accuracy, recall, precision and ...
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
from sklearn.metrics import precision_recall_fscore_support as prf_score

# Fetch the NLTK resources used further down, in the same order as before.
for nltk_resource in (
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'vader_lexicon',  # vader sentiment
    'sentiwordnet',   # sentiwordnet sentiment
):
    nltk.download(nltk_resource)

# Shared NLP helpers reused by later cells.
stemmer = PorterStemmer()
sentiment_analyzer = SentimentIntensityAnalyzer()

In [None]:
# data overview
def df_overview(df):
    """Print a short structural summary of ``df`` and return its row count.

    Prints the number of rows and columns, the column names, the total
    number of missing cells and the number of unique values per column.

    Returns:
        The number of rows in ``df``.
    """
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    missing_total = df.isnull().sum().values.sum()

    print ('Rows     : ', n_rows)
    print ('Columns  : ', n_cols)
    print ('\nFeatures : ', df.columns.tolist())
    print ('\nMissing values :  ', missing_total)
    print ('\nUnique values :  \n', df.nunique())
    return n_rows

In [None]:
# Import the review dataset and replace missing values with empty strings
raw_df = pd.read_csv(
    '../datastore/Manuel_Antonio_National_Park/review-info.dat',
    sep='\t',
    lineterminator='\n',
)
df = raw_df.fillna(value="")

In [None]:
# Keep only the columns used downstream and give them short names.
df = (
    df[['reviewAuthor', 'visitDate', 'reviewDate', 'reviewStars', 'reviewTotalReviews',
        'reviewAuthorAddress', 'reviewTitle', 'translated_reviewText']]
    .rename(columns={
        'reviewAuthor': 'author',
        'visitDate': 'exp_date',
        'reviewDate': 'review_date',
        'reviewStars': 'rating',
        'reviewTotalReviews': 'review_num',
        'reviewAuthorAddress': 'address',
        'reviewTitle': 'title',
        'translated_reviewText': 'review',
    })
)
# Remember the row count so the de-duplication cell can report how many rows it removed.
curr_rows = df_overview(df)

In [None]:
# Import the dataset holding each author's geo information; it is joined with the review data below
loc_df = pd.read_csv('../datastore/Manuel_Antonio_National_Park/review-info-with-loc.dat', sep='\t')
# The longitude column used to be selected twice, so the column renamed to 'lat'
# silently held longitudes. Select the latitude column instead.
# NOTE(review): assumes the file names it 'reviewAuthorLat' (mirroring 'reviewAuthorLng') - confirm.
loc_df = loc_df[['reviewAuthor', 'reviewAuthorLat', 'reviewAuthorLng', 'reviewAuthorCountryCode', 'reviewAuthorCountryName']]
loc_df.columns = ['author', 'lat', 'lng', 'country_code', 'country_name']
print("# of Row: {}".format(loc_df.shape[0]))

In [None]:
# clean data: keep only the last row recorded for each author
loc_df = loc_df.drop_duplicates(subset='author', keep='last')
print("# of Row: {}".format(len(loc_df)))

In [None]:
# clean data: drop repeated (author, review) pairs, keeping the last occurrence
df = df.drop_duplicates(subset=['author', 'review'], keep='last')
print("\n")
print("\n======================")
# df_overview prints its summary here, i.e. after the separator and before the final message.
rows_after_dedup = df_overview(df)
print("Cleaned {} duplicate rows!".format(curr_rows - rows_after_dedup))

In [None]:
# join tables: attach the author's location to every review (reviews without a match are kept)
df = pd.merge(df, loc_df, how='left', on='author')

In [None]:
# Reviews with no matching location row come out of the left join with NaN; blank them out.
df = df.fillna(value="")
df.head()

In [None]:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag yields ``None``.
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    if pos_tag.startswith('V'):
        return wordnet.VERB
    if pos_tag.startswith('N'):
        return wordnet.NOUN
    if pos_tag.startswith('R'):
        return wordnet.ADV
    return None

In [None]:
# Looks up the SentiWordNet scores of a single word.
def get_word_sentiment(word, tag):
    """Return ``[synset_name, pos_score, neg_score, obj_score]`` for ``word``.

    The word is looked up in WordNet restricted to the part of speech ``tag``.
    Only the first synset returned is scored. When WordNet returns no synset,
    the result is ``['', 0, 0, 0]``.
    """
    # A synset groups synonymous words expressing one concept; a word may
    # belong to one synset or to several.
    candidates = wordnet.synsets(word, pos=tag)
    if len(candidates) == 0:
        return ['', 0, 0, 0]

    # Take the first sense, the most common
    sense_name = candidates[0].name()
    scores = sentiwordnet.senti_synset(sense_name)
    return [sense_name, scores.pos_score(), scores.neg_score(), scores.obj_score()]

In [None]:
def get_sentiment(text):
    """Average the SentiWordNet scores over the whitespace-separated words of ``text``.

    Args:
        text: String of whitespace-separated tokens (e.g. a cleaned review).

    Returns:
        ``[pos, neg, obj]``: each score summed over all words and divided by
        the word count. Words without a synset contribute 0 but still count.
        Returns ``[0.0, 0.0, 0.0]`` when ``text`` has no words.
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only text used to raise ZeroDivisionError below.
        return [0.0, 0.0, 0.0]

    pos_score, neg_score, obj_score = 0, 0, 0
    tagged_words = pos_tag(words)
    for word, tag in tagged_words:
        _, word_pos, word_neg, word_obj = get_word_sentiment(word, get_wordnet_pos(tag))
        pos_score += word_pos
        neg_score += word_neg
        obj_score += word_obj

    word_count = len(tagged_words)
    return [pos_score/word_count, neg_score/word_count, obj_score/word_count]

In [None]:
def clean_text(text):
    """Normalise a raw review into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case, strip hyperlinks, apply ad-hoc replacements,
    spell-correct with TextBlob, drop emoji, expand contractions, remove
    bracketed spans / punctuation / tokens containing digits, keep tokens of
    3 to 25 characters, drop English stop words, POS-tag, keep tokens tagged
    as adjective, verb, noun or adverb, and lemmatise them.

    Args:
        text: Raw review text (a string).

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for empty or
        whitespace-only input.
    """
    # Nothing to clean: skip the spell checker and the tagger entirely.
    if not text.strip():
        return ''

    # lower the text
    text = text.lower()

    # remove hyperlink
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)

    # ad-hoc replacements, applied in order. The text is already lower-case,
    # so the former "CR " and "Manual " variants could never match and are gone.
    ad_hoc_replacements = (
        ("cr ", "costa rica "),
        ("manual ", "manuel "),
        ("manuel antonio", ""),
        ("esp ", "especially "),
        ("parc ", "park "),
        ("&quot;", ""),
        ("&#39;", "'"),
        ("；", ";"),
        ("。", "."),
        ("，", ","),
        ("’", "'"),
        ("‘", "'"),
        ("`", "'"),
    )
    for old, new in ad_hoc_replacements:
        text = text.replace(old, new)

    # spelling check
    from textblob import TextBlob
    text = str(TextBlob(text).correct())

    # converting emoji to their ":name:" form, then dropping those names
    text = emoji.demojize(text)
    text = re.sub(r':[a-z_&]+:', '', text)

    # expand contractions and negations (order matters: "won't"/"can't" before "n't").
    # "i'm" is lower-case here because the text was lowered above.
    contractions = (
        ("he's", "he is"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("ain't", "are not"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("won't", "will not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'d", " would"),
    )
    for old, new in contractions:
        text = text.replace(old, new)

    # remove bracketed spans, punctuation and any token containing a digit
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # tokenize, dropping short (< 3 characters) and long (> 25 characters) tokens
    tokens = [word for word in text.split() if 3 <= len(word) <= 25]

    # filter out tokens with digits and English stop words (a set makes the lookup O(1))
    stop = set(stopwords.words('english'))
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    tokens = [word for word in tokens if word not in stop]

    # POS tagging; keep only tokens whose tag maps to a WordNet part of speech
    pos_tags = [(w, t) for w, t in pos_tag(tokens) if get_wordnet_pos(t) is not None]

    # NOTE(review): the noun/adjective-only filter and the Porter stemming that
    # used to sit here were overwritten by the lemmatisation below, so they never
    # affected the result. They are removed and the effective output (all four
    # parts of speech, lemmatised, unstemmed) is unchanged. If nouns/adjectives
    # only was the intent, restrict `pos_tags` here and regenerate the labels.

    # lemmatize: transform every word into its root form (e.g. rooms -> room, slept -> sleep)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in pos_tags]

    # join the tokens
    return ' '.join(lemmas)

In [None]:
# Clean every review; this is slow because clean_text spell-checks each one.
df["review_clean"] = df["review"].apply(clean_text)
df.head()

In [None]:
# (VADER) add sentiment analysis columns: one column per key of the polarity-score dict
vader_scores = df["review_clean"].apply(sentiment_analyzer.polarity_scores)
df = pd.concat([df, vader_scores.apply(pd.Series)], axis=1)
df.head()

In [None]:
# sentiwordnet sentiment: get_sentiment returns a 3-item list, expanded into three columns
swn_scores = df["review_clean"].apply(get_sentiment)
df = pd.concat([df, swn_scores.apply(pd.Series)], axis=1)
# Columns are renamed by position, so this list must match the column order built above.
df.columns = [
    'author', 'exp_date', 'review_date', 'rating', 'review_num', 'address', 'title', 'raw_review',
    'lat', 'lng', 'country_code', 'country_name', 'cleaned_review',
    'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
    'swn_pos', 'swn_neg', 'swn_obj',
]
df.head()

In [None]:
# Shift the index by one before exporting (re-running this cell shifts it again).
df.index = df.index + 1
df.to_csv('sentiment-manual-antonio.csv', encoding='utf-8-sig')

In [None]:
# The dataset has 2700 rows in total.
# The first ~10% of sentiment-manual-antonio.csv is labelled by review rating:
# label -1 when the rating is 3 or lower, label 1 when the rating is 4 or higher.
# The remaining ~90% is left for training and validation.
TRAIN_AND_VALIDATION_END_INDEX = 2700
TEST_SET_END_INDEX = 300

In [None]:
# Load the labelled copy of the sentiment export.
df = pd.read_csv("label-sentiment-manual-antonio.csv", encoding='utf8')
# read_csv parses empty cells as NaN. A review that cleaned down to nothing must
# stay an empty string, because TfidfVectorizer rejects NaN documents.
df['cleaned_review'] = df['cleaned_review'].fillna('')

In [None]:
# test set: score VADER and SentiWordNet against the labels of the first
# TEST_SET_END_INDEX rows and collect those rows as the held-out test data.
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


test_labels = []
test_reviews = []

# confusion counts (only over reviews that neither method calls neutral)
vader_tp = vader_fp = vader_tn = vader_fn = 0
wn_tp = wn_fp = wn_tn = wn_fn = 0

# predicted-class tallies over every test review
vader_pos = vader_neg = vader_neu = 0
wn_pos = wn_neg = wn_neu = 0

num_of_test = 0
cnt_neutral = 0
for index in range(0, TEST_SET_END_INDEX):
    vader_score = float(df['vader_compound'][index])
    wn_score = float(df['swn_pos'][index])-float(df['swn_neg'][index])
    label_val = int(df['label'][index])

    num_of_test += 1

    test_labels.append(df['label'][index])
    test_reviews.append(df['cleaned_review'][index])

    vader_is_neutral = -0.05 < vader_score < 0.05
    wn_is_neutral = wn_score == 0

    if vader_is_neutral or wn_is_neutral:
        # A review that either method calls neutral is excluded for both,
        # so both methods are scored on the same subset.
        cnt_neutral += 1
    else:
        vader_final = 1 if vader_score >= 0.05 else -1
        wn_final = 1 if wn_score > 0 else -1

        if label_val == 1:
            if vader_final == 1:
                vader_tp += 1
            else:
                vader_fn += 1
            if wn_final == 1:
                wn_tp += 1
            else:
                wn_fn += 1
        elif label_val == -1:
            if vader_final == -1:
                vader_tn += 1
            else:
                vader_fp += 1
            if wn_final == -1:
                wn_tn += 1
            else:
                wn_fp += 1

    if vader_is_neutral:
        vader_neu += 1
    elif vader_score >= 0.05:
        vader_pos += 1
    else:
        vader_neg += 1

    if wn_is_neutral:
        wn_neu += 1
    elif wn_score > 0:
        wn_pos += 1
    else:
        wn_neg += 1

num_evaluated = num_of_test - cnt_neutral

# Accuracy is (TP + TN) / total. It was previously computed as (TP + FN) / total,
# which is the share of positive labels rather than of correct predictions.
vader_accuracy_score = _safe_ratio(vader_tp + vader_tn, num_evaluated)
vader_precision_score = _safe_ratio(vader_tp, vader_tp + vader_fp)
vader_recall_score = _safe_ratio(vader_tp, vader_tp + vader_fn)
vader_f1_score = _safe_ratio(2 * (vader_precision_score * vader_recall_score), vader_precision_score + vader_recall_score)

print("=== Vader ===\n")
print("TruePositive: %d" % vader_tp)
print("TrueNegative: %d" % vader_tn)
print("FalsePositive: %d" % vader_fp)
print("FalseNegative: %d\n" % vader_fn)
print("Accuracy Score: %f" % vader_accuracy_score)
print("Precision Score: %f" % vader_precision_score)
print("Recall Score: %f" % vader_recall_score)
print("F1 Score: %f\n" % vader_f1_score)
# predicted-class counts over all test reviews (neutral ones included)
print("Positive: %d" % vader_pos)
print("Negative: %d" % vader_neg)
print("Neutral: %d\n" % vader_neu)

wn_accuracy_score = _safe_ratio(wn_tp + wn_tn, num_evaluated)
wn_precision_score = _safe_ratio(wn_tp, wn_tp + wn_fp)
wn_recall_score = _safe_ratio(wn_tp, wn_tp + wn_fn)
wn_f1_score = _safe_ratio(2 * (wn_precision_score * wn_recall_score), wn_precision_score + wn_recall_score)

print("=== Sentiment WordNet ===\n")
print("TruePositive: %d" % wn_tp)
print("TrueNegative: %d" % wn_tn)
print("FalsePositive: %d" % wn_fp)
print("FalseNegative: %d\n" % wn_fn)
print("Accuracy Score: %f" % wn_accuracy_score)
print("Precision Score: %f" % wn_precision_score)
print("Recall Score: %f" % wn_recall_score)
print("F1 Score: %f\n" % wn_f1_score)
# predicted-class counts over all test reviews (neutral ones included)
print("Positive: %d" % wn_pos)
print("Negative: %d" % wn_neg)
print("Neutral: %d" % wn_neu)

print("Removed neutral [%d] reviews" % cnt_neutral)
print("# of Test -> %d" % num_of_test)
test_reviews_tokens = list(test_reviews)

In [None]:
# training and validation set: every labelled row after the test slice
train_val_indices = range(TEST_SET_END_INDEX, TRAIN_AND_VALIDATION_END_INDEX)

labels = [df['label'][index] for index in train_val_indices]
reviews = [df['cleaned_review'][index] for index in train_val_indices]
num_of_train_and_validation = len(train_val_indices)

print("# of Train and Validation -> %d" % num_of_train_and_validation)
reviews_tokens = list(reviews)

In [None]:
# manual split data into training set and test set (no shuffling; the slices are fixed by row position)
x_train, y_train = reviews_tokens, labels
x_test, y_test = test_reviews_tokens, test_labels

In [None]:
# encoding label using LabelEncoder()
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
# Reuse the mapping learned from the training labels. Re-fitting on the test
# labels (as before) could assign different codes when the two label sets differ.
y_test = encoder.transform(y_test)

MIN_FEATURE = 100 # inclusive
MAX_FEATURE = 7100 # exclusive
FEATURE_INTERVAL = 100
# feature count at which the confusion matrix of every model is printed
CONFUSION_REPORT_N_FEATURES = 500

training_methods = []

# template of the per-model score lists; each list holds (n_features, score) pairs
metrics = {"performance": [], "accuracy": [], "cross_val": [], "f1": [], "precision": [], "recall": []}

# create our SVM classifier with the class LinearSVC
from sklearn.svm import LinearSVC
training_methods.append((LinearSVC(), "linear svc", copy.deepcopy(metrics)))

# create our Bayes classifier with the BernoulliNB
# NOTE(review): binarize=None feeds the TF-IDF weights to BernoulliNB without
# thresholding them to 0/1 - confirm this is intended.
from sklearn.naive_bayes import BernoulliNB
training_methods.append((BernoulliNB(binarize=None), "bayes with bernoulliNB", copy.deepcopy(metrics)))

# create our Bayes classifier with the MultinomialNB
from sklearn.naive_bayes import MultinomialNB
training_methods.append((MultinomialNB(), "bayes with multinomialNB", copy.deepcopy(metrics)))

for n_features in range(MIN_FEATURE, MAX_FEATURE, FEATURE_INTERVAL):
    # The vocabulary is learned from the training reviews only and re-used for the test reviews.
    tfidf_vect = TfidfVectorizer(max_features=n_features)

    _x_train = tfidf_vect.fit_transform(x_train)
    _x_test = tfidf_vect.transform(x_test)

    # `model_metrics` (not `metrics`) so the template dict above is not shadowed.
    for tm, name, model_metrics in training_methods:
        tm.fit(_x_train, y_train)

        prediction = tm.predict(_x_test)

        if n_features == CONFUSION_REPORT_N_FEATURES:
            print("Model [%s] at [%d] features ->\n" % (name, n_features))
            tn, fp, fn, tp = confusion_matrix(y_test, prediction).ravel()
            print("TruePositive: %d" % tp)
            print("TrueNegative: %d" % tn)
            print("FalsePositive: %d" % fp)
            print("FalseNegative: %d\n" % fn)

        _performance_score = tm.score(_x_test, y_test)
        _accuracy_score = accuracy_score(y_test, prediction)
        _f1_score = f1_score(y_test, prediction)
        _precision_score = precision_score(y_test, prediction)
        _recall_score = recall_score(y_test, prediction)

        model_metrics["performance"].append((n_features, _performance_score))
        model_metrics["accuracy"].append((n_features, _accuracy_score))
        model_metrics["f1"].append((n_features, _f1_score))
        model_metrics["precision"].append((n_features, _precision_score))
        model_metrics["recall"].append((n_features, _recall_score))

In [None]:
# draw the diagram: turn each model's (n_features, score) pairs into one DataFrame per metric
def _score_frames(metric_key):
    """Return [(model name, DataFrame[n_features, <metric_key>_score]), ...] for every trained model."""
    return [
        (model_name, pd.DataFrame(model_scores[metric_key], columns=['n_features', metric_key + '_score']))
        for _, model_name, model_scores in training_methods
    ]


nfeatures_performance = _score_frames('performance')
nfeatures_accuracy = _score_frames('accuracy')
nfeatures_f1 = _score_frames('f1')
nfeatures_precision = _score_frames('precision')
nfeatures_recall = _score_frames('recall')

In [None]:
# performance diagram
plt.figure(figsize=(8,6))

title = "Performance Score Comparison between"

# The loop variable is `scores_df`, not `df`, so the labelled review DataFrame
# used by the earlier cells is not overwritten.
for name, scores_df in nfeatures_performance:
    plt.plot(scores_df.n_features, scores_df.performance_score, label=name)
    title += " %s |" % name
plt.title(title)
plt.xlabel("Number of features")
plt.ylabel("Performance Score")
plt.legend()

print("=== Performance Score ===\n")
for name, scores_df in nfeatures_performance:
    # idxmax returns the first row holding the maximum, i.e. the smallest feature count on ties.
    best_row = scores_df.loc[scores_df['performance_score'].idxmax()]
    max_performance_score = best_row['performance_score']
    max_performance_score_feature = best_row['n_features']

    print("Model Name [%s] has max performance score [%f] at [%d] # of features" % (name, max_performance_score, max_performance_score_feature))

In [None]:
# accuracy diagram
plt.figure(figsize=(8,6))

title = "Accuracy Score Comparison between"

# The loop variable is `scores_df`, not `df`, so the labelled review DataFrame
# used by the earlier cells is not overwritten.
for name, scores_df in nfeatures_accuracy:
    plt.plot(scores_df.n_features, scores_df.accuracy_score, label=name)
    title += " %s |" % name
plt.title(title)
plt.xlabel("Number of features")
plt.ylabel("Accuracy Score")
plt.legend()

print("=== Accuracy Score ===\n")
for name, scores_df in nfeatures_accuracy:
    # idxmax returns the first row holding the maximum, i.e. the smallest feature count on ties.
    best_row = scores_df.loc[scores_df['accuracy_score'].idxmax()]
    max_accuracy_score = best_row['accuracy_score']
    max_accuracy_score_feature = best_row['n_features']

    print("Model Name [%s] has max accuracy score [%f] at [%d] # of features" % (name, max_accuracy_score, max_accuracy_score_feature))

In [None]:
# f1 diagram
plt.figure(figsize=(8,6))

title = "F1 Score Comparison between"

# The loop variable is `scores_df`, not `df`, so the labelled review DataFrame
# used by the earlier cells is not overwritten.
for name, scores_df in nfeatures_f1:
    plt.plot(scores_df.n_features, scores_df.f1_score, label=name)
    title += " %s |" % name
plt.title(title)
plt.xlabel("Number of features")
plt.ylabel("F1 Score")
plt.legend()

print("=== F1 Score ===\n")
for name, scores_df in nfeatures_f1:
    # idxmax returns the first row holding the maximum, i.e. the smallest feature count on ties.
    best_row = scores_df.loc[scores_df['f1_score'].idxmax()]
    max_f1_score = best_row['f1_score']
    max_f1_score_feature = best_row['n_features']

    print("Model Name [%s] has max F1 score [%f] at [%d] # of features" % (name, max_f1_score, max_f1_score_feature))

In [None]:
# precision diagram
plt.figure(figsize=(8,6))

title = "Precision Score Comparison between"

# The loop variable is `scores_df`, not `df`, so the labelled review DataFrame
# used by the earlier cells is not overwritten.
for name, scores_df in nfeatures_precision:
    plt.plot(scores_df.n_features, scores_df.precision_score, label=name)
    title += " %s |" % name
plt.title(title)
plt.xlabel("Number of features")
plt.ylabel("Precision Score")
plt.legend()

print("=== Precision Score ===\n")
for name, scores_df in nfeatures_precision:
    # idxmax returns the first row holding the maximum, i.e. the smallest feature count on ties.
    best_row = scores_df.loc[scores_df['precision_score'].idxmax()]
    max_precision_score = best_row['precision_score']
    max_precision_score_feature = best_row['n_features']

    print("Model Name [%s] has max Precision score [%f] at [%d] # of features" % (name, max_precision_score, max_precision_score_feature))

In [None]:
# recall diagram
plt.figure(figsize=(8,6))

title = "Recall Score Comparison between"

# The loop variable is `scores_df`, not `df`, so the labelled review DataFrame
# used by the earlier cells is not overwritten.
for name, scores_df in nfeatures_recall:
    plt.plot(scores_df.n_features, scores_df.recall_score, label=name)
    title += " %s |" % name
plt.title(title)
plt.xlabel("Number of features")
plt.ylabel("Recall Score")
plt.legend()

print("=== Recall Score ===\n")
for name, scores_df in nfeatures_recall:
    # idxmax returns the first row holding the maximum, i.e. the smallest feature count on ties.
    best_row = scores_df.loc[scores_df['recall_score'].idxmax()]
    max_recall_score = best_row['recall_score']
    max_recall_score_feature = best_row['n_features']

    print("Model Name [%s] has max Recall score [%f] at [%d] # of features" % (name, max_recall_score, max_recall_score_feature))

In [None]:
# precision, recall, f1, accuracy for all methods

# Feature count whose scores are reported for the trained models. Rows are looked
# up by this value rather than by the positional index 4, so the table stays
# correct if the feature grid changes.
REPORT_N_FEATURES = 500


def _score_at(frames, model_index, column, n_features=REPORT_N_FEATURES):
    """Return `column` of the `model_index`-th (name, DataFrame) pair at the row with `n_features`."""
    frame = frames[model_index][1]
    return frame.loc[frame['n_features'] == n_features, column].iloc[0]


# model order follows training_methods: 0 = linear svc, 1 = BernoulliNB, 2 = MultinomialNB
lsvc_accuracy_score = _score_at(nfeatures_accuracy, 0, 'accuracy_score')
lsvc_f1_score = _score_at(nfeatures_f1, 0, 'f1_score')
lsvc_precision_score = _score_at(nfeatures_precision, 0, 'precision_score')
lsvc_recall_score = _score_at(nfeatures_recall, 0, 'recall_score')

bcbn_accuracy_score = _score_at(nfeatures_accuracy, 1, 'accuracy_score')
bcbn_f1_score = _score_at(nfeatures_f1, 1, 'f1_score')
bcbn_precision_score = _score_at(nfeatures_precision, 1, 'precision_score')
bcbn_recall_score = _score_at(nfeatures_recall, 1, 'recall_score')

bcmn_accuracy_score = _score_at(nfeatures_accuracy, 2, 'accuracy_score')
bcmn_f1_score = _score_at(nfeatures_f1, 2, 'f1_score')
bcmn_precision_score = _score_at(nfeatures_precision, 2, 'precision_score')
bcmn_recall_score = _score_at(nfeatures_recall, 2, 'recall_score')

val1 = ["Accuracy_Score", "F1_Score", "Precision_Score", "Recall_Score"]
val2 = ["Vader", "Sentiment_WordNet", "Linear_SVC", "Naive_Bayes_with_BernoulliNB", "Naive_Bayes_with_MultinomialNB"]
# one row per entry of val2, one value per entry of val1 (same order)
val3 = [[vader_accuracy_score, vader_f1_score, vader_precision_score, vader_recall_score],
        [wn_accuracy_score, wn_f1_score, wn_precision_score, wn_recall_score],
        [lsvc_accuracy_score, lsvc_f1_score, lsvc_precision_score, lsvc_recall_score],
        [bcbn_accuracy_score, bcbn_f1_score, bcbn_precision_score, bcbn_recall_score],
        [bcmn_accuracy_score, bcmn_f1_score, bcmn_precision_score, bcmn_recall_score]]

fig, ax = plt.subplots()
ax.set_axis_off()
table = ax.table(
    cellText = val3,
    rowLabels = val2,
    colLabels = val1,
    rowColours = ["palegreen"] * len(val2),
    colColours = ["palegreen"] * len(val1),
    cellLoc = 'center',
    loc = 'upper left')

table.set_fontsize(14)
table.scale(3.5,3.5)

plt.show()