In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sentence_transformers
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from tqdm import trange
import random

In [2]:
fields = ['TYPE_EVENT', 'REFERENCE']
df = pd.read_csv('eswd.csv', usecols=fields)


In [3]:
df = df.dropna(subset=['REFERENCE'])  # we are at 12871 rows now
df.drop_duplicates(inplace=True) 
df.to_csv('eswd_two.csv', index=False)

In [4]:
df = df[~df['REFERENCE'].str.contains("Report via Kachelmannwetter.com")]

In [5]:
#from reference column drop the dates, so all years(2020, 2021, 2022 and so on), all days(1-31) and all names of months(jan, feb, mar and so on)
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b\d+\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bjan\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bfeb\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bmar\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bapr\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bmay\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bjun\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bjul\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\baug\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bsep\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\boct\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bnov\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bdec\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b2020\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b2021\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b2022\b', '')
df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b2023\b', '')


  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b\d+\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bjan\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bfeb\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bmar\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bapr\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bmay\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bjun\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bjul\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\baug\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bsep\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\boct\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bnov\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\bdec\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b2020\b', '')
  df['REFERENCE'] = df['REFERENCE'].str.replace(r'\b2021\b', '')
  df['REFERENCE'] = df['REFERENCE'].st

In [6]:
df['TYPE_EVENT'].value_counts()

WIND         1113
HAIL          907
LIGHTNING     425
PRECIP        343
TORNADO       134
AVALANCHE      71
DEVIL          12
SNOW            7
Name: TYPE_EVENT, dtype: int64

SBERT sentence tranformation: 

In [7]:
#print out first 10 references
df['REFERENCE'].head(10)

0    "Haute-Alpes : un skieur de  ans décède dans u...
1    "Un skieur de randonnée meurt, emporté par une...
2    "Alpes-de-Haute-Provence : deux randonneurs en...
3    "La foudre provoque un incendie, une maison fo...
4    "Orage dans le Gard : la foudre s'est abattue ...
5    "Un gros coup de vent fait des dégâts au villa...
6              "Tornade dans le Finistère", TF1,  NOV 
7    "Bouillé-Ménard. Une "mini-tornade" traverse l...
8    "À Ruffiac, ils évacuent leur maison en feu su...
9    "TÉMOIGNAGE. « Si le voisin n’avait pas vu cet...
Name: REFERENCE, dtype: object

In [8]:
#remove punctuation
df['REFERENCE'] = df['REFERENCE'].str.replace('[^\w\s]','')


  df['REFERENCE'] = df['REFERENCE'].str.replace('[^\w\s]','')


In [9]:
model = sentence_transformers.SentenceTransformer('distiluse-base-multilingual-cased-v1')

reference_embeddings = model.encode(df['REFERENCE'].tolist())

df['reference_embeddings'] = list(reference_embeddings)

In [10]:
reference_embeddings = np.array(df['reference_embeddings'].tolist())

references = df['REFERENCE'].tolist()
vectorizer = CountVectorizer()

vectorizer.fit_transform(references)
vocabulary = vectorizer.vocabulary_
word_importance = {}

for i in range(reference_embeddings.shape[1]):

    embedding_values = reference_embeddings[:, i]
    correlation = np.corrcoef(embedding_values, vectorizer.transform(references).toarray()[:, i])[0, 1]
    word = list(vocabulary.keys())[list(vocabulary.values()).index(i)]
    word_importance[word] = correlation
sorted_word_importance = {k: v for k, v in sorted(word_importance.items(), key=lambda item: item[1], reverse=True)}

for i, (word, score) in enumerate(sorted_word_importance.items()):
    if i == 30:
        break
    print(f'{i+1}. {word}: {score}')


1. actufr: 0.121628247607912
2. bleu: 0.10577256793296902
3. aug: 0.08567380655630943
4. ardèche: 0.07076007344247497
5. agriculteurs: 0.06882783751174253
6. au: 0.06742183275031724
7. access: 0.06272687231517135
8. annoncés: 0.06262826120251054
9. auvergne: 0.05737267135344401
10. agricoles: 0.05635128055189873
11. allait: 0.05594562947409327
12. arbre: 0.05550611444205244
13. actualité: 0.05416973160975915
14. alpesmaritimes: 0.05202047986401707
15. alain: 0.050618503977319394
16. arrachées: 0.049936991775451134
17. brusque: 0.04796479665550098
18. cambrai: 0.04657638149309971
19. an: 0.04654309437048608
20. catastrophes: 0.04566765616992503
21. aux: 0.043636620428391636
22. aréca: 0.04333469437959409
23. avignonnaise: 0.04297394162154565
24. bfm: 0.042830353592631036
25. bâtiments: 0.04234264663615913
26. avalanches: 0.04172740509950282
27. actuorangefr: 0.04125712602084873
28. averse: 0.04027559972597995
29. broyés: 0.03987352425625379
30. author: 0.03982339896808097


In [11]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

Using Logistic Regression to predict

In [14]:
from sklearn.metrics import f1_score, accuracy_score, make_scorer
model = sklearn.linear_model.LogisticRegression(solver='newton-cg', multi_class='multinomial',max_iter=1000, random_state=42)
model.fit(df_train['reference_embeddings'].tolist(), df_train['TYPE_EVENT'].tolist())
y_pred = model.predict(df_test['reference_embeddings'].tolist())

print(sklearn.metrics.accuracy_score(df_test['TYPE_EVENT'].tolist(), y_pred))
print(sklearn.metrics.f1_score(df_test['TYPE_EVENT'].tolist(), y_pred, average='weighted'))

0.7976782752902156
0.7839542730101993


In [55]:
#build a confusion matrix
def build_confusion_matrix(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.linear_model.LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    class_labels = model.classes_
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
    
    print(sklearn.metrics.classification_report(y_test, y_pred))
    print(cm)
    return cm, class_labels

In [56]:
def confusion_matrix_to_latex(cm, class_names):
    num_classes = cm.shape[0]
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "c" * num_classes + "}\n"
    
    # Add header row with class names' first letters
    header_row = " & ".join([name[0] for name in class_names])
    latex_table += f" & {header_row} \\\\\n\\hline\n"
    
    for i, row in enumerate(cm):
        row_text = " & ".join(map(str, row))
        class_name = class_names[i]
        latex_table += f"{class_name} & {row_text} \\\\\n"
    
    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

In [57]:


cm_en, class_labels_en = build_confusion_matrix(df_train, df_test, 'reference_embeddings')

# Convert the confusion matrix to LaTeX table format
latex_table = confusion_matrix_to_latex(cm_en, class_labels_en)
save_latex_table(latex_table, "confusion_matrix_sbert_lr.tex")

              precision    recall  f1-score   support

   AVALANCHE       1.00      0.92      0.96        13
       DEVIL       0.00      0.00      0.00         3
        HAIL       0.87      0.89      0.88       198
   LIGHTNING       0.88      0.89      0.89        85
      PRECIP       0.54      0.42      0.47        65
        SNOW       0.00      0.00      0.00         1
     TORNADO       0.88      0.32      0.47        22
        WIND       0.77      0.87      0.82       216

    accuracy                           0.81       603
   macro avg       0.62      0.54      0.56       603
weighted avg       0.80      0.81      0.79       603

[[ 12   0   0   0   0   0   0   1]
 [  0   0   0   1   1   0   0   1]
 [  0   0 176   1   4   0   0  17]
 [  0   0   2  76   2   0   1   4]
 [  0   0  12   3  27   0   0  23]
 [  0   0   1   0   0   0   0   0]
 [  0   0   3   0   2   0   7  10]
 [  0   0   9   5  14   0   0 188]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
#print the misclassified events, the actual and the predicted and the reference
def print_misclassified_events(df_test, vector, model):
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    y_pred = model.predict(X_test)
    misclassified = np.where(y_test != y_pred)
    for i in misclassified[0]:
        print('Actual: ', y_test.iloc[i], ' Predicted: ', y_pred[i], ' Reference: ', df_test.iloc[i]['REFERENCE'])

print_misclassified_events(df_test, 'reference_embeddings', model)

Actual:  DEVIL  Predicted:  WIND  Reference:  "Lacanau : quel est ce tourbillon qui a fait s’envoler tous les parasols de la plage ?", Sud Ouest,  Jul .
Actual:  LIGHTNING  Predicted:  WIND  Reference:  "Erdre-en-Anjou. La foudre a encore frappé rue des Rochers", OUEST FRANCE,  MAY ;
Actual:  WIND  Predicted:  LIGHTNING  Reference:  "Un supermarché frappé par la foudre et inondé à Thyez, une cinquantaine d’interventions pour les pompiers ", LE DAUPHINÉ,  AUG 
Actual:  WIND  Predicted:  PRECIP  Reference:  "Heuilley-sur-Saône : des toits et des portails endommagés", Le Bien Public,  Jun .
Actual:  WIND  Predicted:  PRECIP  Reference:  'Bilan des orages dans le Calvados :  interventions des pompiers", Ouest-France,  Aug .
Actual:  HAIL  Predicted:  WIND  Reference:  LA FRANCE AGRICOLE,  JUN 
Actual:  TORNADO  Predicted:  WIND  Reference:  Collectif Chalonnais (on Facebook),  Nov .  "VIDEO. Marne : tornade à Suippes,  foyers sans électricité, le toit de la caserne des pompiers s'est à moi

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer

def tune_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']

    # Define the parameter grid for the Random Forest classifier
    param_grid = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Create the base model
    rf = RandomForestClassifier(random_state=0)

    # Instantiate the GridSearchCV object
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               scoring=make_scorer(f1_score, average='weighted'),
                               cv=3, verbose=1, n_jobs=-1)

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and the corresponding best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best parameters:", best_params)
    print("Best F1 score:", best_score)

    # Train the model with the best parameters on the whole dataset
    best_rf = RandomForestClassifier(**best_params, random_state=0)
    best_rf.fit(X_train, y_train)

    # Test the model on the test dataset
    y_pred = best_rf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    print('Test accuracy:', accuracy)
    print('Test F1 score:', f1)

    return best_rf

best_model_rf = tune_random_forest(df_train, df_test, 'reference_embeddings')

Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 score: 0.717056317242749
Test accuracy: 0.7810945273631841
Test F1 score: 0.7608966781235433


In [33]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=150, max_depth=9, random_state=0)
# model.fit(df_train['reference_embeddings'].tolist(), df_train['TYPE_EVENT'].tolist())
# y_pred = model.predict(df_test['reference_embeddings'].tolist())

# print(sklearn.metrics.accuracy_score(df_test['TYPE_EVENT'].tolist(), y_pred))

# print(sklearn.metrics.f1_score(df_test['TYPE_EVENT'].tolist(), y_pred, average='weighted'))

0.7346600331674958
0.7003428411256595


Now, we will use bert transformation

In [None]:
possible_labels = df['TYPE_EVENT'].unique()
labels = {}
for index, possible_label in enumerate(possible_labels):
    labels[possible_label] = index
labels

{'AVALANCHE': 0,
 'LIGHTNING': 1,
 'WIND': 2,
 'TORNADO': 3,
 'PRECIP': 4,
 'HAIL': 5,
 'DEVIL': 6,
 'SNOW': 7}

We add a label column to have them in numbers

In [None]:
df['label'] = df['TYPE_EVENT'].replace(labels)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['TYPE_EVENT', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,REFERENCE,reference_embeddings
TYPE_EVENT,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
AVALANCHE,0,train,60,60
AVALANCHE,0,val,11,11
DEVIL,6,train,10,10
DEVIL,6,val,2,2
HAIL,5,train,771,771
HAIL,5,val,136,136
LIGHTNING,1,train,361,361
LIGHTNING,1,val,64,64
PRECIP,4,train,292,292
PRECIP,4,val,51,51


In [73]:
# Create the exportable confusion matrix
def confusion_matrix_to_latex(cm, class_names):
    num_classes = cm.shape[0]
    latex_table = "\\begin{table}[!htbp]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "c" * num_classes + "}\n"
    
    # Add header row with class names
    header_row = " & ".join(class_names)
    latex_table += f" & {header_row} \\\\\n\\hline\n"
    
    for i, row in enumerate(cm):
        row_text = " & ".join(map(str, row))
        class_name = class_names[i]
        latex_table += f"{class_name} & {row_text} \\\\\n"
    
    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

# Build the confusion matrix for the Random Forest model
def build_confusion_matrix_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split=5, min_samples_leaf=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
    print(sklearn.metrics.classification_report(y_test, y_pred))
    return cm

def build_and_export_confusion_matrix_rf(df_train, df_test, vector, class_names, filename):
    cm = build_confusion_matrix_random_forest(df_train, df_test, vector)
    latex_table = confusion_matrix_to_latex(cm, class_names)
    save_latex_table(latex_table, filename)

class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
build_and_export_confusion_matrix_rf(df_train, df_test, 'reference_embeddings', class_names, "confusion_matrix_sbert_rf.tex")


              precision    recall  f1-score   support

   AVALANCHE       1.00      0.85      0.92        13
       DEVIL       0.00      0.00      0.00         3
        HAIL       0.81      0.88      0.84       198
   LIGHTNING       0.83      0.86      0.84        85
      PRECIP       0.50      0.22      0.30        65
        SNOW       0.00      0.00      0.00         1
     TORNADO       1.00      0.18      0.31        22
        WIND       0.72      0.86      0.78       216

    accuracy                           0.76       603
   macro avg       0.61      0.48      0.50       603
weighted avg       0.75      0.76      0.74       603



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


freeze the weights of all except last one (bert)

In [74]:
def print_misclassified_events(df_test, vector, model):
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    y_pred = model.predict(X_test)
    misclassified = np.where(y_test != y_pred)
    for i in misclassified[0]:
        print('Actual: ', y_test.iloc[i], ' Predicted: ', y_pred[i], ' Reference: ', df_test.iloc[i]['REFERENCE'])

In [75]:
print_misclassified_events(df_test, 'reference_embeddings', best_model_rf)

Actual:  HAIL  Predicted:  PRECIP  Reference:  Orages de grêle très localisés dans lYonne  dimportants dégâts dans les cultures autour de PontsurYonne et  clients privés délectricité en Puisaye LYONNE RÉPUBLICAINE  JUN  RAD
Actual:  DEVIL  Predicted:  WIND  Reference:  Lacanau  quel est ce tourbillon qui a fait senvoler tous les parasols de la plage  Sud Ouest  Jul 
Actual:  WIND  Predicted:  HAIL  Reference:  Intempéries Record  une rafale de vent à  kmh enregistrée à Montpellier Actufr  June 
Actual:  WIND  Predicted:  PRECIP  Reference:  Un supermarché frappé par la foudre et inondé à Thyez une cinquantaine dinterventions pour les pompiers  LE DAUPHINÉ  AUG 
Actual:  TORNADO  Predicted:  WIND  Reference:  StevenTual_off twitter  NOV    Impressionnant    une trombe marine observée à Penmarch Vidéos Le Telegramme  NOV   Une trombe marine se forme au large du Finistère en ce jour de Toussaint OuestFrance  NOV   Trombe marine sur la côte du Finistère le 1er novembre Observatoire Kerauno

tokenization for each word u have a vector
i only need the cls vector (usually the first one)
sbert first, as you give the sentence and it gives you a vector that represents the vector
with sbert i use logistic regression, random forests 
with bert we use bert classifier


LSTM classification and neural network
feature extraction to get the most important words and create a binary classification of the extreme or not extreme weathers

In [89]:
#use keras to build a neural network for multi-class classification
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import RMSprop, Adam
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

def build_neural_network(df_train, df_test, embeddings_column):
    X_train = np.array(df_train[embeddings_column].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[embeddings_column].tolist())
    y_test = df_test['TYPE_EVENT']

    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                        batch_size=128,
                        epochs=5,
                        verbose=1,
                        validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    return model

# Call the function with the SBERT embeddings
model = build_neural_network(df_train, df_test, 'reference_embeddings')


Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_36 (Dense)            (None, 512)               262656    
                                                                 
 dropout_22 (Dropout)        (None, 512)               0         
                                                                 
 dense_37 (Dense)            (None, 512)               262656    
                                                                 
 dropout_23 (Dropout)        (None, 512)               0         
                                                                 
 dense_38 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 529,416
Trainable params: 529,416
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5

In [87]:
#build a confusion matrix for the neural network
def build_confusion_matrix_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                    batch_size=128,
                    epochs=5,
                    verbose=1,
                    validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test_one_hot.argmax(axis=1), y_pred.argmax(axis=1))
    print(sklearn.metrics.classification_report(y_test_one_hot.argmax(axis=1), y_pred.argmax(axis=1)))
    return cm

In [88]:
cm_neural = build_confusion_matrix_neural_network(df_train, df_test, 'reference_embeddings')

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_33 (Dense)            (None, 512)               262656    
                                                                 
 dropout_20 (Dropout)        (None, 512)               0         
                                                                 
 dense_34 (Dense)            (None, 512)               262656    
                                                                 
 dropout_21 (Dropout)        (None, 512)               0         
                                                                 
 dense_35 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 529,416
Trainable params: 529,416
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
def confusion_matrix_to_latex(cm, class_names):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + ' '.join(['c' for _ in class_names]) + "}\n"
    latex_table += "\\multicolumn{1}{c}{} & \\multicolumn{" + str(len(class_names)) + "}{c}{Predicted Class} \\\\\n"
    latex_table += "Actual Class & " + ' & '.join(class_names) + " \\\\\n\\hline\n"

    for i, row in enumerate(cm):
        row_text = f"{class_names[i]} & " + ' & '.join([str(cell) for cell in row])
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, 'w') as f:
        f.write(latex_table)

In [91]:
class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
latex_table = confusion_matrix_to_latex(cm_neural, class_names)
save_latex_table(latex_table, 'cm_neural_sbert.tex')

In [94]:
def print_incorrect_predictions_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                        batch_size=128,
                        epochs=5,
                        verbose=1,
                        validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test)
    y_pred = y_pred.argmax(axis=1)
    y_test = y_test_one_hot.argmax(axis=1)

    for i in range(len(y_pred)):
        if y_pred[i] != y_test[i]:
            true_class = label_encoder.inverse_transform([y_test[i]])[0]
            pred_class = label_encoder.inverse_transform([y_pred[i]])[0]
            print('True prediction:', true_class, 'Predicted prediction:', pred_class, 'Reference:', df_test['REFERENCE'].iloc[i])

In [95]:
print_incorrect_predictions_neural_network(df_train, df_test, 'reference_embeddings')


Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 512)               262656    
                                                                 
 dropout_26 (Dropout)        (None, 512)               0         
                                                                 
 dense_43 (Dense)            (None, 512)               262656    
                                                                 
 dropout_27 (Dropout)        (None, 512)               0         
                                                                 
 dense_44 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 529,416
Trainable params: 529,416
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5