In [105]:
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import re
import spacy
from spacy_language_detection import LanguageDetector
import math
import spacy_fastlang
import sklearn
import sklearn.linear_model
import sklearn.metrics
from sklearn import feature_selection
from sklearn.feature_extraction.text import TfidfVectorizer

### BAG OF WORDS METHOD ON SPACY PROCESSED DATA 

In [106]:
#import df_en and df_fr from eswd_en and eswd_fr
df_en = pd.read_csv('eswd_en.csv')
df_fr = pd.read_csv('eswd_fr.csv')


In [107]:
print(df_fr.shape)

(2526, 3)


In [108]:
# Custom function to check the conditions
def filter_references(reference):
    lower_reference = reference.lower()
    if 'facebook' in lower_reference or 'twitter' in lower_reference:
        return len(reference.split()) >= 8
    return True

# Apply the custom function and filter the DataFrame
df_en = df_en[df_en['REFERENCE'].apply(filter_references)]
print(df_en.shape)
df_fr = df_fr[df_fr['REFERENCE'].apply(filter_references)]
print(df_fr.shape)

(199, 3)
(2357, 3)


In [109]:
#put all unique words from 'REFERENCE' into a list
unique_words = []
for i in df_fr['REFERENCE']:
    for word in i.split():
        if word not in unique_words:
            unique_words.append(word)

print(len(unique_words))






3517


In [110]:

df_en['REFERENCE'] = df_en['REFERENCE'].str.replace('kachelmannwetter', '')
df_en['REFERENCE'] = df_en['REFERENCE'].str.replace('wosnica', '')
df_en['REFERENCE'] = df_en['REFERENCE'].str.replace('woznica', '')
df_en['REFERENCE'] = df_en['REFERENCE'].str.replace('rtl', '')
df_en['REFERENCE'] = df_en['REFERENCE'].str.replace('meteo', '')
df_fr['REFERENCE'] = df_fr['REFERENCE'].str.replace('kachelmannwetter', '')
df_fr['REFERENCE'] = df_fr['REFERENCE'].str.replace('facebook', '')
df_fr['REFERENCE'] = df_fr['REFERENCE'].str.replace('twitter', '')
df_fr['REFERENCE'] = df_fr['REFERENCE'].str.replace('eyewitness', '')


In [111]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect_en = CountVectorizer(max_features=30000)
count_vect_fr = CountVectorizer(max_features=30000)

bow_en = count_vect_en.fit_transform(df_en['REFERENCE'])
bow_fr = count_vect_fr.fit_transform(df_fr['REFERENCE'])

bow_en = bow_en.toarray()
bow_fr = bow_fr.toarray()

In [112]:
#add the bow_fr to df_fr
df_fr['bow_fr'] = bow_fr.tolist()
df_en['bow_en'] = bow_en.tolist()

In [113]:
def build_model(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.linear_model.LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
    accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
    #mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
    print('Accuracy: ', accuracy)
    print('F1 score: ', f1)
    #print('MSE: ', mse)
    return model


In [114]:
#split the data into train and test
df_train_en, df_test_en = train_test_split(df_en, test_size=0.2, random_state=42)
df_train_fr, df_test_fr = train_test_split(df_fr, test_size=0.2, random_state=42)

english_bow_lr = build_model(df_train_en, df_test_en, 'bow_en')
french_bow_lr = build_model(df_train_fr, df_test_fr, 'bow_fr')


Accuracy:  0.75
F1 score:  0.7182978723404255
Accuracy:  0.8305084745762712
F1 score:  0.8274706793326186


In [115]:
def top_features_table(weights, feature_names, class_names, top_n=10):
    df = pd.DataFrame(data=weights.T, columns=class_names, index=feature_names)
    
    top_features = {}
    for class_name in class_names:
        df_sorted = df.sort_values(class_name, ascending=False)
        top_features[class_name] = df_sorted.iloc[:top_n, :].index.tolist()
    
    return top_features

def top_features_to_latex(top_features, class_names, top_n=10):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "l" * len(class_names) + "}\n"

    
    header_row = " & ".join(class_names)
    latex_table += f" & {header_row} \\\\\n\\hline\n"

    for i in range(top_n):
        row = [f"{i+1}"]
        for class_name in class_names:
            row.append(top_features[class_name][i])
        row_text = " & ".join(row)
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

weights = english_bow_lr.coef_
feat_names_en = count_vect_en.get_feature_names()
class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'TORNADO', 'WIND']
top_features = top_features_table(weights, feat_names_en, class_names)
latex_table = top_features_to_latex(top_features, class_names)
save_latex_table(latex_table, "top_features_en_lrbow.tex")

In [116]:

weights = french_bow_lr.coef_
feat_names_fr = count_vect_fr.get_feature_names()
class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
top_features = top_features_table(weights, feat_names_fr, class_names)
latex_table = top_features_to_latex(top_features, class_names)
save_latex_table(latex_table, "top_features_fr_lrbow.tex")

In [117]:
class_labels = french_bow_lr.classes_
for index, label in enumerate(class_labels):
    print(f"Class {index}: {label}")

Class 0: AVALANCHE
Class 1: DEVIL
Class 2: HAIL
Class 3: LIGHTNING
Class 4: PRECIP
Class 5: SNOW
Class 6: TORNADO
Class 7: WIND


In [118]:
feat_names_en = count_vect_en.get_feature_names()
weights = english_bow_lr.coef_
df = pd.DataFrame(data=weights.T, columns=['Class_' + str(i) for i in range(weights.shape[0])], index=feat_names_en)
class_index = 0 # Change this to the index of the class you are interested in
df_sorted = df.sort_values(f"Class_{class_index}", ascending=False)
top_10_features = df_sorted.iloc[:10, class_index]
print(top_10_features)

jan         1.357286e+00
2021        8.631052e-01
27          5.356843e-01
france      4.994810e-01
rtl         3.430796e-01
bfmtv       3.210738e-01
25          3.199947e-01
26          2.874483e-01
fr          2.476900e-01
touraine    8.993755e-09
Name: Class_0, dtype: float64


In [119]:
#build a confusion matrix
def build_confusion_matrix(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.linear_model.LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    class_labels = model.classes_
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
    
    print(sklearn.metrics.classification_report(y_test, y_pred))
    print(cm)
    return cm, class_labels


In [120]:
def confusion_matrix_to_latex(cm, class_names):
    num_classes = cm.shape[0]
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "c" * num_classes + "}\n"
    
    # Add header row with class names' first letters
    header_row = " & ".join([name[0] for name in class_names])
    latex_table += f" & {header_row} \\\\\n\\hline\n"
    
    for i, row in enumerate(cm):
        row_text = " & ".join(map(str, row))
        class_name = class_names[i]
        latex_table += f"{class_name} & {row_text} \\\\\n"
    
    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

In [121]:

#export the confusion matrix to tex file
cm_en = build_confusion_matrix(df_train_en, df_test_en, 'bow_en')
cm_fr = build_confusion_matrix(df_train_fr, df_test_fr, 'bow_fr')
# Save the LaTeX table to a file

cm_en, class_labels_en = build_confusion_matrix(df_train_en, df_test_en, 'bow_en')

# Convert the confusion matrix to LaTeX table format
latex_table = confusion_matrix_to_latex(cm_en, class_labels_en)
save_latex_table(latex_table, "confusion_matrix_en_bowlr.tex")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        HAIL       0.92      0.96      0.94        23
   LIGHTNING       0.00      0.00      0.00         3
      PRECIP       1.00      0.33      0.50         6
     TORNADO       0.00      0.00      0.00         1
        WIND       0.46      0.86      0.60         7

    accuracy                           0.75        40
   macro avg       0.48      0.43      0.41        40
weighted avg       0.76      0.75      0.72        40

[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0 22  0  0  0  1]
 [ 0  0  2  0  0  0  1]
 [ 0  0  0  0  2  0  4]
 [ 0  0  0  0  0  0  1]
 [ 0  0  0  0  0  1  6]]
              precision    recall  f1-score   support

   AVALANCHE       1.00      1.00      1.00        14
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.84      0.89      0.87       123
   LIGHTNING       0.87      0.83      0.85        90
      PRECIP       0.51      0.54      0.52        52
        SNOW     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
cm_fr, class_labels_fr = build_confusion_matrix(df_train_fr, df_test_fr, 'bow_fr')

# Convert the confusion matrix to LaTeX table format
latex_table = confusion_matrix_to_latex(cm_fr, class_labels_fr)
save_latex_table(latex_table, "confusion_matrix_fr_bowlr.tex")

              precision    recall  f1-score   support

   AVALANCHE       1.00      1.00      1.00        14
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.84      0.89      0.87       123
   LIGHTNING       0.87      0.83      0.85        90
      PRECIP       0.51      0.54      0.52        52
        SNOW       0.00      0.00      0.00         1
     TORNADO       0.86      0.50      0.63        12
        WIND       0.88      0.90      0.89       178

    accuracy                           0.83       472
   macro avg       0.62      0.58      0.60       472
weighted avg       0.83      0.83      0.83       472

[[ 14   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   1]
 [  0   0 109   0  13   0   0   1]
 [  0   0   3  75   8   0   0   4]
 [  0   0   8   4  28   0   0  12]
 [  0   0   0   1   0   0   0   0]
 [  0   0   1   0   2   0   6   3]
 [  0   0   8   6   4   0   0 160]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [123]:
#print the misclassified events, the actual and the predicted and the reference
def print_misclassified_events(df_test, vector, model):
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    y_pred = model.predict(X_test)
    misclassified = np.where(y_test != y_pred)
    for i in misclassified[0]:
        print('Actual: ', y_test.iloc[i], ' Predicted: ', y_pred[i], ' Reference: ', df_test.iloc[i]['REFERENCE'])

In [124]:
print_misclassified_events(df_test_en, 'bow_en', english_bow_lr)


Actual:  PRECIP  Predicted:  WIND  Reference:  ['12/13', 'France', '3', 'Hauts', 'de', 'France', '18', 'April', '2020']
Actual:  LIGHTNING  Predicted:  HAIL  Reference:  ['Nicolas', 'Suanez', 'Facebook', '18', 'June', '2022', 'Nicolas', 'Suanez', 'Facebook', '19', 'June', '2022']
Actual:  PRECIP  Predicted:  WIND  Reference:  ['bettancourt', 'la', 'ferrée', 'durement', 'touchée', 'jhm', '10', 'APR', '2022']
Actual:  WIND  Predicted:  TORNADO  Reference:  ['Gianni', 'Timachoff', 'Facebook', '25', '10', '2020', 'sotiz', 'K', 'tastrof', 'Facebook', '25', '10', '2020']
Actual:  LIGHTNING  Predicted:  WIND  Reference:  ['Ker', 'Nunnos', 'Facebook', '01', 'June', '2020', 'restrict', 'access']
Actual:  TORNADO  Predicted:  WIND  Reference:  ['Frederic', 'Franceschi', '15', 'July', '2019', 'Twitter', 'Meteo', 'Villes', '15', 'July', '2019', 'Twitter']
Actual:  PRECIP  Predicted:  WIND  Reference:  ['orage', 'LES', 'IMAGES', 'impressionnante', 'DES', 'INTEMPÉRIES', 'DANS', 'LA', 'RÉGION', 'RADI

In [125]:
print_misclassified_events(df_test_fr, 'bow_fr', french_bow_lr)

Actual:  LIGHTNING  Predicted:  PRECIP  Reference:  ['Orages', 'grêle', 'localiser', 'Yonne', 'important', 'dégât', 'culture', 'autour', 'pont-sur-yonne', '200', 'client', 'priver', 'électricité', 'Puisaye', 'YONNE', 'REPUBLICAINE', '29', 'JUN', '2021', 'RAD']
Actual:  LIGHTNING  Predicted:  WIND  Reference:  ['Orages', 'départ', 'feu', 'maison', 'détruite', 'arbre', 'couché', 'tarn', 'Haute-Garonne', 'tarn-et-garonne', 'FRANCE', 'TV', 'INFO', '30', 'aug', '2022']
Actual:  WIND  Predicted:  PRECIP  Reference:  ['Orages', 'Saint-Etienne', 'Gier', '2618', 'impact', 'foudre', 'relever', 'progrès', '22', 'JUL', '2020']
Actual:  WIND  Predicted:  HAIL  Reference:  ['Orages', 'grêle', 'faire', 'casse', 'auvergne', 'Rhône', 'Alpes', 'France', 'Info', '3', 'Jul', '2019']
Actual:  DEVIL  Predicted:  TORNADO  Reference:  ['saint-sernin', 'BOIS', 'mini-tornade', 'emmèn', 'toiture', 'Creusot', 'Infos', '12', 'april', '2020', 'colère', 'Zeus', 'Facebook', '14', 'april', '2020']
Actual:  TORNADO  Pr

### RANDOM FOREST

In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer

def tune_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']

    # Define the parameter grid for the Random Forest classifier
    param_grid = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Create the base model
    rf = RandomForestClassifier(random_state=0)

    # Instantiate the GridSearchCV object
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               scoring=make_scorer(f1_score, average='weighted'),
                               cv=3, verbose=1, n_jobs=-1)

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and the corresponding best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best parameters:", best_params)
    print("Best F1 score:", best_score)

    # Train the model with the best parameters on the whole dataset
    best_rf = RandomForestClassifier(**best_params, random_state=0)
    best_rf.fit(X_train, y_train)

    # Test the model on the test dataset
    y_pred = best_rf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    print('Test accuracy:', accuracy)
    print('Test F1 score:', f1)

    return best_rf

best_model_rf = tune_random_forest(df_train_fr, df_test_fr, 'bow_fr')

Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best F1 score: 0.8257170279891239
Test accuracy: 0.8432203389830508
Test F1 score: 0.8337177839005806


In [127]:
# #build a random forest model
# from sklearn.ensemble import RandomForestClassifier
# def build_random_forest(df_train, df_test, vector):
#     X_train = np.array(df_train[vector].tolist())
#     y_train = df_train['TYPE_EVENT']
#     X_test = np.array(df_test[vector].tolist())
#     y_test = df_test['TYPE_EVENT']
#     model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     f1 = sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
#     accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
#     print('Accuracy: ', accuracy)
#     print('F1 score: ', f1)
#     return model

In [128]:
# english_bow_rf = build_random_forest(df_train_en, df_test_en, 'bow_en')
# french_bow_rf = build_random_forest(df_train_fr, df_test_fr, 'bow_fr')

In [129]:
# Create the exportable confusion matrix
def confusion_matrix_to_latex(cm, class_names):
    num_classes = cm.shape[0]
    latex_table = "\\begin{table}[!htbp]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "c" * num_classes + "}\n"
    
    # Add header row with class names
    header_row = " & ".join(class_names)
    latex_table += f" & {header_row} \\\\\n\\hline\n"
    
    for i, row in enumerate(cm):
        row_text = " & ".join(map(str, row))
        class_name = class_names[i]
        latex_table += f"{class_name} & {row_text} \\\\\n"
    
    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

# Build the confusion matrix for the Random Forest model
def build_confusion_matrix_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split=5, min_samples_leaf=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
    print(sklearn.metrics.classification_report(y_test, y_pred))
    return cm

def build_and_export_confusion_matrix_rf(df_train, df_test, vector, class_names, filename):
    cm = build_confusion_matrix_random_forest(df_train, df_test, vector)
    latex_table = confusion_matrix_to_latex(cm, class_names)
    save_latex_table(latex_table, filename)

class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
build_and_export_confusion_matrix_rf(df_train_fr, df_test_fr, 'bow_fr', class_names, "confusion_matrix_fr_rf.tex")



              precision    recall  f1-score   support

   AVALANCHE       1.00      1.00      1.00        14
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.83      0.90      0.86       123
   LIGHTNING       0.86      0.86      0.86        90
      PRECIP       0.56      0.44      0.49        52
        SNOW       0.00      0.00      0.00         1
     TORNADO       0.71      0.42      0.53        12
        WIND       0.85      0.89      0.87       178

    accuracy                           0.82       472
   macro avg       0.60      0.56      0.58       472
weighted avg       0.81      0.82      0.82       472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [130]:
build_confusion_matrix_random_forest(df_train_en, df_test_en, 'bow_en')


              precision    recall  f1-score   support

        HAIL       0.88      0.96      0.92        23
   LIGHTNING       0.00      0.00      0.00         3
      PRECIP       0.00      0.00      0.00         6
     TORNADO       0.00      0.00      0.00         1
        WIND       0.47      1.00      0.64         7

    accuracy                           0.73        40
   macro avg       0.27      0.39      0.31        40
weighted avg       0.59      0.72      0.64        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[22,  0,  0,  0,  1],
       [ 2,  0,  0,  0,  1],
       [ 1,  0,  0,  0,  5],
       [ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  7]], dtype=int64)

In [131]:

def print_misclassified_events(df_test, vector, model):
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    y_pred = model.predict(X_test)
    misclassified = np.where(y_test != y_pred)
    for i in misclassified[0]:
        print('Actual: ', y_test.iloc[i], ' Predicted: ', y_pred[i], ' Reference: ', df_test.iloc[i]['REFERENCE'])

In [132]:
print_misclassified_events(df_test_fr, 'bow_fr', best_model_rf)

Actual:  LIGHTNING  Predicted:  PRECIP  Reference:  ['Orages', 'grêle', 'localiser', 'Yonne', 'important', 'dégât', 'culture', 'autour', 'pont-sur-yonne', '200', 'client', 'priver', 'électricité', 'Puisaye', 'YONNE', 'REPUBLICAINE', '29', 'JUN', '2021', 'RAD']
Actual:  WIND  Predicted:  LIGHTNING  Reference:  ['Orages', 'Saint-Etienne', 'Gier', '2618', 'impact', 'foudre', 'relever', 'progrès', '22', 'JUL', '2020']
Actual:  PRECIP  Predicted:  HAIL  Reference:  ['météo', 'Saône-et-Loire', '71', 'v.', 'Twitter', '09', 'JUN', '2021', 'Panique', 'Autun', 'fort', 'averse', 'journal', 'SAÔNE-ET-LOIRE', '09', 'JUN', '2021']
Actual:  WIND  Predicted:  HAIL  Reference:  ['Orages', 'grêle', 'faire', 'casse', 'auvergne', 'Rhône', 'Alpes', 'France', 'Info', '3', 'Jul', '2019']
Actual:  DEVIL  Predicted:  WIND  Reference:  ['saint-sernin', 'BOIS', 'mini-tornade', 'emmèn', 'toiture', 'Creusot', 'Infos', '12', 'april', '2020', 'colère', 'Zeus', 'Facebook', '14', 'april', '2020']
Actual:  PRECIP  Pred

In [133]:
build_confusion_matrix_random_forest(df_train_fr, df_test_fr, 'bow_fr')

              precision    recall  f1-score   support

   AVALANCHE       1.00      1.00      1.00        14
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.83      0.90      0.86       123
   LIGHTNING       0.86      0.86      0.86        90
      PRECIP       0.56      0.44      0.49        52
        SNOW       0.00      0.00      0.00         1
     TORNADO       0.71      0.42      0.53        12
        WIND       0.85      0.89      0.87       178

    accuracy                           0.82       472
   macro avg       0.60      0.56      0.58       472
weighted avg       0.81      0.82      0.82       472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[ 14,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   2],
       [  0,   0, 111,   1,   8,   0,   2,   1],
       [  0,   0,   5,  77,   4,   0,   0,   4],
       [  0,   0,   9,   6,  23,   0,   0,  14],
       [  0,   0,   0,   0,   0,   0,   0,   1],
       [  0,   0,   1,   0,   1,   0,   5,   5],
       [  0,   0,   8,   6,   5,   0,   0, 159]], dtype=int64)

In [134]:
def top_features_table_rf(importances, feature_names, top_n=10):
    df = pd.DataFrame(data=importances, index=feature_names, columns=['Importance'])
    df_sorted = df.sort_values('Importance', ascending=False)

    top_features = df_sorted.iloc[:top_n].index.tolist()

    return top_features

def top_features_to_latex_rf(top_features, top_n=10):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|l}\n"
    latex_table += "Rank & Feature \\\\\n\\hline\n"

    for i, feature in enumerate(top_features):
        row_text = f"{i + 1} & {feature}"
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

importances = best_model_rf.feature_importances_
feat_names = count_vect_fr.get_feature_names()
top_features = top_features_table_rf(importances, feat_names)
latex_table = top_features_to_latex_rf(top_features)
print(top_features)
save_latex_table(latex_table, "top_features_rf_bow.tex")

['foudre', 'grêle', 'ws', 'meteofrance', 'report', 'facebook', 'com', '2022', 'eyewitness', 'france']


In [23]:
#prepare df_train_en and df_test_en to be used with a neural network
from sklearn.preprocessing import LabelEncoder



In [135]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import RMSprop, Adam
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

def build_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                    batch_size=128,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    return model


In [104]:
#build_neural_network(df_train_en, df_test_en, 'bow_en')

In [136]:
build_neural_network(df_train_fr, df_test_fr, 'bow_fr')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               1480704   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 8)                 4104      
                                                                 
Total params: 1,747,464
Trainable params: 1,747,464
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Ep

<keras.engine.sequential.Sequential at 0x16e8bef7040>

In [137]:
#build a confusion matrix for the neural network
def build_confusion_matrix_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                    batch_size=128,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test_one_hot.argmax(axis=1), y_pred.argmax(axis=1))
    print(sklearn.metrics.classification_report(y_test_one_hot.argmax(axis=1), y_pred.argmax(axis=1)))
    return cm

In [139]:
cm_neural = build_confusion_matrix_neural_network(df_train_fr, df_test_fr, 'bow_fr')

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               1480704   
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_7 (Dense)             (None, 512)               262656    
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_8 (Dense)             (None, 8)                 4104      
                                                                 
Total params: 1,747,464
Trainable params: 1,747,464
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [140]:
def confusion_matrix_to_latex(cm, class_names):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + ' '.join(['c' for _ in class_names]) + "}\n"
    latex_table += "\\multicolumn{1}{c}{} & \\multicolumn{" + str(len(class_names)) + "}{c}{Predicted Class} \\\\\n"
    latex_table += "Actual Class & " + ' & '.join(class_names) + " \\\\\n\\hline\n"

    for i, row in enumerate(cm):
        row_text = f"{class_names[i]} & " + ' & '.join([str(cell) for cell in row])
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, 'w') as f:
        f.write(latex_table)

In [142]:
class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
latex_table = confusion_matrix_to_latex(cm_neural, class_names)
save_latex_table(latex_table, 'cm_neural_fr.tex')

In [147]:
def print_incorrect_predictions_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                        batch_size=128,
                        epochs=20,
                        verbose=1,
                        validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test)
    y_pred = y_pred.argmax(axis=1)
    y_test = y_test_one_hot.argmax(axis=1)

    for i in range(len(y_pred)):
        if y_pred[i] != y_test[i]:
            true_class = label_encoder.inverse_transform([y_test[i]])[0]
            pred_class = label_encoder.inverse_transform([y_pred[i]])[0]
            print('True prediction:', true_class, 'Predicted prediction:', pred_class, 'Reference:', df_test['REFERENCE'].iloc[i])

In [148]:
print_incorrect_predictions_neural_network(df_train_fr, df_test_fr, 'bow_fr')


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 512)               1480704   
                                                                 
 dropout_10 (Dropout)        (None, 512)               0         
                                                                 
 dense_16 (Dense)            (None, 512)               262656    
                                                                 
 dropout_11 (Dropout)        (None, 512)               0         
                                                                 
 dense_17 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 1,747,464
Trainable params: 1,747,464
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


### TF-IDF vector

In [160]:
df_fr = pd.read_csv('eswd_fr.csv')

In [161]:
def filter_references(reference):
    lower_reference = reference.lower()
    if 'facebook' in lower_reference or 'twitter' in lower_reference:
        return len(reference.split()) >= 8
    return True

# Apply the custom function and filter the DataFrame
df_en = df_en[df_en['REFERENCE'].apply(filter_references)]
print(df_en.shape)
df_fr = df_fr[df_fr['REFERENCE'].apply(filter_references)]
print(df_fr.shape)

(199, 5)
(2357, 3)


In [171]:
#builf tf-idf vector for df_en and df_fr
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_en = TfidfVectorizer()
vectorizer_fr = TfidfVectorizer()

# Fit and transform the data
X_en = vectorizer_en.fit_transform(df_en['REFERENCE'])
X_fr = vectorizer_fr.fit_transform(df_fr['REFERENCE'])

#add the vectors in df_en and df_fr
df_en['tfidf-vector'] = list(X_en.toarray())
df_fr['tfidf-vector'] = list(X_fr.toarray())
df_fr.shape


(2357, 4)

In [172]:
#train and test split for df_en and df_fr
df_train_fr, df_test_fr = train_test_split(df_fr, test_size=0.2, random_state=42)
df_test_fr.shape

(472, 4)

In [173]:
def build_model(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.linear_model.LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #adapt for multiclass the f1 score
    f1 = sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
    accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
    #mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
    print('Accuracy: ', accuracy)
    print('F1 score: ', f1)
    #print('MSE: ', mse)
    return model


In [174]:
#english_lr = build_model(df_en_train, df_en_test, 'tfidf-vector')

In [175]:
french_lr = build_model(df_train_fr, df_test_fr, 'tfidf-vector')

Accuracy:  0.8326271186440678
F1 score:  0.8231958341509901


In [176]:
#build a confusion matrix
def build_confusion_matrix(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.linear_model.LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    class_labels = model.classes_
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
    
    print(sklearn.metrics.classification_report(y_test, y_pred))
    print(cm)
    return cm, class_labels

In [177]:
def confusion_matrix_to_latex(cm, class_names):
    num_classes = cm.shape[0]
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "c" * num_classes + "}\n"
    
    # Add header row with class names' first letters
    header_row = " & ".join([name[0] for name in class_names])
    latex_table += f" & {header_row} \\\\\n\\hline\n"
    
    for i, row in enumerate(cm):
        row_text = " & ".join(map(str, row))
        class_name = class_names[i]
        latex_table += f"{class_name} & {row_text} \\\\\n"
    
    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

In [178]:

# Save the LaTeX table to a file

cm_fr, class_labels_fr = build_confusion_matrix(df_train_fr, df_test_fr, 'tfidf-vector')

# Convert the confusion matrix to LaTeX table format
latex_table = confusion_matrix_to_latex(cm_fr, class_labels_fr)
save_latex_table(latex_table, "confusion_matrix_tfidf_lr.tex")

              precision    recall  f1-score   support

   AVALANCHE       1.00      0.86      0.92        14
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.84      0.91      0.87       123
   LIGHTNING       0.88      0.87      0.87        90
      PRECIP       0.63      0.50      0.56        52
        SNOW       0.00      0.00      0.00         1
     TORNADO       1.00      0.33      0.50        12
        WIND       0.84      0.90      0.87       178

    accuracy                           0.83       472
   macro avg       0.65      0.55      0.57       472
weighted avg       0.83      0.83      0.82       472

[[ 12   0   0   0   1   0   0   1]
 [  0   0   0   0   0   0   0   2]
 [  0   0 112   0   9   0   0   2]
 [  0   0   3  78   2   0   0   7]
 [  0   0  10   3  26   0   0  13]
 [  0   0   0   1   0   0   0   0]
 [  0   0   1   0   1   0   4   6]
 [  0   0   8   7   2   0   0 161]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [179]:
#print misclassified events with their prediction, true value and reference
def print_misclassified_events(df_test, vector, model):
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    y_pred = model.predict(X_test)
    for i in range(len(y_test)):
        if y_test.iloc[i] != y_pred[i]:
            print('Prediction: ', y_pred[i])
            print('True value: ', y_test.iloc[i])
            print('Reference: ', df_test.iloc[i]['REFERENCE'])
            print('------------------------')
            

In [180]:
print_misclassified_events(df_test_fr, 'tfidf-vector', french_lr)

Prediction:  HAIL
True value:  LIGHTNING
Reference:  ['Orages', 'grêle', 'localiser', 'Yonne', 'important', 'dégât', 'culture', 'autour', 'pont-sur-yonne', '200', 'client', 'priver', 'électricité', 'Puisaye', 'YONNE', 'REPUBLICAINE', '29', 'JUN', '2021', 'RAD']
------------------------
Prediction:  LIGHTNING
True value:  WIND
Reference:  ['Orages', 'Saint-Etienne', 'Gier', '2618', 'impact', 'foudre', 'relever', 'progrès', '22', 'JUL', '2020']
------------------------
Prediction:  HAIL
True value:  PRECIP
Reference:  ['météo', 'Saône-et-Loire', '71', 'v.', 'Twitter', '09', 'JUN', '2021', 'Panique', 'Autun', 'fort', 'averse', 'journal', 'SAÔNE-ET-LOIRE', '09', 'JUN', '2021']
------------------------
Prediction:  HAIL
True value:  WIND
Reference:  ['Orages', 'grêle', 'faire', 'casse', 'auvergne', 'Rhône', 'Alpes', 'France', 'Info', '3', 'Jul', '2019']
------------------------
Prediction:  WIND
True value:  DEVIL
Reference:  ['saint-sernin', 'BOIS', 'mini-tornade', 'emmèn', 'toiture', 'Cre

In [184]:
def top_features_table(weights, feature_names, class_names, top_n=10):
    df = pd.DataFrame(data=weights.T, columns=class_names, index=feature_names)
    
    top_features = {}
    for class_name in class_names:
        df_sorted = df.sort_values(class_name, ascending=False)
        top_features[class_name] = df_sorted.iloc[:top_n, :].index.tolist()
    
    return top_features

def top_features_to_latex(top_features, class_names, top_n=10):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "l" * len(class_names) + "}\n"

    
    header_row = " & ".join(class_names)
    latex_table += f" & {header_row} \\\\\n\\hline\n"

    for i in range(top_n):
        row = [f"{i+1}"]
        for class_name in class_names:
            row.append(top_features[class_name][i])
        row_text = " & ".join(row)
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

weights = french_lr.coef_
feat_names_en = vectorizer_fr.get_feature_names()
class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP','SNOW', 'TORNADO', 'WIND']
top_features = top_features_table(weights, feat_names_en, class_names)
latex_table = top_features_to_latex(top_features, class_names)
save_latex_table(latex_table, "top_features_lrtfidf.tex")

In [185]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer

def tune_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']

    # Define the parameter grid for the Random Forest classifier
    param_grid = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Create the base model
    rf = RandomForestClassifier(random_state=0)

    # Instantiate the GridSearchCV object
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               scoring=make_scorer(f1_score, average='weighted'),
                               cv=3, verbose=1, n_jobs=-1)

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and the corresponding best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best parameters:", best_params)
    print("Best F1 score:", best_score)

    # Train the model with the best parameters on the whole dataset
    best_rf = RandomForestClassifier(**best_params, random_state=0)
    best_rf.fit(X_train, y_train)

    # Test the model on the test dataset
    y_pred = best_rf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    print('Test accuracy:', accuracy)
    print('Test F1 score:', f1)

    return best_rf

model = tune_random_forest(df_train_fr, df_test_fr, 'tfidf-vector')

Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 score: 0.8267818279515732
Test accuracy: 0.836864406779661
Test F1 score: 0.8301320638949388


In [186]:
# Create the exportable confusion matrix
def confusion_matrix_to_latex(cm, class_names):
    num_classes = cm.shape[0]
    latex_table = "\\begin{table}[!htbp]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + "c" * num_classes + "}\n"
    
    # Add header row with class names
    header_row = " & ".join(class_names)
    latex_table += f" & {header_row} \\\\\n\\hline\n"
    
    for i, row in enumerate(cm):
        row_text = " & ".join(map(str, row))
        class_name = class_names[i]
        latex_table += f"{class_name} & {row_text} \\\\\n"
    
    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, "w") as file:
        file.write(latex_table)

# Build the confusion matrix for the Random Forest model
def build_confusion_matrix_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split=5, min_samples_leaf=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
    print(sklearn.metrics.classification_report(y_test, y_pred))
    return cm

def build_and_export_confusion_matrix_rf(df_train, df_test, vector, class_names, filename):
    cm = build_confusion_matrix_random_forest(df_train, df_test, vector)
    latex_table = confusion_matrix_to_latex(cm, class_names)
    save_latex_table(latex_table, filename)

class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
build_and_export_confusion_matrix_rf(df_train_fr, df_test_fr, 'tfidf-vector', class_names, "confusion_matrix_tfidf_rf.tex")


              precision    recall  f1-score   support

   AVALANCHE       1.00      1.00      1.00        14
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.83      0.90      0.86       123
   LIGHTNING       0.89      0.88      0.88        90
      PRECIP       0.60      0.46      0.52        52
        SNOW       0.00      0.00      0.00         1
     TORNADO       0.75      0.50      0.60        12
        WIND       0.87      0.91      0.89       178

    accuracy                           0.84       472
   macro avg       0.62      0.58      0.59       472
weighted avg       0.83      0.84      0.83       472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [188]:
def print_misclassified_events(df_test, vector, model):
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    y_pred = model.predict(X_test)
    misclassified = np.where(y_test != y_pred)
    for i in misclassified[0]:
        print('Actual: ', y_test.iloc[i], ' Predicted: ', y_pred[i], ' Reference: ', df_test.iloc[i]['REFERENCE'])

In [189]:
print_misclassified_events(df_test_fr, 'tfidf-vector', model)

Actual:  LIGHTNING  Predicted:  PRECIP  Reference:  ['Orages', 'grêle', 'localiser', 'Yonne', 'important', 'dégât', 'culture', 'autour', 'pont-sur-yonne', '200', 'client', 'priver', 'électricité', 'Puisaye', 'YONNE', 'REPUBLICAINE', '29', 'JUN', '2021', 'RAD']
Actual:  WIND  Predicted:  PRECIP  Reference:  ['Orages', 'Saint-Etienne', 'Gier', '2618', 'impact', 'foudre', 'relever', 'progrès', '22', 'JUL', '2020']
Actual:  PRECIP  Predicted:  HAIL  Reference:  ['météo', 'Saône-et-Loire', '71', 'v.', 'Twitter', '09', 'JUN', '2021', 'Panique', 'Autun', 'fort', 'averse', 'journal', 'SAÔNE-ET-LOIRE', '09', 'JUN', '2021']
Actual:  WIND  Predicted:  HAIL  Reference:  ['Orages', 'grêle', 'faire', 'casse', 'auvergne', 'Rhône', 'Alpes', 'France', 'Info', '3', 'Jul', '2019']
Actual:  DEVIL  Predicted:  WIND  Reference:  ['saint-sernin', 'BOIS', 'mini-tornade', 'emmèn', 'toiture', 'Creusot', 'Infos', '12', 'april', '2020', 'colère', 'Zeus', 'Facebook', '14', 'april', '2020']
Actual:  PRECIP  Predict

In [190]:
def top_features_table_rf(importances, feature_names, top_n=10):
    df = pd.DataFrame(data=importances, index=feature_names, columns=['Importance'])
    df_sorted = df.sort_values('Importance', ascending=False)

    top_features = df_sorted.iloc[:top_n].index.tolist()

    return top_features

def top_features_to_latex_rf(top_features, top_n=10):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|l}\n"
    latex_table += "Rank & Feature \\\\\n\\hline\n"

    for i, feature in enumerate(top_features):
        row_text = f"{i + 1} & {feature}"
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

importances = model.feature_importances_
feat_names = vectorizer_fr.get_feature_names()
top_features = top_features_table_rf(importances, feat_names)
latex_table = top_features_to_latex_rf(top_features)
print(top_features)
save_latex_table(latex_table, "top_features_rf_tfidf.tex")

['foudre', 'grêle', 'report', 'météo', 'com', '2022', 'france', 'meteofrance', 'facebook', 'jun']


In [37]:
# from sklearn.ensemble import RandomForestClassifier
# def build_random_forest(df_train, df_test, vector):
#     X_train = np.array(df_train[vector].tolist())
#     y_train = df_train['TYPE_EVENT']
#     X_test = np.array(df_test[vector].tolist())
#     y_test = df_test['TYPE_EVENT']
#     model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     f1 = sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
#     accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
#     print('Accuracy: ', accuracy)
#     print('F1 score: ', f1)
#     return model

In [40]:
#build a confusion matrix
def build_confusion_matrix_random_forest(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
    print(sklearn.metrics.classification_report(y_test, y_pred))
    #print(cm)
    return cm
    

In [41]:
build_confusion_matrix_random_forest(df_en_train, df_en_test, 'tfidf-vector')

              precision    recall  f1-score   support

       DEVIL       0.00      0.00      0.00         1
        HAIL       0.79      0.94      0.86        36
   LIGHTNING       0.00      0.00      0.00         2
      PRECIP       0.00      0.00      0.00         5
     TORNADO       0.00      0.00      0.00        10
        WIND       0.64      0.87      0.74        31

    accuracy                           0.72        85
   macro avg       0.24      0.30      0.27        85
weighted avg       0.57      0.72      0.63        85



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[ 0,  0,  0,  0,  0,  1],
       [ 0, 34,  0,  0,  0,  2],
       [ 0,  2,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  5],
       [ 0,  3,  0,  0,  0,  7],
       [ 0,  4,  0,  0,  0, 27]], dtype=int64)

In [42]:
build_confusion_matrix_random_forest(df_fr_train, df_fr_test, 'tfidf-vector')

              precision    recall  f1-score   support

   AVALANCHE       0.00      0.00      0.00        13
       DEVIL       0.00      0.00      0.00         2
        HAIL       0.70      0.91      0.79       152
   LIGHTNING       1.00      0.23      0.37        74
      PRECIP       0.00      0.00      0.00        64
        SNOW       0.00      0.00      0.00         1
     TORNADO       1.00      0.05      0.10        20
        WIND       0.57      0.93      0.71       180

    accuracy                           0.64       506
   macro avg       0.41      0.26      0.25       506
weighted avg       0.60      0.64      0.55       506



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[  0,   0,   0,   0,   0,   0,   0,  13],
       [  0,   0,   1,   0,   0,   0,   0,   1],
       [  0,   0, 138,   0,   0,   0,   0,  14],
       [  0,   0,  15,  17,   0,   0,   0,  42],
       [  0,   0,  23,   0,   0,   0,   0,  41],
       [  0,   0,   0,   0,   0,   0,   0,   1],
       [  0,   0,   7,   0,   0,   0,   1,  12],
       [  0,   0,  13,   0,   0,   0,   0, 167]], dtype=int64)

In [191]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import RMSprop
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

def build_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                    batch_size=128,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    return model


In [44]:
build_neural_network(df_en_train, df_en_test, 'tfidf-vector')

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               300544    
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_7 (Dense)             (None, 512)               262656    
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_8 (Dense)             (None, 7)                 3591      
                                                                 
Total params: 566,791
Trainable params: 566,791
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoc

<keras.engine.sequential.Sequential at 0x22d95ee7070>

In [192]:
build_neural_network(df_train_fr, df_test_fr, 'tfidf-vector')

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 512)               1481728   
                                                                 
 dropout_12 (Dropout)        (None, 512)               0         
                                                                 
 dense_19 (Dense)            (None, 512)               262656    
                                                                 
 dropout_13 (Dropout)        (None, 512)               0         
                                                                 
 dense_20 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 1,748,488
Trainable params: 1,748,488
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.engine.sequential.Sequential at 0x16eaa50c9a0>

In [193]:
def build_confusion_matrix_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                    batch_size=128,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test_one_hot.argmax(axis=1), y_pred.argmax(axis=1))
    print(sklearn.metrics.classification_report(y_test_one_hot.argmax(axis=1), y_pred.argmax(axis=1)))
    return cm

In [194]:
cm_neural = build_confusion_matrix_neural_network(df_train_fr, df_test_fr, 'tfidf-vector')

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 512)               1481728   
                                                                 
 dropout_14 (Dropout)        (None, 512)               0         
                                                                 
 dense_22 (Dense)            (None, 512)               262656    
                                                                 
 dropout_15 (Dropout)        (None, 512)               0         
                                                                 
 dense_23 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 1,748,488
Trainable params: 1,748,488
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [195]:
def confusion_matrix_to_latex(cm, class_names):
    latex_table = "\\begin{table}[h]\n\\centering\n\\scriptsize\n"
    latex_table += "\\begin{tabular}{l|" + ' '.join(['c' for _ in class_names]) + "}\n"
    latex_table += "\\multicolumn{1}{c}{} & \\multicolumn{" + str(len(class_names)) + "}{c}{Predicted Class} \\\\\n"
    latex_table += "Actual Class & " + ' & '.join(class_names) + " \\\\\n\\hline\n"

    for i, row in enumerate(cm):
        row_text = f"{class_names[i]} & " + ' & '.join([str(cell) for cell in row])
        latex_table += f"{row_text} \\\\\n"

    latex_table += "\\end{tabular}\n"
    latex_table += "\\end{table}\n"
    return latex_table

def save_latex_table(latex_table, filename):
    with open(filename, 'w') as f:
        f.write(latex_table)

In [196]:
class_names = ['AVALANCHE', 'DEVIL', 'HAIL', 'LIGHTNING', 'PRECIP', 'SNOW', 'TORNADO', 'WIND']
latex_table = confusion_matrix_to_latex(cm_neural, class_names)
save_latex_table(latex_table, 'cm_neural_tfidf.tex')

In [197]:
def print_incorrect_predictions_neural_network(df_train, df_test, vector):
    X_train = np.array(df_train[vector].tolist())
    y_train = df_train['TYPE_EVENT']
    X_test = np.array(df_test[vector].tolist())
    y_test = df_test['TYPE_EVENT']
    
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Fit the label encoder to the training data
    label_encoder.fit(y_train)

    # Encode the training and testing labels
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # One-hot encode the training and testing labels
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(X_train[0]),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(y_train_encoded)), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train_one_hot,
                        batch_size=128,
                        epochs=20,
                        verbose=1,
                        validation_data=(X_test, y_test_one_hot))
    score = model.evaluate(X_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test)
    y_pred = y_pred.argmax(axis=1)
    y_test = y_test_one_hot.argmax(axis=1)

    for i in range(len(y_pred)):
        if y_pred[i] != y_test[i]:
            true_class = label_encoder.inverse_transform([y_test[i]])[0]
            pred_class = label_encoder.inverse_transform([y_pred[i]])[0]
            print('True prediction:', true_class, 'Predicted prediction:', pred_class, 'Reference:', df_test['REFERENCE'].iloc[i])

In [198]:
print_incorrect_predictions_neural_network(df_train_fr, df_test_fr, 'tfidf-vector')


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 512)               1481728   
                                                                 
 dropout_16 (Dropout)        (None, 512)               0         
                                                                 
 dense_25 (Dense)            (None, 512)               262656    
                                                                 
 dropout_17 (Dropout)        (None, 512)               0         
                                                                 
 dense_26 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 1,748,488
Trainable params: 1,748,488
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
