In [53]:
import csv
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sklearn
import sklearn.linear_model
from sklearn.model_selection import KFold
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import sentence_transformers
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from tqdm import trange
import random
import spacy
from spacy_language_detection import LanguageDetector
import math
import spacy_fastlang


In [54]:

def convert_text_to_csv(input_file, output_file):
    # Open the input text file for reading
    with open(input_file, 'r') as file:
        lines = file.readlines()

    # Open the output CSV file for writing
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)

        # Write each line as a separate row in the CSV file
        for line in lines:
            writer.writerow([line.strip()])  
    print("Conversion completed successfully.")


# Usage example
input_file = 'non_severe_cleaned.txt'
output_file = 'false_data.csv'
convert_text_to_csv(input_file, output_file)

Conversion completed successfully.


In [55]:

df_false = pd.read_csv('false_data.csv', header=None)
df_false['false'] = 0


In [56]:

df_false.columns = ['REFERENCE', 'false']
df_false.to_csv('false_data.csv', index=False, header=True)


In [57]:
fields = ['REFERENCE', 'TYPE_EVENT']
df_true = pd.read_csv('eswd.csv', usecols=fields)

In [58]:
df_true = df_true.dropna(subset=['REFERENCE'])  # we are at 12871 rows now
df_true.drop_duplicates(inplace=True) 
df_true = df_true[~df_true['REFERENCE'].str.contains("Report via Kachelmannwetter.com")]


In [59]:
#add a column indicating whether it is a false data or no, so here add all 1s
df_true['false'] = 1




In [60]:
nlp1 = spacy.load("en_core_web_sm")
nlp2 = spacy.load("fr_core_news_sm")
nlp1.add_pipe("language_detector", last=True)
df_true['language'] = df_true['REFERENCE'].apply(lambda x: nlp1(x)._.language)
df_en = df_true[df_true['language'] == 'en']
df_fr = df_true[df_true['language'] == 'fr']



In [61]:

df_fr = df_fr[['REFERENCE', 'false', 'TYPE_EVENT']]


In [62]:
df_fr = df_fr[df_true.TYPE_EVENT != 'AVALANCHE']
df_fr = df_fr[['REFERENCE', 'false']]
#take 500 random samples from the french data
df_fr = df_fr.sample(n=500, random_state=1)

  df_fr = df_fr[df_true.TYPE_EVENT != 'AVALANCHE']


In [63]:
df = pd.concat([df_fr, df_false], ignore_index=True)
print(df.head())
print(df.shape)
#to csv
df.to_csv('data.csv', index=False, header=True)

                                           REFERENCE  false
0  "Bilan des violents orages en France les mardi...      1
1  "Les orages ont causé des inondations dans plu...      1
2  "D’importants dégâts au Coteau causés par la g...      1
3  "Saint-Loup-sur-Semouse : une mini-tornade sur...      1
4  "Raisins explosés par la grêle et vignerons à ...      1
(938, 2)


In [64]:
df['REFERENCE'] = df['REFERENCE'].apply(lambda x: x.split())
df['REFERENCE'] = df['REFERENCE'].apply(
    lambda x: [token.text for token in nlp2(" ".join(x)) if not token.is_punct])
df['REFERENCE'] = df['REFERENCE'].apply(
    lambda x: [token.text for token in nlp2(" ".join(x)) if not token.is_stop])
df['REFERENCE'] = df['REFERENCE'].apply(
    lambda x: [token.lemma_ for token in nlp2(" ".join(x))])
#now join tokenized words into one string
df['REFERENCE'] = df['REFERENCE'].apply(lambda x: " ".join(x))


In [65]:
df.to_csv('data.csv', index=False, header=True)

In [66]:
# #use bag of words
# cnt_vec = CountVectorizer(ngram_range=(1,2), max_features=30000)
# bow_train = cnt_vec.fit_transform(df['REFERENCE'])
# bow_train = bow_train.toarray()
# #add the transformed data to the dataframe
# df_bow = pd.DataFrame(bow_train, columns=cnt_vec.get_feature_names())



In [67]:
# Shuffle the DataFrame
df = df.sample(frac=1).reset_index(drop=True)


cnt_vec = CountVectorizer()
bow_train = cnt_vec.fit_transform(df['REFERENCE'])
bow_train = bow_train.toarray()

df_bow = pd.DataFrame(bow_train, columns=cnt_vec.get_feature_names())

df_bow['false'] = df['false']


kf = KFold(n_splits=10)


model = sklearn.linear_model.LogisticRegression()


accuracies = []

for train_idx, val_idx in kf.split(df_bow):

    train_data = df_bow.iloc[train_idx]
    val_data = df_bow.iloc[val_idx]


    X_train, y_train = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
    X_val, y_val = val_data.iloc[:, :-1].values, val_data.iloc[:, -1].values


    model.fit(X_train, y_train)


    y_pred = model.predict(X_val)
    accuracy = sklearn.metrics.accuracy_score(y_val, y_pred)
    accuracies.append(accuracy)


avg_accuracy = np.mean(accuracies)
print("Average accuracy:", avg_accuracy)

Average accuracy: 0.9946694120338595


Now, we will analyze the data using spacy and use a tf-idf vector to build a binary classificator. 

In [68]:
#print the misclassified samples, their predicted and true labels and reference
for i in range(len(y_val)):
    if y_val[i] != y_pred[i]:
        print("Predicted:", y_pred[i], "True:", y_val[i], "Reference:", df.iloc[val_idx[i]]['REFERENCE'])

In [69]:

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['REFERENCE'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df['vectorized_reference'] = denselist



In [70]:
#shuffle the data
df = df.sample(frac=1).reset_index(drop=True)
train, test = train_test_split(df, test_size=0.2, random_state=42)
df = df[['vectorized_reference', 'false', 'REFERENCE']]

In [71]:
df.columns

Index(['vectorized_reference', 'false', 'REFERENCE'], dtype='object')

In [74]:
kf = KFold(n_splits=10)

model = sklearn.linear_model.LogisticRegression()

accuracies = []

mistakes = []  # List to store incorrectly classified references

for train_idx, val_idx in kf.split(df):
    train_data = df.iloc[train_idx]
    val_data = df.iloc[val_idx]

    X_train_tfidf = pd.DataFrame(train_data['vectorized_reference'].tolist())
    X_val_tfidf = pd.DataFrame(val_data['vectorized_reference'].tolist())

    y_train, y_val = train_data['false'], val_data['false']
    model.fit(X_train_tfidf, y_train)

    y_pred = model.predict(X_val_tfidf)
    accuracy = sklearn.metrics.accuracy_score(y_val, y_pred)
    accuracies.append(accuracy)

    # Find and store incorrectly classified references
    for idx, (true_label, pred_label) in enumerate(zip(y_val, y_pred)):
        if true_label != pred_label:
            mistakes.append((val_data.iloc[idx]['REFERENCE'], true_label, pred_label))

# Average
avg_accuracy = np.mean(accuracies)
print("Average accuracy:", avg_accuracy)

# Print incorrectly classified references
print("\nIncorrectly classified references:")
for idx, (ref, true_label, pred_label) in enumerate(mistakes, start=1):
    print(f"{idx}: {ref} (True label: {true_label}, Predicted label: {pred_label})")

Average accuracy: 0.9904140928849234

Incorrectly classified references:
1: Haute-Savoie glissement terrain bloquer route Thyez dauphiner Libéré 15 July 2021 for suscriber only (True label: 1, Predicted label: 0)
2: grêle sud-est orage l ' Ouest (True label: 0, Predicted label: 1)
3: météo France surprendre (True label: 0, Predicted label: 1)
4: Mickael B. observatoir ciel Orageux Tornade Médoc 2018 2019 (True label: 1, Predicted label: 0)
5: match xv never reporter cause vent Péméja never furieux réaction vidéo charente libre 01 March 2020 (True label: 1, Predicted label: 0)
6: témoignage voisin n ’ fumée c ’ toit partir voir pire REDON MAVILLE 24 nov 2022 (True label: 1, Predicted label: 0)
7: météo 2022 lodévoi ancrée (True label: 0, Predicted label: 1)
8: tempête vent passer Corse bien résister Corse matin 10 Dec 2018 (True label: 1, Predicted label: 0)
9: 2022 bien être l ' année chaude France (True label: 0, Predicted label: 1)


In [45]:
# #use logistic regression and bag of words to classify events as false or true
# logreg = sklearn.linear_model.LogisticRegression()
# logreg.fit(x_bow_train, y_bow_train)
# y_pred = logreg.predict(x_bow_test)
# accuracy = sklearn.metrics.accuracy_score(y_bow_test, y_pred)
# print(accuracy)
# print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(x_bow_test, y_bow_test)))



In [47]:
for i in range(len(y_val)):
    if y_val[i] != y_pred[i]:
        print("Predicted:", y_pred[i], "True:", y_val[i], "Reference:", df.iloc[val_idx[i]]['REFERENCE'])

KeyError: 0

In [45]:
#print out the reference column for the vectors that were misclassified
for i in range(len(predictions)):
    if predictions[i] != test['false'].tolist()[i]:
        print(test['REFERENCE'].tolist()[i])
        

Eyewitness report météo d ' antoine météo Limousin Facebook 04 JUN 2022 n ' jamais 30 an jean-luc pierron maire Crocq toucher intempérie Creuse France Bleu 05 JUN 2022
Orages savoie Haute-Savoie foudre frappe transformateur route couper France Bleu 24 OCT 2022
météo 2022 lodévoi ancrée
week end ensoleiller nouveau
Liévin petit tornade touch cité castor famille reloger voix Nord 16 feb 2020
ws report météo France METEOFRANCE.com 20 mar 2018
hautes-pyrénée grêle miner champ agriculteur garder espoir ladepeche.fr 22 JUN 2021 RAD
sant Galdric faire -t il venir plui
Orages Cher pluie diluvien grêle vent dégât France Bleu cher 22 june 2022
devoir faire ' 35 degré c l ' Hérault
vent exploiter villag
ogeu culture être toucher plein fouet l ’ averse grêle republique pyréner 21 JUN 2022 Eyewitness report République pyréner Facebook 21 JUN 2022
l ' absence plui bassin thau régime sec
Allemagne Autriche Belgique octobre 2022
météo Herbiers Facebook 21 oct 2021
actualité observation météo nouveau A