### Import libraries

In [1]:
import pandas as pd
import spacy
import nltk

from nltk.stem.snowball import SnowballStemmer
from unidecode import unidecode

nlp = spacy.load("ro_core_news_md")
stemmer = SnowballStemmer("romanian")

### Read data

In [2]:
df = pd.read_json('data/meteo_news_updated.json')

### Data inspection

Check dataframe for missing values

In [3]:
nan_df = df[df.isna().any(axis=1)]
nan_df

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date
887,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 5 iulie. Țara se împarte între ...",,"[Vremea în țară, În Dobrogea şi Bărăgan, la am...",05-07-2022 07:10
942,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 11 iunie. Prognoza meteo pentru...",,"[\nSursa: , \n, \nEtichete:\n, ,\n, ,\n, ,\n, ...",11-06-2022 07:19
1291,https://stirileprotv.ro/stiri/vremea/vremea-12...,"Vremea, 12 Octombrie. Vremea se menține deoseb...",,"[ , Vreme ceva mai bună găsim doar în vest, î...",12-10-2021 07:24
2400,https://vacantalamunte.stirileprotv.ro/stiri/v...,,,[],
2435,https://vacantalamunte.stirileprotv.ro/stiri/n...,,,[],
3367,https://vacantalamare.stirileprotv.ro/stiri/ma...,,,[],
4149,https://vacantalamunte.stirileprotv.ro/stiri/s...,,,[],
6009,https://stirileprotv.ro/stiri/meteo/afla-cum-e...,Afla cum e vremea in Romania din ora in ora,,"[Sursa: ANM, \nSursa: , \n, \nEtichete:\n, ,\n...",26-10-2011 20:02
6906,https://stirileprotv.ro/stiri/meteo/vom-avea-t...,Vom avea temperaturi de peste 33 de grade in s...,,"[\nMasa de aer cald, tropical, va domina jumat...",23-05-2009 16:33


In [4]:
# Check shape
nan_df.shape

(9, 5)

In [5]:
# After reviewing those articles, we have determined that the URL is broken, the article has been deleted, and so on. 
# Therefore, we have decided to remove them, as they constitute only a small percentage of our dataframe.

Drop the NaN's

In [6]:
df.dropna(inplace=True)

Convert "article_date" to datetime datatype with pandas

In [7]:
df['article_date'] = pd.to_datetime(df['article_date'], format='%d-%m-%Y %H:%M')

# Sort values by article_date
df = df.sort_values('article_date').reset_index(drop=True)

Check dataframe shape

In [8]:
df.shape

(7088, 5)

In [9]:
# Our dataset is pretty consistent in order to build a POC or to prove our hypothesis

Check dtypes for each column in our dataframe

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7088 entries, 0 to 7087
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   article_url    7088 non-null   object        
 1   article_title  7088 non-null   object        
 2   article_lead   7088 non-null   object        
 3   article_text   7088 non-null   object        
 4   article_date   7088 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 277.0+ KB


Remove useless elements from article_text

In [11]:
def remove_sursa(text_list):
    for i in range(len(text_list)):
        if text_list[i].startswith('\nSursa:'):
            return text_list[:i]
    return text_list

df['article_text'] = df['article_text'].apply(remove_sursa)

Clean text

In [12]:
def clean_text_list(text_list):
    cleaned_list = [text.replace('\xa0', '').strip() for text in text_list if text.strip() != '']
    return cleaned_list

df['article_text'] = df['article_text'].apply(clean_text_list)

# Join sentences
df['article_text'] = df['article_text'].apply(lambda sentences: ' '.join(sentences))

Remove diacritics from our dataframe

In [13]:
# def remove_diacritics(text):
#     return unidecode(text)

# df = df.applymap(lambda x: remove_diacritics(str(x)))

Read the dataframe containing geo locations

In [14]:
# Read the dataframe containing the counties and the regions
df_romania = pd.read_excel('data/romania_geo.xlsx')
counties = df_romania['Judet'].to_list()
regions = df_romania['Regiune'].dropna().to_list()

Lemmatize article_text

In [15]:
# Function to lemmatize a text
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Lemmatize the 'article_text' column
df['article_text_lemmatized'] = df['article_text'].apply(lemmatize_text)

Stem article_text

In [16]:
# Function to stem text
def stem_text(text):
    return " ".join(stemmer.stem(word) for word in text.split())

# Stem the 'article_text' column
df['article_text_stemmed'] = df['article_text'].apply(stem_text)

Lowercase the words again

In [17]:
# df_preprocessed = df_preprocessed.applymap(lambda x: remove_diacritics(str(x)))

Create a copy of the original dataframe

In [18]:
df_preprocessed = df.copy()

Extract from lemmatized text

In [20]:
lemmatized_regions = [lemmatize_text(region) for region in regions]
lemmatized_counties = [lemmatize_text(county) for county in counties]

location_mapping_counties_lemma = dict(zip([lemma.lower() for lemma in lemmatized_counties], counties))
location_mapping_regions_lemma = dict(zip([lemma.lower() for lemma in lemmatized_regions], regions))

def extract_lemmatized_location(text, lemmatized_locations, location_mapping):
    words = text.lower().split()
    detected = [location_mapping[word] for word in words if word in lemmatized_locations]
    return list(set(detected))

df_preprocessed['article_text_county_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_lemmatized_location, lemmatized_locations=[lemma.lower() for lemma in lemmatized_counties], location_mapping=location_mapping_counties_lemma)
df_preprocessed['article_text_region_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_lemmatized_location, lemmatized_locations=[lemma.lower() for lemma in lemmatized_regions], location_mapping=location_mapping_regions_lemma)

Extract from stemmed text

In [22]:
stemmed_regions = [stem_text(region) for region in regions]
stemmed_counties = [stem_text(county) for county in counties]

location_mapping_counties = dict(zip([stem.lower() for stem in stemmed_counties], counties))
location_mapping_regions = dict(zip([stem.lower() for stem in stemmed_regions], regions))

def extract_stemmed_locations(text, stemmed_locations, location_mapping):
    words = text.lower().split()
    detected = [location_mapping[word] for word in words if word in stemmed_locations]
    return list(set(detected))

df_preprocessed['article_text_county_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=[stem.lower() for stem in stemmed_counties], location_mapping=location_mapping_counties)
df_preprocessed['article_text_region_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=[stem.lower() for stem in stemmed_regions], location_mapping=location_mapping_regions)

Read the dataframe containing extreme phenomena

In [24]:
df_phenomena = pd.read_excel('data/extreme_phenomena.xlsx')
extreme_phenomena = df_phenomena['Fenomene'].to_list()

Extract from lemma and stem extreme phenomena

In [28]:
lemmatized_phenomena = [lemmatize_text(phenomena) for phenomena in extreme_phenomena]
phenomena_mapping_lemma = dict(zip([lemma.lower() for lemma in lemmatized_phenomena], extreme_phenomena))

stemmed_phenomena = [stem_text(phenomena) for phenomena in extreme_phenomena]
phenomena_mapping_stem = dict(zip([stem.lower() for stem in stemmed_phenomena], extreme_phenomena))

df_preprocessed['article_text_phenomen_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_lemmatized_location, lemmatized_locations=[lemma.lower() for lemma in lemmatized_phenomena], location_mapping=phenomena_mapping_lemma)
df_preprocessed['article_text_phenomen_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=[stem.lower() for stem in stemmed_phenomena], location_mapping=phenomena_mapping_stem)

In [36]:
df_preprocessed.tail(5)

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date,article_text_lemmatized,article_text_stemmed,article_text_county_lemma,article_text_region_lemma,article_text_county_stem,article_text_region_stem,article_text_phenomen_lemma,article_text_phenomen_stem
7083,https://stirileprotv.ro/stiri/vremea/furtuni-u...,"Ploi torențiale, vijelii și grindină în vest, ...","Meteorologii anunţă, că, până joi dimineaţă, e...",În anumite zone din ţară vântul va sufla cu pu...,2023-08-16 10:47:00,în anumit zonă din ţară vânt vrea sufla cu put...,în anum zon din ţar vânt va sufl cu putere. de...,[],[Dobrogea],[],[Dobrogea],"[Caniculă, Grindină, Vijelie]","[Caniculă, Vijelie]"
7084,https://stirileprotv.ro/stiri/vremea/alerta-de...,Avertisment de caniculă în mai multe regiuni a...,"Sudul şi sud-estul ţării se va afla, de joi pâ...","Astfel, în perioada 17 - 20 august 2023, va fi...",2023-08-17 08:22:00,"astfel , în perioadă 17 - 20 august 2023 , vre...","astfel, în perioad 17 - 20 august 2023, va fi ...",[],[],[],[],[],[Caniculă]
7085,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 17 august. Caniculă în sud și în s...","Vremea se încălzește, iar amiaza aduce canicul...","Aversele apar, local, la deal, la munte, de as...",2023-08-17 08:48:00,"Aversele apărea , local , la deal , la munte ,...","aver apar, local, la deal, la munte, de asemen...","[Iași, Maramureș, Vaslui, București]","[Moldova, Muntenia, Dobrogea, Maramureș, Olten...","[Iași, Maramureș, București]","[Moldova, Dobrogea, Maramureș, Oltenia, Transi...","[Fulger, Caniculă, Grindină]",[Caniculă]
7086,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 18 august. Disconfortul termic est...","Ziua de vineri ne aduce vreme foarte caldă, ca...",Maximele pornesc de la 28 de grade pe litoral ...,2023-08-18 10:44:00,maxim porni de la 28 de grad pe litoral și aju...,maxim porn de la 28 de grad pe litoral și ajun...,"[Maramureș, București]","[Moldova, Dobrogea, Muntenia, Maramureș, Olten...","[Maramureș, București]","[Moldova, Dobrogea, Muntenia, Maramureș, Olten...","[Grindină, Caniculă, Fulger]","[Caniculă, Grindină]"
7087,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 19 august. Disconfort termic ridic...","Ziua de astăzi ne-aduce vreme caldă, disconfor...",Maximele pleacă de la 28 de grade în sudul lit...,2023-08-19 11:09:00,maxim pleca de la 28 de grad în sud litoral şi...,maxim pleac de la 28 de grad în sud litoral şi...,[],"[Moldova, Dobrogea, Muntenia, Transilvania, Bu...",[],"[Moldova, Transilvania, Muntenia, Dobrogea]","[Caniculă, Fulger]",[Grindină]


### Export dataframe

In [30]:
df_preprocessed.to_csv('data/data_preprocessed.csv', index=False)