### Import libraries

In [1]:
import pandas as pd
import spacy
import nltk

from nltk.stem.snowball import SnowballStemmer
from unidecode import unidecode

nlp = spacy.load("ro_core_news_md")
stemmer = SnowballStemmer("romanian")

### Read data

In [2]:
df = pd.read_json('data/meteo_news_updated.json')

### Data inspection

Check dataframe for missing values

In [3]:
nan_df = df[df.isna().any(axis=1)]
nan_df

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date
887,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 5 iulie. Țara se împarte între ...",,"[Vremea în țară, În Dobrogea şi Bărăgan, la am...",05-07-2022 07:10
942,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 11 iunie. Prognoza meteo pentru...",,"[\nSursa: , \n, \nEtichete:\n, ,\n, ,\n, ,\n, ...",11-06-2022 07:19
1291,https://stirileprotv.ro/stiri/vremea/vremea-12...,"Vremea, 12 Octombrie. Vremea se menține deoseb...",,"[ , Vreme ceva mai bună găsim doar în vest, î...",12-10-2021 07:24
2400,https://vacantalamunte.stirileprotv.ro/stiri/v...,,,[],
2435,https://vacantalamunte.stirileprotv.ro/stiri/n...,,,[],
3367,https://vacantalamare.stirileprotv.ro/stiri/ma...,,,[],
4149,https://vacantalamunte.stirileprotv.ro/stiri/s...,,,[],
6009,https://stirileprotv.ro/stiri/meteo/afla-cum-e...,Afla cum e vremea in Romania din ora in ora,,"[Sursa: ANM, \nSursa: , \n, \nEtichete:\n, ,\n...",26-10-2011 20:02
6906,https://stirileprotv.ro/stiri/meteo/vom-avea-t...,Vom avea temperaturi de peste 33 de grade in s...,,"[\nMasa de aer cald, tropical, va domina jumat...",23-05-2009 16:33


In [4]:
# Check shape
nan_df.shape

(9, 5)

In [5]:
# After reviewing those articles, we have determined that the URL is broken, the article has been deleted, and so on. 
# Therefore, we have decided to remove them, as they constitute only a small percentage of our dataframe.

Drop the NaN's

In [6]:
df.dropna(inplace=True)

Convert "article_date" to datetime datatype with pandas

In [7]:
df['article_date'] = pd.to_datetime(df['article_date'], format='%d-%m-%Y %H:%M')

# Sort values by article_date
df = df.sort_values('article_date').reset_index(drop=True)

Check dataframe shape

In [8]:
df.shape

(7088, 5)

In [9]:
# Our dataset is pretty consistent in order to build a POC or to prove our hypothesis

Check dtypes for each column in our dataframe

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7088 entries, 0 to 7087
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   article_url    7088 non-null   object        
 1   article_title  7088 non-null   object        
 2   article_lead   7088 non-null   object        
 3   article_text   7088 non-null   object        
 4   article_date   7088 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 277.0+ KB


Remove useless elements from article_text

In [11]:
def remove_sursa(text_list):
    for i in range(len(text_list)):
        if text_list[i].startswith('\nSursa:'):
            return text_list[:i]
    return text_list

df['article_text'] = df['article_text'].apply(remove_sursa)

Clean text

In [12]:
def clean_text_list(text_list):
    cleaned_list = [text.replace('\xa0', '').strip() for text in text_list if text.strip() != '']
    return cleaned_list

df['article_text'] = df['article_text'].apply(clean_text_list)

# Join sentences
df['article_text'] = df['article_text'].apply(lambda sentences: ' '.join(sentences))

Remove diacritics from our dataframe

In [13]:
def remove_diacritics(text):
    return unidecode(text)

df = df.applymap(lambda x: remove_diacritics(str(x)))

Read the dataframe containing geo locations

In [20]:
# Read the dataframe containing the counties and the regions
df_romania = pd.read_csv('data/romania_counties.csv')
counties = df_romania['Judet'].to_list()
regions = df_romania['Regiuni'].dropna().to_list()

Lemmatize article_text

In [15]:
# Function to lemmatize a text
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Lemmatize the 'article_text' column
df['article_text_lemmatized'] = df['article_text'].apply(lemmatize_text)

Stem article_text

In [16]:
# Function to stem text
def stem_text(text):
    return " ".join(stemmer.stem(word) for word in text.split())

# Stem the 'article_text' column
df['article_text_stemmed'] = df['article_text'].apply(stem_text)

Create a copy of the original dataframe

In [49]:
df_preprocessed = df.copy()

Stem counties and regions

In [50]:
stemmed_regions = [stem_text(region) for region in regions]
stemmed_counties = [stem_text(county) for county in counties]

Extract from lemmatized text

In [51]:
def extract_location(text, locations):
    return [location for location in locations if location.lower() in text.lower()]

df_preprocessed['article_text_county_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_location, locations=counties)
df_preprocessed['article_text_region_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_location, locations=regions)

Extract from stemmed text

In [52]:
location_mapping = dict(zip(stemmed_counties, counties))
location_mapping = dict(zip(stemmed_regions, regions))

def extract_stemmed_locations(text, stemmed_locations):
    detected = []
    for word in text.split():
        for location in stemmed_locations:
            if location in word:
                detected.append(location)
    return list(set(detected))

df_preprocessed['article_text_county_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=stemmed_counties)
df_preprocessed['article_text_region_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=stemmed_regions)

Read the dataframe containing extreme phenomena

In [66]:
df_phenomena = pd.read_csv('data/extreme_phenomena.csv')
extreme_phenomena = df_phenomena['Fenomene extreme'].to_list()

Extract from lemma and stem extreme phenomena

In [69]:
df_preprocessed['article_text_phenomena_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_location, locations=extreme_phenomena)

stemmed_phenomena = [stem_text(phenomena) for phenomena in extreme_phenomena]
location_mapping = dict(zip(stemmed_phenomena, extreme_phenomena))
df_preprocessed['article_text_phenomena_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=stemmed_phenomena)

### Export dataframe

In [70]:
df_preprocessed.to_csv('data_preprocessed.csv', index=False)

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date,article_text_lemmatized,article_text_stemmed,article_text_county_lemma,article_text_region_lemma,article_text_county_stem,article_text_region_stem,article_text_phenomena_lemma,article_text_phenomena_stem
0,https://stirileprotv.ro/stiri/social/vezi-cum-...,Vezi cum va fi vremea pe continent si in tara ...,Temperaturi scazute in tara! - 10 grade Celsiu...,In Europa precipitatiile vor fi consistente in...,2009-02-19 17:24:00,in Europa precipitatie vrea fi consistent in j...,in europ precip vor fi consistent in jumat est...,"[Maramures, Mures, Bucuresti]","[Transilvania, Moldova, Bucovina, Dobrogea, Ba...","[tulc, mures, maramures, bucurest]","[bucovin, moldov, crisan, transilvan, maramure...","[Viscol, Ger]","[ger, viscol]"
1,https://stirileprotv.ro/stiri/social/vezi-cum-...,Vezi cum va fi vremea in urmatoarele trei zile!,Ziua de astazi va aduce inca un pic de ninsoar...,"Maine se va insenina in nord-vest, in schimb a...",2009-02-19 17:24:00,"Maine sine vrea insenina in nord-vest , in sch...","main se va insenin in nord-vest, in schimb ast...",[],[Banat],[],[banat],[],[]
2,https://stirileprotv.ro/stiri/social/vezi-aici...,Vezi aici cum va fi vremea in primele trei zil...,"Incepem saptamana cu vreme rece, chiar geroasa...","In Europa, presiunea atmosferica se va mentine...",2009-02-19 17:25:00,"in Europa , presiune atmosferică sine vrea men...","in europa, presiun atmosfer se va mentin ridic...",[Bucuresti],"[Transilvania, Moldova]",[bucurest],"[transilvan, moldov]","[Ciclon, Ger]","[ger, ciclon]"
3,https://stirileprotv.ro/stiri/social/vezi-aici...,Vezi aici cum va fi vremea pe continent si in ...,Pe continent precipitatiile vor fi indeosebi s...,"Masa de aer rece, de origine polara se va depl...",2009-02-19 17:25:00,"masă de aer rece , de origine polara sine vrea...","mas de aer rece, de origin polar se va deplas ...","[Maramures, Mures, Bucuresti]","[Transilvania, Muntenia, Moldova, Bucovina, Do...","[bucurest, mures, maramures]","[munten, bucovin, moldov, transilvan, maramure...",[Ger],[ger]
4,https://stirileprotv.ro/stiri/social/meteo-afl...,Meteo: afla cum va fi vremea in urmatoarele zi...,Cine a pariat pe primavara s-a cam grabit. Vre...,In Europa precipitatiile se concentreaza pe zo...,2009-02-19 17:27:00,in Europa precipitatie sine concentreaza pe zo...,in europ precip se concentreaz pe zon sudic si...,[Olt],"[Transilvania, Oltenia, Muntenia, Moldova, Dob...","[bucurest, olt]","[munten, olten, moldov, transilvan, dobrog, ba...",[Viscol],[viscol]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7083,https://stirileprotv.ro/stiri/vremea/furtuni-u...,"Ploi torentiale, vijelii si grindina in vest, ...","Meteorologii anunta, ca, pana joi dimineata, e...",In anumite zone din tara vantul va sufla cu pu...,2023-08-16 10:47:00,in anumit zonă din tară vant vrea sufla cu put...,in anum zon din tar vant va sufl cu putere. de...,[],[Dobrogea],[],"[munten, dobrog]",[Grindina],"[canicul, grindin, vijel]"
7084,https://stirileprotv.ro/stiri/vremea/alerta-de...,Avertisment de canicula in mai multe regiuni a...,"Sudul si sud-estul tarii se va afla, de joi pa...","Astfel, in perioada 17 - 20 august 2023, va fi...",2023-08-17 08:22:00,"astfel , in perioadă 17 - 20 august 2023 , vre...","astfel, in perioad 17 - 20 august 2023, va fi ...",[Bucuresti],[],[bucurest],[],[Canicula],[canicul]
7085,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 17 august. Canicula in sud si in s...","Vremea se incalzeste, iar amiaza aduce canicul...","Aversele apar, local, la deal, la munte, de as...",2023-08-17 08:48:00,"Aversele apărea , local , la deal , la munte ,...","aver apar, local, la deal, la munte, de asemen...","[Iasi, Maramures, Mures, Olt, Vaslui, Bucuresti]","[Transilvania, Oltenia, Muntenia, Moldova, Buc...","[ias, bucurest, vasl, maramures, olt, mures]","[munten, bucovin, olten, moldov, transilvan, m...","[Grindina, Canicula, Ger, Fulger]","[canicul, ger, grindin, fulger]"
7086,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 18 august. Disconfortul termic est...","Ziua de vineri ne aduce vreme foarte calda, ca...",Maximele pornesc de la 28 de grade pe litoral ...,2023-08-18 10:44:00,maxim porni de la 28 de grad pe litoral si aju...,maxim porn de la 28 de grad pe litoral si ajun...,"[Maramures, Mures, Olt, Bucuresti]","[Transilvania, Oltenia, Muntenia, Moldova, Buc...","[bucurest, mures, maramures, olt]","[munten, bucovin, olten, moldov, transilvan, m...","[Canicula, Ger, Fulger]","[canicul, ger, grindin, fulger]"
