### Import libraries

In [1]:
import pandas as pd

from unidecode import unidecode

### Read data

In [2]:
# Load the crawled weather-news articles from the JSON dump into a dataframe
df = pd.read_json(path_or_buf='../data/data_crawl/meteo_news_updated.json')

### Data inspection

Check dataframe for missing values

In [3]:
# Keep only the rows in which at least one column is missing, then display them
nan_df = df.loc[df.isna().any(axis='columns')]
nan_df

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date
887,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 5 iulie. Țara se împarte între ...",,"[Vremea în țară, În Dobrogea şi Bărăgan, la am...",05-07-2022 07:10
942,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 11 iunie. Prognoza meteo pentru...",,"[\nSursa: , \n, \nEtichete:\n, ,\n, ,\n, ,\n, ...",11-06-2022 07:19
1291,https://stirileprotv.ro/stiri/vremea/vremea-12...,"Vremea, 12 Octombrie. Vremea se menține deoseb...",,"[ , Vreme ceva mai bună găsim doar în vest, î...",12-10-2021 07:24
2400,https://vacantalamunte.stirileprotv.ro/stiri/v...,,,[],
2435,https://vacantalamunte.stirileprotv.ro/stiri/n...,,,[],
3367,https://vacantalamare.stirileprotv.ro/stiri/ma...,,,[],
4149,https://vacantalamunte.stirileprotv.ro/stiri/s...,,,[],
6009,https://stirileprotv.ro/stiri/meteo/afla-cum-e...,Afla cum e vremea in Romania din ora in ora,,"[Sursa: ANM, \nSursa: , \n, \nEtichete:\n, ,\n...",26-10-2011 20:02
6906,https://stirileprotv.ro/stiri/meteo/vom-avea-t...,Vom avea temperaturi de peste 33 de grade in s...,,"[\nMasa de aer cald, tropical, va domina jumat...",23-05-2009 16:33


In [4]:
# Check shape: (rows, columns) of the subset of rows that contain at least one missing value
nan_df.shape

(9, 5)

In [5]:
# After reviewing these articles, we found that their URLs are broken, the articles have been deleted, and so on.
# We therefore remove them, as they account for only a small fraction of our dataframe.

Drop the rows that contain NaN values

In [6]:
# Drop every row that has at least one missing value.
# Rebinding `df` (instead of inplace=True) keeps the step chainable and avoids
# mutating the object that earlier cells loaded and inspected.
df = df.dropna()

Convert "article_date" to datetime datatype with pandas

In [7]:
# Parse the 'DD-MM-YYYY HH:MM' strings into datetimes, order the articles
# chronologically and renumber the index from 0
df = (
    df.assign(article_date=pd.to_datetime(df['article_date'], format='%d-%m-%Y %H:%M'))
    .sort_values(by='article_date')
    .reset_index(drop=True)
)

Check dataframe shape

In [8]:
# (rows, columns) of the dataframe after the rows with missing values were dropped
df.shape

(7088, 5)

In [9]:
# With 7,088 rows, our dataset is substantial enough to build a POC or to test our hypothesis

Check dtypes for each column in our dataframe

In [10]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7088 entries, 0 to 7087
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   article_url    7088 non-null   object        
 1   article_title  7088 non-null   object        
 2   article_lead   7088 non-null   object        
 3   article_text   7088 non-null   object        
 4   article_date   7088 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 277.0+ KB


Remove the trailing boilerplate from article_text (the first entry starting with "\nSursa:" and everything after it)

In [11]:
def remove_sursa(text_list):
    """Trim an article's fragment list at its first source-credit entry.

    Scans ``text_list`` in order and returns the fragments that precede the
    first one beginning with the literal ``'\\nSursa:'``; that fragment and
    everything after it are discarded. When no fragment matches, the input
    list itself is returned unchanged.
    """
    cut_at = next(
        (pos for pos, fragment in enumerate(text_list) if fragment.startswith('\nSursa:')),
        None,
    )
    if cut_at is None:
        return text_list
    return text_list[:cut_at]

# Cut the source/tags tail off every article's list of text fragments
df['article_text'] = df['article_text'].map(remove_sursa)

Clean text

In [12]:
def clean_text_list(text_list):
    """Normalise the text fragments of a single article.

    Fragments that consist solely of whitespace are dropped. In every
    remaining fragment each non-breaking space (``'\\xa0'``) is turned into a
    regular space, runs of spaces are collapsed to a single one, and
    leading/trailing whitespace is stripped.

    Args:
        text_list: iterable of strings (the fragments of one article).

    Returns:
        A new list with the cleaned, non-empty fragments in their original
        order.
    """
    cleaned_list = []
    for text in text_list:
        # str.strip() also strips '\xa0', so fragments made up only of
        # (non-breaking) whitespace are skipped here.
        if text.strip() == '':
            continue
        # Replace rather than delete the non-breaking space: deleting it would
        # glue together words that were separated only by '\xa0'.
        spaced = text.replace('\xa0', ' ')
        # Collapse the space runs the substitution may create (e.g. ' \xa0'),
        # leaving other whitespace such as inner newlines untouched.
        collapsed = ' '.join(chunk for chunk in spaced.split(' ') if chunk)
        cleaned_list.append(collapsed.strip())
    return cleaned_list

# Clean each article's fragments and merge them into one space-separated string
df['article_text'] = df['article_text'].apply(
    lambda fragments: ' '.join(clean_text_list(fragments))
)

Drop rows with NaN values

In [13]:
# The cleaning steps above turn an article whose text was entirely boilerplate
# into an empty string, not NaN, so dropna() alone would keep it. Filter out
# the empty texts explicitly, then drop any rows that still contain NaN.
df = df[df['article_text'].str.strip().ne('')].dropna()

### Export cleaned dataset

In [14]:
# Write the cleaned articles to an Excel workbook, leaving out the index column
df.to_excel(excel_writer='../data/data_crawl/meteo_news_cleaned.xlsx', index=False)