In [15]:
#we load the libraries needed
import pandas as pd
import re
from datetime import datetime
import matplotlib.pyplot as plt



#we read the scraped articles from the csv into a dataframe
articles_df = pd.read_csv(filepath_or_buffer="articles_2400_2422.csv")

#and look at the first five rows to see which columns we have
print(articles_df.head(n=5))


                                         article_url  \
0  https://www.thejournal.ie/bujo-planning-permis...   
1  https://www.the42.ie/tyrone-dublin-league-repo...   
2  https://www.thejournal.ie/missing-jamie-walsh-...   
3  https://www.thejournal.ie/dept-health-evacuate...   
4  https://www.thejournal.ie/valentines-day-deals...   

                                        article_text article_date_created  \
0  \nAWARD-WINNING DUBLIN BURGER restaurant BuJo ...   1.01am, 9 Mar 2019   
1  Kevin O’Brien reports from Croke Park\nTHIS WI...  9.25pm, 16 Mar 2019   
2  A FOURTEEN-YEAR-old who went missing in Dublin...  7.45pm, 22 Mar 2019   
3  LAST UPDATE|4 Mar 2019\n\nTheJournal.ie/ YouTu...  12.08pm, 4 Mar 2019   
4  GROCERY SALES INCREASED by 3.5% in the latest ...  9.25pm, 11 Mar 2019   

  article_date_updated  
0                  NaN  
1                  NaN  
2                  NaN  
3           4 Mar 2019  
4                  NaN  


## Tidy up the article text ##

In [20]:
#we clean the text by putting everything in lower case, removing "last update" metadata and extra whitespace
def clean_article_text(text):
    """Normalise one article body for later text analysis.

    The steps are applied in this order:
      1. lower-case the text,
      2. remove "last update|<day> <month> <year>" stamps,
      3. collapse every run of whitespace (including newlines) into one space
         and trim both ends.

    Punctuation is left untouched.

    Parameters
    ----------
    text : str
        Raw article text. Any value that is not a string (for example None,
        or the float NaN pandas uses for an empty csv cell) is treated as
        "no text".

    Returns
    -------
    str
        The cleaned text, or an empty string when `text` is not a string.
    """
    #missing cells arrive as None/NaN, which have no .lower(); treat them as empty text
    if not isinstance(text, str):
        return ''

    #put all the text in lowercase (the patterns below rely on this)
    lowered = text.lower()

    #we remove "last update" lines or similar metadata
    without_stamp = re.sub(r'last update\|?\s*\d{1,2}\s\w+\s\d{4}', '', lowered)

    #and remove extra whitespace
    return re.sub(r'\s+', ' ', without_stamp).strip()

#we drop the rows that have no article text, then apply the cleaning function to the rest
#.copy() gives us an independent dataframe, so adding a column below cannot
#raise pandas' SettingWithCopyWarning on the filtered result
articles_df = articles_df.dropna(subset=['article_text']).copy()
articles_df['cleaned_text'] = articles_df['article_text'].apply(clean_article_text)


In [21]:
#we look at the first five rows again to check that the cleaned_text column was added
print(articles_df.head(n=5))

                                         article_url  \
0  https://www.thejournal.ie/bujo-planning-permis...   
1  https://www.the42.ie/tyrone-dublin-league-repo...   
2  https://www.thejournal.ie/missing-jamie-walsh-...   
3  https://www.thejournal.ie/dept-health-evacuate...   
4  https://www.thejournal.ie/valentines-day-deals...   

                                        article_text article_date_created  \
0  \nAWARD-WINNING DUBLIN BURGER restaurant BuJo ...   1.01am, 9 Mar 2019   
1  Kevin O’Brien reports from Croke Park\nTHIS WI...  9.25pm, 16 Mar 2019   
2  A FOURTEEN-YEAR-old who went missing in Dublin...  7.45pm, 22 Mar 2019   
3  LAST UPDATE|4 Mar 2019\n\nTheJournal.ie/ YouTu...  12.08pm, 4 Mar 2019   
4  GROCERY SALES INCREASED by 3.5% in the latest ...  9.25pm, 11 Mar 2019   

  article_date_updated                                       cleaned_text  
0                  NaN  award-winning dublin burger restaurant bujo ha...  
1                  NaN  kevin o’brien reports fr

In [22]:
#check the structure: number of rows, column names, non-null counts and dtypes
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 920
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   article_url           920 non-null    object
 1   article_text          920 non-null    object
 2   article_date_created  919 non-null    object
 3   article_date_updated  94 non-null     object
 4   cleaned_text          920 non-null    object
dtypes: object(5)
memory usage: 43.1+ KB


In [23]:
#the created and updated dates are stored as text, so we parse both into datetimes
#(any value that does not match the given format becomes NaT because of errors='coerce')
articles_df = articles_df.assign(
    article_date_created=lambda frame: pd.to_datetime(
        frame['article_date_created'],
        format='%I.%M%p, %d %b %Y',
        errors='coerce',
    ),
    article_date_updated=lambda frame: pd.to_datetime(
        frame['article_date_updated'],
        format='%d %b %Y',
        errors='coerce',
    ),
)


In [24]:
#we check the data types of the two date columns and what the parsed values look like
print(articles_df.dtypes[['article_date_created', 'article_date_updated']])
print(articles_df.loc[:, ['article_date_created', 'article_date_updated']].head(n=5))


article_date_created    datetime64[ns]
article_date_updated    datetime64[ns]
dtype: object
  article_date_created article_date_updated
0  2019-03-09 01:01:00                  NaT
1  2019-03-16 21:25:00                  NaT
2  2019-03-22 19:45:00                  NaT
3  2019-03-04 12:08:00           2019-03-04
4  2019-03-11 21:25:00                  NaT


In [25]:
#we check the structure again to confirm the two date columns are now datetimes
articles_df.info()
#this is what we wanted

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 920
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   article_url           920 non-null    object        
 1   article_text          920 non-null    object        
 2   article_date_created  919 non-null    datetime64[ns]
 3   article_date_updated  94 non-null     datetime64[ns]
 4   cleaned_text          920 non-null    object        
dtypes: datetime64[ns](2), object(3)
memory usage: 43.1+ KB


In [26]:
#we save this clean data into another csv; index=False keeps the row index out of the file
articles_df.to_csv('cleaned_articles_other_subjects.csv', index=False)

In [None]:
## Now we need to compare those articles with the Brexit articles in terms of sentiment