# ***Scandinavian News Articles***

This notebook is part of the final project in *Computational Tools for Data Science (02807)*.

### ***Imports***

In [None]:
import pandas as pd 
from IPython.display import display, HTML

### ***Import Dataset***

The dataset has been scraped using .. (**TODO:** describe the scraping tool and method) and is loaded from `data/articles.csv`.

In [None]:
# Load the scraped articles; the path is relative to the notebook's working directory.
news_articles = pd.read_csv('data/articles.csv')

# information about the dataset
# Only the last expression of a cell is rendered automatically, so the preview
# and the row count are shown explicitly instead of being silently discarded.
display(news_articles.head(10))
print("Number of articles:", news_articles.shape[0])
news_articles.info()

### ***Preprocessing Steps***

#### 1. Remove irrelevant links

Some Norwegian articles posted by "vg.no" link to another website, "e24.no". These links are not relevant for the project, so the corresponding rows are dropped from the data frame.

In [None]:
# Drop all Norwegian articles from "vg" whose URL points to "e24.no".
# - regex=False: the pattern is a literal URL prefix; as a regex the "." would
#   match any character.
# - na=False: rows with a missing URL are treated as "not an e24 link" rather
#   than putting NaN into the boolean mask.
# - The source is compared case-insensitively because source names are only
#   lower-cased in a later preprocessing step.
is_e24_link = (
    (news_articles['country'] == 'norway')
    & (news_articles['source'].str.lower() == 'vg')
    & news_articles['url'].str.contains('https://e24.no', na=False, regex=False)
)
e24_links = news_articles[is_e24_link]

# Build a fresh frame (no in-place mutation) so re-running the cell is harmless.
news_articles = news_articles[~is_e24_link].reset_index(drop=True)

# print number of articles after preprocessing
print("Number of articles after preprocessing:", news_articles.shape[0])

news_articles.head(10)


#### 2. Remove invalid source

During scraping, the source "news" was included, which is not a valid source. Rows with this source are therefore dropped.

In [None]:
# count rows with source "news"
# Matched case-insensitively: source names are only lower-cased in the next
# preprocessing step, so a variant such as "News" would otherwise survive.
# Rows with a missing source compare as False and are kept, as before.
is_invalid_source = news_articles['source'].str.lower() == 'news'
invalid_source = news_articles[is_invalid_source]
print("Number of articles with source 'news':", invalid_source.shape[0])

# drop rows with source "news"
# reset_index returns a new frame, so later column assignments do not operate
# on a filtered slice that was then mutated in place.
news_articles = news_articles[~is_invalid_source].reset_index(drop=True)
print("Number of articles after dropping 'news' source:", news_articles.shape[0])

#### 3. Standardize source names

Some source names contain uppercase letters and others do not, so we standardize them by converting all source names to lowercase.

In [None]:
# Normalise outlet names to lowercase so that spelling variants of the same
# source end up in the same group in the later group-bys.
lowercased_sources = news_articles['source'].str.lower()
news_articles['source'] = lowercased_sources

#### 4. Remove articles outside the intended time period 2020-2025

In [None]:
# Parse the date column; values that cannot be parsed become NaT.
news_articles = news_articles.assign(
    date=pd.to_datetime(news_articles['date'], errors='coerce')
)
# Rows with NaT are removed by the period filter below; report how many so the
# loss is visible instead of silent.
print("Number of articles with an unparseable date:", int(news_articles['date'].isna().sum()))

# Filter on the calendar year. Comparing against the string '2025-12-31' means
# midnight at the start of that day, which would exclude articles published
# later on 31 December 2025. NaT has no year and therefore never matches.
in_period = news_articles['date'].dt.year.between(2020, 2025)
news_articles = news_articles[in_period].reset_index(drop=True)
print("Number of articles after removing those outside 2020-2025:", news_articles.shape[0])

### ***Dataset Exploration***

#### Source information

The table below presents information about the sources we have retrieved. 

In [None]:
# Columns that identify one outlet within one country.
group_keys = ['country', 'source']

# --- Number of distinct sources per country ---
source_pairs = news_articles[group_keys].drop_duplicates()
unique_sources = source_pairs.sort_values(by='country')
sources_per_country = unique_sources['country'].value_counts()

display(HTML("<h4 style='font-weight:700'>Number of Sources per Country:</h4>"))
counts_text = (
    sources_per_country
    .rename(index=lambda name: name.capitalize())
    .to_string(header=False)
)
display(HTML("<pre style='font-family:monospace'>{}</pre>".format(counts_text)))

# --- Article count per (country, source) ---
src_counts = news_articles.groupby(group_keys).size().reset_index(name='n_articles')

# --- Average worldview_score per (country, source) ---
# Scores are coerced to numbers on a separate frame; non-numeric values become NaN.
temp_view = news_articles.assign(
    worldview_num=pd.to_numeric(news_articles['worldview_score'], errors='coerce')
)
mean_worldview = temp_view.groupby(group_keys)['worldview_num'].mean()
avg_world = mean_worldview.reset_index(name='avg_worldview').round(2)
display_df = src_counts.merge(avg_world, on=group_keys, how='left')

# --- First and last article date per (country, source) ---
time_ranges = (
    news_articles
    .groupby(group_keys)
    .agg(first_article=('date', 'min'), last_article=('date', 'max'))
    .reset_index()
)
first_day = time_ranges['first_article'].dt.strftime('%Y-%m-%d')
last_day = time_ranges['last_article'].dt.strftime('%Y-%m-%d')
time_ranges['time_range'] = first_day + ' to ' + last_day

# --- Assemble and render the final table, ordered by country ---
table_columns = ['source', 'n_articles', 'avg_worldview', 'country', 'time_range']
display_df = (
    display_df
    .merge(time_ranges[group_keys + ['time_range']], on=group_keys)
    .loc[:, table_columns]
    .sort_values(by='country')
    .copy()
)
display(HTML(display_df.to_html(index=False, border=0)))


#### Article Distribution

In [None]:
import matplotlib.pyplot as plt

# One subplot per country, each with grouped bars (one bar per source and year).
# Missing country values are skipped: they cannot be matched with == and
# cannot be capitalised for the title.
countries = news_articles['country'].dropna().unique()
num_countries = len(countries)

# Years shown on the x-axis (the intended study period).
plot_years = range(2020, 2026)
# Fraction of the space between two year ticks that a group of bars may occupy.
group_width = 0.8

# squeeze=False always returns a 2-D array of Axes, so a single country needs no
# special case; max(..., 1) avoids an error when there is nothing to plot.
fig, axes = plt.subplots(1, max(num_countries, 1), figsize=(18, 6), sharey=True, squeeze=False)
axes = axes.ravel()

for ax, country in zip(axes, countries):
    country_data = news_articles[news_articles['country'] == country]
    sources = country_data['source'].dropna().unique()

    # Scale the bar width with the number of sources so that a group never
    # spills into the neighbouring year, however many sources a country has.
    bar_width = group_width / max(len(sources), 1)

    for i, source in enumerate(sources):
        source_years = country_data.loc[country_data['source'] == source, 'date'].dt.year
        year_counts = source_years.value_counts().sort_index()

        # Centre the whole group of bars on the year tick.
        offset = (i - (len(sources) - 1) / 2) * bar_width
        ax.bar(
            year_counts.index + offset,
            year_counts.values,
            width=bar_width,
            label=source
        )

    ax.set_title(f'Articles per Year – {country.capitalize()}')
    ax.set_xlabel('Year')
    ax.set_xticks(plot_years)
    ax.legend()

# The y-axis is shared, so only the left-most subplot needs a label.
axes[0].set_ylabel('Number of Articles')

plt.tight_layout()
plt.show()


#### Average Worldview in each Country from 2020-2025

In [None]:
import matplotlib.pyplot as plt
# Work on a derived frame that carries a numeric worldview score (non-numeric
# values become NaN) and the publication year of every article.
worldview_over_time = news_articles.assign(
    worldview_num=pd.to_numeric(news_articles['worldview_score'], errors='coerce'),
    year=news_articles['date'].dt.year,
)

# Mean worldview score per country and year.
yearly_means = worldview_over_time.groupby(['country', 'year'])['worldview_num'].mean()
avg_worldview_time = yearly_means.reset_index()

# One line per country, drawn in order of first appearance in the table.
worldview_fig, worldview_ax = plt.subplots(figsize=(10, 6))
for country, country_data in avg_worldview_time.groupby('country', sort=False):
    worldview_ax.plot(
        country_data['year'],
        country_data['worldview_num'],
        marker='o',
        label=country.capitalize(),
    )

worldview_ax.set_title('Average Worldview Score Over Time by Country')
worldview_ax.set_xlabel('Year')
worldview_ax.set_ylabel('Average Worldview Score')
worldview_ax.set_xticks(avg_worldview_time['year'].unique())
worldview_ax.legend()
worldview_ax.grid()
plt.show()

### ***TF-IDF***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Collect the NLTK stop-word lists for every language used here.
stopword_languages = ('english', 'swedish', 'danish', 'norwegian')
language_stopwords = []
for language in stopword_languages:
    language_stopwords.extend(stopwords.words(language))
base_stopwords = set(language_stopwords)

# Domain-specific extra stopwords: outlets, generic news words, light verbs/adverbs
extra_stopwords = {
    "vg", "nrk", "aftonbladet", "dr",
    "artikler", "artikkel", "artikel",
    "sier", "mener", "siste", "litt", "mye", "mer",
    "år", "dag", "dager",
    "blant", "andre", "må", "fikk", "veldig", "annet", "går", "nå",
    "uten", "over", "etter", "inn", "to", "via",
    "se", "gå", "kom", "kommet",
    "ifølge", "har", "ha", "være", "viktig",
    "nyhet", "nyheter", "les", "lese",
    # add more as you notice junk terms
}

# Merge both sets into the single stop-word list passed to the vectorizer.
custom_stopwords = list(base_stopwords | extra_stopwords)

vectorizer = TfidfVectorizer(
    stop_words=custom_stopwords,   # must be list, not set
    max_features=5000,
    ngram_range=(1, 2),
)

# Each document is the headline followed by the article body; missing parts
# are replaced by empty strings so the concatenation never yields NaN.
headline_text = news_articles['headline'].fillna('')
body_text = news_articles['content'].fillna('')
news_articles['text'] = headline_text + " " + body_text

tfidf_matrix = vectorizer.fit_transform(news_articles['text'])
feature_names = vectorizer.get_feature_names_out()

print("TF-IDF shape:", tfidf_matrix.shape)


In [None]:
def top_tfidf_terms(mask, n=30, matrix=None, names=None):
    """Return the ``n`` terms with the highest mean TF-IDF score among selected rows.

    Parameters
    ----------
    mask : array-like
        Row selector for ``matrix``, typically a boolean Series or array with
        one entry per document.
    n : int, default 30
        Maximum number of terms to return.
    matrix : sparse or dense 2-D matrix, optional
        Document-term matrix to rank. Defaults to the notebook-level
        ``tfidf_matrix``.
    names : sequence of str, optional
        Term for each column of ``matrix``. Defaults to the notebook-level
        ``feature_names``.

    Returns
    -------
    list of (term, mean_score) tuples
        Sorted by descending mean score. Empty when no rows are selected.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    if matrix is None:
        matrix = tfidf_matrix
    if names is None:
        names = feature_names

    # Convert to a plain array so a pandas Series selects rows by position.
    sub = matrix[np.asarray(mask)]
    if sub.shape[0] == 0:
        # The mean over zero rows would be NaN for every term.
        return []

    # np.asarray(...).ravel() flattens the column means whether mean() returns
    # an np.matrix or a plain ndarray, so the code does not depend on `.A1`.
    mean_scores = np.asarray(sub.mean(axis=0)).ravel()
    top_idx = mean_scores.argsort()[::-1][:n]
    return [(names[i], mean_scores[i]) for i in top_idx]

# Example: top terms for Norway
# Boolean mask with one entry per article; True where the article's country is 'norway'.
norway_mask = news_articles['country'] == 'norway'
top_tfidf_terms(norway_mask)
