# ***Scandinavian News Articles***

This notebook is part of the final project for the course *Computational Tools for Data Science (02807)*.

### **Imports**

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import numpy as np

### **Import Datasets**

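The datasets were scraped from Scandinavian news sites (**TODO:** name the scraping tool/method) and are stored as one CSV file per country.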

In [2]:
# Read the three per-country article CSVs in one pass; the order of the paths
# matches the order of the names on the left-hand side.
norwegian_articles, danish_articles, swedish_articles = map(
    pd.read_csv,
    (
        './data/norway_all_articles_classified.csv',
        './data/denmark_all_articles_classified.csv',
        './data/sweden_all_articles_classified.csv',
    ),
)

# Preview the first rows of the Norwegian frame.
norwegian_articles.head(3)

Unnamed: 0,date,headline,content,url,source,country,primary_agenda,primary_category,political_categories,cultural_categories,political_score,cultural_score
0,2025-11-05,Sykefraværet stupte: Her jobber de fire dagers...,Velkommen til drømme-barnehagen! Plenty ansatt...,https://www.vg.no/nyheter/i/MnMr5K/80-100-ordn...,vg,norway,cultural,cultural_identity,elections,"cultural_identity,arts",2,2
1,2025-10-12,Østers-misjonæren som må kaste alt,Et kloakkutslipp i Glomma har stanset østershø...,https://www.vg.no/nyheter/i/RzPMed/oestersprod...,vg,norway,political,elections,"elections,defense",,3,0
2,2025-10-04,Forsvarer landslagsprofil etter rasismeanklage...,Aron Dønnum er blant spillerne som er tatt med...,https://www.nrk.no/sport/martin-odegaard-er-ik...,nrk,norway,cultural,arts,defense,arts,1,1


In [3]:
# Stack the per-country frames into a single frame; ignore_index=True replaces
# each frame's own row labels with one new running index.
country_frames = [norwegian_articles, danish_articles, swedish_articles]
df = pd.concat(country_frames, ignore_index=True)

In [4]:
# Discard articles whose 'content' value is missing, then take an explicit
# copy so `df` is an independent frame from here on.
rows_with_content = df.dropna(subset=['content'])
df = rows_with_content.copy()

In [5]:
# Patterns for the two kinds of token that are blanked out during cleaning:
# web addresses (http(s)://... or www....) and runs of digits.
url_re = re.compile(r'https?://\S+|www\.\S+')
num_re = re.compile(r'\d+')


def basic_clean(text: str) -> str:
    """Normalise raw article text before vectorisation.

    The text is lower-cased, every URL and every run of digits is replaced
    by a single space, and finally all whitespace runs are collapsed to one
    space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    cleaned = text.lower()
    # URLs are removed before digits so that numbers inside a URL do not
    # split it into fragments that would survive the URL pattern.
    for pattern in (url_re, num_re):
        cleaned = pattern.sub(' ', cleaned)
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()


# Coerce every entry to str first, then normalise each one with basic_clean.
content_as_text = df['content'].astype(str)
df['content'] = content_as_text.map(basic_clean)

In [6]:
# Vectoriser settings: unigrams and bigrams; terms must appear in at least
# 5 documents and in at most 80% of them; term frequency is log-scaled and
# each document vector is L2-normalised.
tfidf_settings = {
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'sublinear_tf': True,
    'norm': 'l2',
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Fit on the whole cleaned corpus to inspect the size of the resulting matrix.
X = tfidf.fit_transform(df['content'])
X.shape  # (num_docs, num_features)

(1192, 9356)

In [7]:
# Target: 'primary_agenda' (e.g., 'political', 'cultural' or 'unclassified').
# Rows without a label cannot be used for supervised training, so drop them.
df = df.dropna(subset=['primary_agenda'])
y = df['primary_agenda'].astype(str)

# One seed shared by the split and the classifier so that a fresh
# "Restart & Run All" reproduces the same report.
RANDOM_STATE = 42

# Hold out 20% of the articles for evaluation; stratify=y keeps the label
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'], y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# The vectoriser lives inside the pipeline, so it is fitted on the training
# texts only when clf_pipeline.fit is called.
clf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.8,
        sublinear_tf=True,
        norm='l2'
    )),
    # LinearSVC's solver uses a pseudo-random generator; seed it explicitly.
    ('clf', LinearSVC(random_state=RANDOM_STATE))
])

clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    cultural       0.72      0.86      0.78       133
   political       0.72      0.62      0.66        78
unclassified       0.85      0.39      0.54        28

    accuracy                           0.72       239
   macro avg       0.76      0.62      0.66       239
weighted avg       0.73      0.72      0.71       239





In [8]:
# Pull the fitted steps back out of the pipeline for inspection.
vectorizer = clf_pipeline.named_steps['tfidf']
clf = clf_pipeline.named_steps['clf']

feature_names = np.asarray(vectorizer.get_feature_names_out())
classes = clf.classes_

# Number of highest-weighted terms to show per class.
TOP_N = 15

# With more than two classes LinearSVC.coef_ has one row per class
# (one-vs-rest). For a two-class problem it has a single row whose positive
# weights favour classes[1] and whose negative weights favour classes[0];
# expand that row into one row per class so the loop below works in both
# cases instead of raising IndexError on the second class.
class_weights = clf.coef_
if len(classes) == 2 and class_weights.shape[0] == 1:
    class_weights = np.vstack([-class_weights[0], class_weights[0]])

for cls, weights in zip(classes, class_weights):
    # argsort is ascending: take the last TOP_N indices and reverse them so
    # the strongest positive term is printed first.
    top_idx = np.argsort(weights)[-TOP_N:][::-1]
    print(f"\nTop terms for class '{cls}':")
    print(feature_names[top_idx])


Top terms for class 'cultural':
['ai' 'sig' 'artikler' 'dette' 'ham' 'upp' 'at vi' 'verden' 'the' 'start'
 'ind' 'kunde' 'populære' 'og' 'så']

Top terms for class 'political':
['eu' 'valg' 'forsvarsminister' 'lars' 'regeringen' 'usa' 'at hun'
 'reuters' 'stemme' 'mot' 'regering' 'valgt' 'kan være' 'ikke'
 'demokraterne']

Top terms for class 'unclassified':
['check' 'att ha' 'anmelder' 'kristian' 'agf' 'borgmester new'
 'stockholms' 'kampen' 'sig om' 'guld' 'franska' 'bilen' 'børn' 'strøm'
 'ti år']
