# Nordic Content Recommender — Data Exploration

This notebook explores publicly available movie and TV metadata
used as the foundation for a Nordic-focused recommendation system.

The goal is to:
- validate data loading
- understand available features
- prepare for embedding and modeling steps


In [1]:
import sys
import pandas as pd
import numpy as np

# Record interpreter and library versions alongside the outputs below.
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")


Python version: 3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]
Pandas version: 2.3.3
Numpy version: 2.3.5


In [2]:
# Small hand-written sample of titles; each tuple in `sample_rows` follows
# the field order given by `sample_columns`.
sample_columns = [
    "title_id",
    "title",
    "year",
    "country",
    "language",
    "genres",
    "overview",
    "popularity",
]

sample_rows = [
    (
        1,
        "The Bridge",
        2011,
        "SE/DK",
        "sv",
        ["Crime", "Drama", "Thriller"],
        "A body is found on the bridge between Sweden and Denmark.",
        8.7,
    ),
    (
        2,
        "Force Majeure",
        2014,
        "SE",
        "sv",
        ["Drama"],
        "A family vacation in the Alps takes an unexpected turn.",
        7.8,
    ),
    (
        3,
        "The Worst Person in the World",
        2021,
        "NO",
        "no",
        ["Drama", "Romance"],
        "A young woman navigates love and work in Oslo.",
        8.3,
    ),
]

# One record (dict keyed by column name) per title.
data = [dict(zip(sample_columns, row)) for row in sample_rows]

df = pd.DataFrame(data)
df


Unnamed: 0,title_id,title,year,country,language,genres,overview,popularity
0,1,The Bridge,2011,SE/DK,sv,"[Crime, Drama, Thriller]",A body is found on the bridge between Sweden a...,8.7
1,2,Force Majeure,2014,SE,sv,[Drama],A family vacation in the Alps takes an unexpec...,7.8
2,3,The Worst Person in the World,2021,NO,no,"[Drama, Romance]",A young woman navigates love and work in Oslo.,8.3


In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title_id    3 non-null      int64  
 1   title       3 non-null      object 
 2   year        3 non-null      int64  
 3   country     3 non-null      object 
 4   language    3 non-null      object 
 5   genres      3 non-null      object 
 6   overview    3 non-null      object 
 7   popularity  3 non-null      float64
dtypes: float64(1), int64(2), object(5)
memory usage: 320.0+ bytes


In [4]:
# Summary statistics (count, mean, std, min/max, quartiles) for release years.
release_years = df["year"]
release_years.describe()

count       3.000000
mean     2015.333333
std         5.131601
min      2011.000000
25%      2012.500000
50%      2014.000000
75%      2017.500000
max      2021.000000
Name: year, dtype: float64

In [5]:
# Flatten the per-title genre lists to one genre per row, then count titles per genre.
genres_flat = df["genres"].explode()
genres_flat.value_counts()

genres
Drama       3
Crime       1
Thriller    1
Romance     1
Name: count, dtype: int64

In [6]:
def recommend_by_genre(df, genre, top_n=3):
    """
    Return the top N most popular titles that match a given genre.
    This is a simple baseline recommender used for exploration.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "genres" column (a collection of genre labels per
        row) and a "popularity" column used for ranking.
    genre : str
        Genre label to look for in each row's "genres" entry.
    top_n : int, default 3
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``df`` sorted by "popularity" (highest first) and
        truncated to ``top_n`` rows. Rows whose "genres" entry is missing
        (None/NaN) never match, and an empty ``df`` yields an empty result
        that keeps the original columns.
    """
    def _has_genre(genres):
        # Missing metadata (None/NaN) does not support ``in``; treat it as
        # "no match" instead of letting the TypeError abort the whole query.
        try:
            return genre in genres
        except TypeError:
            return False

    # Cast to bool so an empty frame still yields a boolean mask: an empty
    # object-dtype Series would otherwise be read as a list of column labels.
    mask = df["genres"].apply(_has_genre).astype(bool)

    return (
        df.loc[mask]
        # Stable sort keeps equally popular titles in their original order.
        .sort_values("popularity", ascending=False, kind="stable")
        .head(top_n)
    )

# Try the baseline recommender on the sample frame for the "Drama" genre.
recommend_by_genre(df, genre="Drama")

Unnamed: 0,title_id,title,year,country,language,genres,overview,popularity
0,1,The Bridge,2011,SE/DK,sv,"[Crime, Drama, Thriller]",A body is found on the bridge between Sweden a...,8.7
2,3,The Worst Person in the World,2021,NO,no,"[Drama, Romance]",A young woman navigates love and work in Oslo.,8.3
1,2,Force Majeure,2014,SE,sv,[Drama],A family vacation in the Alps takes an unexpec...,7.8
