In [76]:
import kagglehub
import pandas as pd
import os


# 1. Data loading
# Download the original dataset (extracted from TMDB) and read it into a DataFrame.
path = kagglehub.dataset_download("raveennimbiwal/top-rated-tv-shows-dataset-global-2025")
# os.listdir returns entries in arbitrary order and the folder may contain
# non-CSV files, so select the CSV explicitly instead of trusting files[0].
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset folder: {path}")
df = pd.read_csv(os.path.join(path, files[0]))
df.head()

Unnamed: 0,id,title,original_title,overview,premiere_date,popularity,genre,country_origin,original_language,rating,votes
0,1,Breaking Bad,Breaking Bad,"Walter White, a New Mexico chemistry teacher, ...",2008-01-20,108.782,"Drama, Crime",United States,English,8.9,16556
1,2,Avatar: The Last Airbender,Avatar: The Last Airbender,"In a war-torn world of elemental magic, a youn...",2005-02-21,12.5347,"Animation, Action & Adventure, Sci-Fi & Fantasy",United States,English,8.8,4557
2,3,Arcane,Arcane,Amid the stark discord of twin cities Piltover...,2021-11-06,22.5739,"Animation, Sci-Fi & Fantasy, Drama, Action & A...",United States,English,8.8,5481
3,4,When Life Gives You Tangerines,폭싹 속았수다,"In Jeju, a spirited girl and a steadfast boy's...",2025-03-07,18.5771,Drama,South Korea,Korean,8.751,423
4,5,Frieren: Beyond Journey's End,葬送のフリーレン,Decades after her party defeated the Demon Kin...,2023-09-29,28.3761,"Animation, Action & Adventure, Drama, Sci-Fi &...",Japan,Japanese,8.735,565


In [77]:
# Inspect column dtypes and non-null counts to spot columns needing cleaning or type conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2000 non-null   int64  
 1   title              2000 non-null   object 
 2   original_title     2000 non-null   object 
 3   overview           1980 non-null   object 
 4   premiere_date      2000 non-null   object 
 5   popularity         2000 non-null   float64
 6   genre              2000 non-null   object 
 7   country_origin     1999 non-null   object 
 8   original_language  2000 non-null   object 
 9   rating             2000 non-null   float64
 10  votes              2000 non-null   int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 172.0+ KB


In [78]:
# Count missing values per column to decide what has to be imputed
df.isna().sum()

id                    0
title                 0
original_title        0
overview             20
premiere_date         0
popularity            0
genre                 0
country_origin        1
original_language     0
rating                0
votes                 0
dtype: int64

In [82]:
# 2. Data cleaning and imputation

# Drop repeated records: first by the series' unique ID, then rows whose
# content is identical even though the ID differs.
df = df.drop_duplicates(subset='id')
cols_sin_id = df.columns.drop('id')
df = df.drop_duplicates(subset=cols_sin_id, keep='first')
# Sanity check: a bare expression in the middle of a cell is neither displayed
# nor enforced, so state the invariant explicitly.
assert not df.duplicated(subset='id').any(), "duplicate ids remain after de-duplication"
# Manually impute the missing country so the record is not lost in the
# geographic analysis. Only fill the value where it is actually missing, so an
# existing country is never overwritten.
falta_pais = (df['title'] == 'Los heroes del norte') & df['country_origin'].isna()
df.loc[falta_pais, 'country_origin'] = 'Mexico'
# Convert the premiere date to datetime (unparseable values become NaT) and
# derive the premiere year from it.
df['premiere_date'] = pd.to_datetime(df['premiere_date'], errors='coerce')
df['year'] = df['premiere_date'].dt.year
# Force the numeric columns to numeric dtypes (unparseable values become NaN).
num_cols = ['rating', 'votes', 'popularity']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Review how often each genre appears

# Turn the comma-separated genre string into one row per (show, genre) pair,
# then trim the whitespace left around each genre name by the split.
df_genres = (
    df.assign(genre=df['genre'].str.split(','))
    .explode('genre')
    .assign(genre=lambda exploded: exploded['genre'].str.strip())
)
genre_counts = df_genres['genre'].value_counts()
genre_counts


genre
Drama                 1261
Comedy                 743
Sci-Fi & Fantasy       680
Action & Adventure     593
Animation              586
Mystery                353
Crime                  353
Family                 194
Kids                   152
Soap                    69
Documentary             55
War & Politics          44
Reality                 20
Western                 20
Talk                     6
Romance                  2
News                     2
History                  1
Name: count, dtype: int64

In [None]:
# 3. Genre prioritization logic
# A priority order is defined so each show gets a single primary genre,
# avoiding duplicated rows for multi-genre shows.
priority_list = [
    'Kids',
    'Animation',
    'Family',
    'Sci-Fi & Fantasy',
    'Comedy',
    'Crime',
    'Drama'
]
def definir_genero_principal(genre_string):
    """Return the highest-priority genre present in a comma-separated genre string.

    Args:
        genre_string: Genres joined by commas, e.g. "Drama, Crime". Anything
            that is not a string (NaN, None) is treated as having no genres.

    Returns:
        The first entry of ``priority_list`` that appears among the listed
        genres, or ``'other'`` when none of them does.
    """
    # Missing values reach this function as float NaN / None; `in` on those
    # would raise TypeError, so classify them as 'other' instead.
    if not isinstance(genre_string, str):
        return 'other'
    # Compare against whole genre names rather than substrings of the raw text.
    generos = {nombre.strip() for nombre in genre_string.split(',')}
    for prioridad in priority_list:
        if prioridad in generos:
            return prioridad
    return 'other'
# Derive one primary genre per show by applying the priority rule to its genre string
df['primary_genre'] = df['genre'].apply(definir_genero_principal)

In [None]:
# 4. Export of the processed dataset
# A dot is used as the decimal separator to stay compatible with the
# Power BI configuration; the DataFrame index is not written.
export_path = "tv_series_clean.csv"
df.to_csv(export_path, index=False, decimal='.')