In [None]:
import re

import pdfplumber
import pandas as pd
import numpy as np

from scripts.tolookandcompare_v2 import transform_info, transform_headtail
from scripts.soporte_eda import resumen_df

In [None]:

with pdfplumber.open("data/raw/World_Poll_Dataset_Details.pdf") as pdf:
    print(f"El PDF tiene {len(pdf.pages)} páginas.")


El PDF tiene 244 páginas.


In [12]:
def extraer_filas_de_texto(texto):
    # Insertamos el 'patrón' para capturar filas que empiezan con un año

    patron = r"(\d{4}-\d{4}\s+\w.+?)(?=\n\d{4}-\d{4}|$)"
    filas = re.findall(patron, texto, re.DOTALL)

    # Limpiamos cada fila: se dividen en columnas usando espacios múltiples
    tablas = []
    
    for fila in filas:
        columnas = re.split(r"\s{2,}", fila.strip())
        tablas.append(columnas)
    return tablas


In [None]:
def extraer_tablas_por_bloques(ruta_pdf, paginas_por_bloque=20):
    with pdfplumber.open(ruta_pdf) as pdf:
        total_paginas = len(pdf.pages)
        all_tables = []

        for i in range(0, total_paginas, paginas_por_bloque):
            for j in range(i, min(i + paginas_por_bloque, total_paginas)):
                page = pdf.pages[j]
                table = page.extract_table()
                if table:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    all_tables.append(df)

        df = pd.concat(all_tables, ignore_index=True)
        return df

df = extraer_tablas_por_bloques("files/World_Poll_Dataset_Details.pdf", paginas_por_bloque=20)


# Guardamos CSV
df.to_csv("data/processed/gallup-interviews-info-2005-2025.csv", index=False)
print(f"Total de filas extraídas: {len(df)}")


Total de filas extraídas: 2730


In [16]:
df.head()

Unnamed: 0,Data\nCollection\nYear,Country,Data Collection Date,Number of\nInterviews,Design\nEffecta,Margin\nof Errorb,Mode of Interviewing,Languages,Over-\nSamplec
0,2005-2006,Albania,"Jan, 2007",981,1.61,4.0,Face-to-Face,Albanian,
1,2005-2006,Argentina,"May, 2006",1000,1.12,3.3,Face-to-Face,Spanish,
2,2005-2006,Armenia,"Jul, 2006",1000,1.06,3.2,Face-to-Face,"Armenian, Russian",
3,2005-2006,Australia,"Dec, 2005",1001,1.3,3.5,Landline Telephone,English,
4,2005-2006,Austria,"April, 2006",1004,1.4,3.7,Landline Telephone,German,


In [15]:
df.tail()

Unnamed: 0,Data\nCollection\nYear,Country,Data Collection Date,Number of\nInterviews,Design\nEffecta,Margin\nof Errorb,Mode of Interviewing,Languages,Over-\nSamplec
2725,2025,Norway,"Apr 28 - Jun 13, 2025",1000,1.98,4.4,Mobile Telephone,Norwegian,
2726,2025,Slovenia,"Mar 27 - May 21, 2025",1030,1.63,3.9,Landline and Mobile\nTelephone,Slovene,
2727,2025,Sweden,"Apr 22 - Jun 8, 2025",1001,1.72,4.1,Mobile Telephone,Swedish,
2728,2025,"Taiwan,\nProvince of\nChina","May 16 - Jun 13, 2025",1000,1.64,4.0,Landline and Mobile\nTelephone,Chinese,
2729,2025,Ukraine,"Jul 1 - Jul 14, 2025",1000,1.36,3.6,Mobile Telephone,"Russian, Ukrainian",


In [17]:
nuevos_nombres = [
    "Data Collection Year",
    "Country",
    "Data Collection Date",
    "Number of Interviews",
    "Design Effect",
    "Margin of Error",
    "Mode of Interviewing",
    "Languages",
    "Over-Sample"
]

df.columns = nuevos_nombres[:len(df.columns)]

In [18]:
df.head()

Unnamed: 0,Data Collection Year,Country,Data Collection Date,Number of Interviews,Design Effect,Margin of Error,Mode of Interviewing,Languages,Over-Sample
0,2005-2006,Albania,"Jan, 2007",981,1.61,4.0,Face-to-Face,Albanian,
1,2005-2006,Argentina,"May, 2006",1000,1.12,3.3,Face-to-Face,Spanish,
2,2005-2006,Armenia,"Jul, 2006",1000,1.06,3.2,Face-to-Face,"Armenian, Russian",
3,2005-2006,Australia,"Dec, 2005",1001,1.3,3.5,Landline Telephone,English,
4,2005-2006,Austria,"April, 2006",1004,1.4,3.7,Landline Telephone,German,


In [19]:
resumen_df(df)


Forma del DataFrame: (2730, 9)

Tipos de datos:
Data Collection Year    object
Country                 object
Data Collection Date    object
Number of Interviews    object
Design Effect           object
Margin of Error         object
Mode of Interviewing    object
Languages               object
Over-Sample             object
dtype: object

Valores nulos:
Data Collection Year    0
Country                 0
Data Collection Date    0
Number of Interviews    0
Design Effect           0
Margin of Error         0
Mode of Interviewing    0
Languages               0
Over-Sample             0
dtype: int64

Valores duplicados: 0

⚠️ No hay columnas numéricas en el DataFrame.

Resumen estadístico (categóricas):


Unnamed: 0,count,unique,top,freq
Data Collection Year,2730,20,2012,195
Country,2730,482,Ukraine,20
Data Collection Date,2730,2480,"Jul, 2007",22
Number of Interviews,2730,258,1000,1587
Design Effect,2730,186,1.36,69
Margin of Error,2730,67,3.7,414
Mode of Interviewing,2730,26,Face-to-Face,980
Languages,2730,280,Spanish,333
Over-Sample,2730,54,,2593


In [20]:
df['Over-Sample'].unique()

array(['', 'Urban', 'Toronto', 'Shanghai,\nBeijing,\nGuangzhou', 'Berlin',
       'Jakarta', 'Tehran',
       'Urban areas\nand ARMM\n(autonomous\nregion of\nMuslim\nMindanao)',
       'Bangkok', 'Dhaka', 'Beijing,\nShanghai,\nGuangzhou', 'Paris',
       'Kuala\nLumpur', 'Ulan Bator', 'Karachi,\nUrban',
       'Urban,\nDagestan,\nTatarstan\n(Muslim\nregions)', 'London',
       'Tbilisi', 'Nairobi', 'Albanians in\nNorthwest\nand Skojpe',
       'Urban\noversample', 'East\nHerzegovina\nand West\nHerzegovina',
       'Serbs in\nSerbian\nNorth and\nSerbian\nEnklaves',
       'Albanians in\nNorthwest\nand Skopje',
       'Albanians in\nNorth, Cost\nand\nPodgorica', 'Muslims in\nSandzak',
       'Croats in\nEast and\nWest\nHerzegovina',
       'Serbs in\nSerbian\nNorth and\nSerbian\nenclaves', 'Vienna',
       'Brussels', 'Sofia', 'Prague', 'Copenhagen', 'Helsinki',
       'Paris City', 'Budapest', 'Dublin City', 'Rome',
       'Albanians in\nNorthwest', 'Amsterdam', 'Warsaw', 'Lisbon',
    

### Limpiamos las columnas a partir de aquí

In [22]:
if "Over-Sample" in df.columns:
    df["Over-Sample"] = (
        df["Over-Sample"]
        .astype(str)
        .str.replace(r"\n", " ")
        .str.replace(r"\\n", " ")
        .str.strip()
    )

    # Creamos una función de limpieza para todas las demás
    def limpiar_caracteres_extra(valor):
        valor = re.sub(r'\s+', ' ', valor)
        return valor

    df["Over-Sample"] = df["Over-Sample"].apply(limpiar_caracteres_extra)

# Verificar los valores únicos en la columna "Over-Sample"
print(df["Over-Sample"].unique())



['' 'Urban' 'Toronto' 'Shanghai, Beijing, Guangzhou' 'Berlin' 'Jakarta'
 'Tehran' 'Urban areas and ARMM (autonomous region of Muslim Mindanao)'
 'Bangkok' 'Dhaka' 'Beijing, Shanghai, Guangzhou' 'Paris' 'Kuala Lumpur'
 'Ulan Bator' 'Karachi, Urban'
 'Urban, Dagestan, Tatarstan (Muslim regions)' 'London' 'Tbilisi'
 'Nairobi' 'Albanians in Northwest and Skojpe' 'Urban oversample'
 'East Herzegovina and West Herzegovina'
 'Serbs in Serbian North and Serbian Enklaves'
 'Albanians in Northwest and Skopje'
 'Albanians in North, Cost and Podgorica' 'Muslims in Sandzak'
 'Croats in East and West Herzegovina'
 'Serbs in Serbian North and Serbian enclaves' 'Vienna' 'Brussels' 'Sofia'
 'Prague' 'Copenhagen' 'Helsinki' 'Paris City' 'Budapest' 'Dublin City'
 'Rome' 'Albanians in Northwest' 'Amsterdam' 'Warsaw' 'Lisbon' 'Bucharest'
 'Bratislava' 'Ljubljana' 'Madrid' 'Stockholm' 'Istanbul'
 'Serbs in Serbian North Kosovo and Serbian Enclaves'
 'Beijing, Shanghai and Guangzhou'
 'Disproportion ate allo

In [23]:
df['Over-Sample'].unique()

array(['', 'Urban', 'Toronto', 'Shanghai, Beijing, Guangzhou', 'Berlin',
       'Jakarta', 'Tehran',
       'Urban areas and ARMM (autonomous region of Muslim Mindanao)',
       'Bangkok', 'Dhaka', 'Beijing, Shanghai, Guangzhou', 'Paris',
       'Kuala Lumpur', 'Ulan Bator', 'Karachi, Urban',
       'Urban, Dagestan, Tatarstan (Muslim regions)', 'London', 'Tbilisi',
       'Nairobi', 'Albanians in Northwest and Skojpe', 'Urban oversample',
       'East Herzegovina and West Herzegovina',
       'Serbs in Serbian North and Serbian Enklaves',
       'Albanians in Northwest and Skopje',
       'Albanians in North, Cost and Podgorica', 'Muslims in Sandzak',
       'Croats in East and West Herzegovina',
       'Serbs in Serbian North and Serbian enclaves', 'Vienna',
       'Brussels', 'Sofia', 'Prague', 'Copenhagen', 'Helsinki',
       'Paris City', 'Budapest', 'Dublin City', 'Rome',
       'Albanians in Northwest', 'Amsterdam', 'Warsaw', 'Lisbon',
       'Bucharest', 'Bratislava', 'Ljubljan

In [24]:
df["Country"] = df["Country"].apply(limpiar_caracteres_extra)

In [25]:
df['Country'].unique()

array(['Albania', 'Argentina', 'Armenia', 'Australia', 'Austria',
       'Azerbaijan', 'Bangladesh', 'Belarus', 'Belgium', 'Benin',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada',
       'Chad', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Croatia',
       'Cuba', 'Cyprus', 'Czech Republic', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia',
       'Finland', 'France', 'Georgia', 'Germany', 'Ghana', 'Greece',
       'Guatemala', 'Haiti', 'Honduras', 'Hong Kong, S.A.R. of China',
       'Hungary', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel',
       'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya',
       'Kosovo', 'Kuwait', 'Kyrgyzstan',
       "Lao People's Democratic Republic", 'Latvia', 'Lebanon',
       'Lithuania', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia',
       'Mali', 'Mexico', 'Moldova (the Republic of)', 'Montenegro',
       'Mozambi

In [26]:

df["Design Effect"] = df["Design Effect"].apply(limpiar_caracteres_extra)
df["Design Effect"].unique()

array(['1.61', '1.12', '1.06', '1.30', '1.40', '1.14', '1.11', '1.04',
       '1.50', '1.51', '1.01', '1.49', '1.20', '1.31', '1.03', '2.03',
       '1.42', '1.74', '2.17', '1.33', '1.92', '1.09', '1.18', '1.07',
       '1.17', '1.35', '1.72', '1.23', '1.19', '1.38', '1.45', '1.28',
       '1.10', '3.51', '1.29', '1.00', '1.15', '1.70', '2.88', '1.37',
       '1.73', '1.26', '2.08', '1.13', '1.02', '1.25', '2.33', '1.55',
       '1.16', '1.05', '1.43', '1.32', '1.21', '1.64', '2.58', '2.15',
       '2.07', '1.47', '1.56', '1.24', '2.71', '2.70', '1.62', '1.82',
       '1.66', '2.06', '2.05', '1.22', '3.71', '1.59', '1.27', '2.98',
       '1.78', '1.76', '1.08', '1.79', '1.93', '1.36', '1.46', '1.69',
       '1.68', '2.04', '2.99', '3.23', '2.20', '1.60', '1.54', '1.65',
       '1.34', '2.51', '1.41', '2.10', '2.45', '1.53', '1.39', '1.85',
       '1.52', '2.02', '2.11', '1.81', '1.58', '2.37', '1.71', '1.48',
       '1.44', '3.02', '1.80', '1.67', '1.77', '2.14', '1.98', '2.61',
      

In [27]:
df["Languages"].unique()

array(['Albanian', 'Spanish', 'Armenian, Russian', 'English', 'German',
       'Azeri, Russian', 'Bengali', 'Russian,\nBelarusian',
       'Dutch, French', 'French, Fon,\nBariba',
       'Bosnian, Croatian,\nSerbian', 'English, Setswana', 'Portuguese',
       'French, Moore,\nDioula, Fulfulde', 'French, Kirundi', 'Khmer',
       'French, English,\nFulfulde', 'English, French',
       'Chadian Arabic,\nFrench, Ngambaye', 'Chinese', 'Croatian',
       'Greek', 'Czech', 'Danish', 'Arabic', 'Estonian, Russian',
       'Finnish', 'French', 'Georgian,\nRussian, Armenian',
       'English, Hausa,\nEwe, Twi', 'Creole', 'Hungarian',
       'English, Hindi,\nTamil, Kannada,\nTelugu, Marathi,\nGujarati, Bengali,\nMalayalam',
       'Bahasa Indonesia', 'Farsi', 'Hebrew, Arabic,\nRussian', 'Italian',
       'Japanese', 'Kazakh, Russian', 'English, Swahili',
       'Albanian, Serbian', 'Kyrgyz, Russian,\nUzbek', 'Lao',
       'Latvian, Russian', 'Lithuanian', 'Macedonian,\nAlbanian',
       'French,

In [28]:
df["Languages"] = df["Languages"].apply(limpiar_caracteres_extra)

In [29]:
df["Margin of Error"] = df["Margin of Error"].apply(limpiar_caracteres_extra)
df["Margin of Error"].unique()

array(['4.0', '3.3', '3.2', '3.5', '3.7', '3.0', '3.8', '3.1', '2.7',
       '3.4', '4.4', '4.1', '2.9', '4.6', '3.6', '2.2', '5.8', '4.5',
       '4.7', '4.3', '5.0', '3.9', '5.1', '4.2', '5.7', '5.3', '4.9',
       '2.1', '5.4', '2.3', '5.2', '2.6', '2.5', '4.8', '5.5', '2.8',
       '1.7', '2.3 4.0', '2.4', '2.0', '2.1 3.7', '1.6', '5.6', '1.5',
       '1.9', '1.4', 'Phone: 4.3 Total: 3.9', 'Phone: 4.5 Total: 4.2',
       'Phone: 4.0 Total: 3.6', 'Phone: 4.1 Total: 3.6',
       'Phone: 3.8 Total: 3.5', 'Phone: 4.6 Total: 4.0',
       'Phone: 4.2 Total: 3.7', 'Phone: 4.3 Total: 3.8',
       'Phone: 5.3 Total: 4.6', 'Phone: 4.9 Total: 4.4',
       'Phone: 4.1 Total: 3.8', 'Phone: 5.0 Total: 4.4',
       'Phone: 4.9 Total: 4.3', 'Phone: 4.3 Total: 4.0',
       'Phone: 4.5 Total: 4.0', 'Phone: 4.7 Total: 4.1',
       'Phone: 4.4 Total: 4.0', 'Phone: 4.2 Total: 3.9',
       'Phone: 4.0 Total: 3.7', 'Phone: 4.2 Total: 3.8',
       'Phone: 4.6 Total: 4.1'], dtype=object)

In [30]:
df["Mode of Interviewing"] = df["Mode of Interviewing"].apply(limpiar_caracteres_extra)
df["Mode of Interviewing"].unique()

array(['Face-to-Face', 'Landline Telephone',
       'Face-to-Face and Landline Telephone',
       'Landline and Mobile Telephone', 'Panel', 'Face-to-Face (HH)*',
       'Mobile Telephone', 'Face-to-Face, Landline Telephone',
       'Face-to-Face (Landline Telephone in Tripoli)',
       'Landline and Mobile Telephone, half RDD and half survey recontacts',
       'Landline Telephone and Face-to-Face', 'Mobile Telephone Only',
       'Face to Face', 'Face-to-Face, Telephone', 'Face to Face (HH)*',
       'Landline and Mobile Telephone and Face-to- Face (HH)*',
       'Face-to-Face (HH)* and Face-to-Face',
       'Face-to-Face (HH)* and PAPI',
       'Face-to-Face and Face- to-Face (HH)*',
       'Landline and Mobile Telephone / Web',
       'Mobile Telephone Telephone/Web', 'Web', 'Mobile Telephone / Web',
       'Landline and Mobile Telephone/Web',
       'Face-to-Face (HH)* Mobile Telephone'], dtype=object)

In [31]:
# Hacemos diccionario de reemplazos
reemplazos = {
    'Face-to-Face and Landline Telephone': 'Landline Telephone and Face-to-Face',
    'Face to Face': 'Face-to-Face',
    'Face-to-Face, Landline Telephone': 'Landline Telephone and Face-to-Face',
    'Face to Face (HH)*': 'Face-to-Face (HH)*',
    'Face-to-Face and Face- to-Face (HH)*': 'Face-to-Face (HH)* and Face-to-Face',
    'Landline and Mobile Telephone and Face-to- Face (HH)*': 'Landline, Mobile Telephone and Face-to-Face (HH)*',
    'Landline and Mobile Telephone / Web': 'Landline, Mobile Telephone and Web',
    'Mobile Telephone Telephone/Web': 'Mobile Telephone and Web',
    'Face-to-Face (HH)* Mobile Telephone': 'Face-to-Face (HH)* and Mobile Telephone'
}

# Aplicar los reemplazos
if "Mode of Interviewing" in df.columns:
    df["Mode of Interviewing"] = df["Mode of Interviewing"].replace(reemplazos)

# Verificamos los valores únicos después de los reemplazos
print(df["Mode of Interviewing"].unique())


['Face-to-Face' 'Landline Telephone' 'Landline Telephone and Face-to-Face'
 'Landline and Mobile Telephone' 'Panel' 'Face-to-Face (HH)*'
 'Mobile Telephone' 'Face-to-Face (Landline Telephone in Tripoli)'
 'Landline and Mobile Telephone, half RDD and half survey recontacts'
 'Mobile Telephone Only' 'Face-to-Face, Telephone'
 'Landline, Mobile Telephone and Face-to-Face (HH)*'
 'Face-to-Face (HH)* and Face-to-Face' 'Face-to-Face (HH)* and PAPI'
 'Landline, Mobile Telephone and Web' 'Mobile Telephone and Web' 'Web'
 'Mobile Telephone / Web' 'Landline and Mobile Telephone/Web'
 'Face-to-Face (HH)* and Mobile Telephone']


In [32]:
reemplazo = {
    'Mobile Telephone only' : 'Mobile Telephone'
}

# Aplicamos más reemplazos
if "Mode of Interviewing" in df.columns:
    df["Mode of Interviewing"] = df["Mode of Interviewing"].replace(reemplazos)

# Verificamos los valores únicos después de los reemplazos
df["Mode of Interviewing"].unique()


array(['Face-to-Face', 'Landline Telephone',
       'Landline Telephone and Face-to-Face',
       'Landline and Mobile Telephone', 'Panel', 'Face-to-Face (HH)*',
       'Mobile Telephone', 'Face-to-Face (Landline Telephone in Tripoli)',
       'Landline and Mobile Telephone, half RDD and half survey recontacts',
       'Mobile Telephone Only', 'Face-to-Face, Telephone',
       'Landline, Mobile Telephone and Face-to-Face (HH)*',
       'Face-to-Face (HH)* and Face-to-Face',
       'Face-to-Face (HH)* and PAPI',
       'Landline, Mobile Telephone and Web', 'Mobile Telephone and Web',
       'Web', 'Mobile Telephone / Web',
       'Landline and Mobile Telephone/Web',
       'Face-to-Face (HH)* and Mobile Telephone'], dtype=object)

In [33]:
df["Number of Interviews"].unique()

array(['981', '1,000', '1,001', '1,004', '1,048', '1,092', '1,003',
       '2,002', '1,029', '1,355', '1,007', '3,730', '1,002', '1,067',
       '999', '1,010', '1,021', '505', '800', '1,025', '2,100', '1,180',
       '1,300', '543', '1,046', '996', '1,015', '1,042', '1,012', '834',
       '1,028', '1,005', '1,200', '500', '1,022', '2,011', '1,504',
       '1,556', '1,095', '1,018', '1,009', '1,100', '1,033', '1,410',
       '508', '995', '1,102', '1,013', '1,037', '1,023', '1,205', '1,114',
       '502', '1,038', '4,238', '1,072', '1,061', '1,024', '1,220',
       '1,221', '501', '3,186', '1,008', '1,150', '1,016', '1,017',
       '1,233', '750', '1,502', '2,949', '1,006', '1,066', '1,204',
       '1,225', '1,020', '1,091', '1,032', '1,108', '4,383', '1,105',
       '601', '1,080', '1,011', '751', '2,000', '1,050', '1,040', '990',
       '2,200', '1,047', '513', '506', '804', '840', '2,019', '1,548',
       '1,209', '1,074', '1,051', '1,077', '1,031', '4,201', '607', '755',
       '3,

In [34]:
df["Number of Interviews"] = df["Number of Interviews"].apply(limpiar_caracteres_extra)
df["Number of Interviews"].unique()

array(['981', '1,000', '1,001', '1,004', '1,048', '1,092', '1,003',
       '2,002', '1,029', '1,355', '1,007', '3,730', '1,002', '1,067',
       '999', '1,010', '1,021', '505', '800', '1,025', '2,100', '1,180',
       '1,300', '543', '1,046', '996', '1,015', '1,042', '1,012', '834',
       '1,028', '1,005', '1,200', '500', '1,022', '2,011', '1,504',
       '1,556', '1,095', '1,018', '1,009', '1,100', '1,033', '1,410',
       '508', '995', '1,102', '1,013', '1,037', '1,023', '1,205', '1,114',
       '502', '1,038', '4,238', '1,072', '1,061', '1,024', '1,220',
       '1,221', '501', '3,186', '1,008', '1,150', '1,016', '1,017',
       '1,233', '750', '1,502', '2,949', '1,006', '1,066', '1,204',
       '1,225', '1,020', '1,091', '1,032', '1,108', '4,383', '1,105',
       '601', '1,080', '1,011', '751', '2,000', '1,050', '1,040', '990',
       '2,200', '1,047', '513', '506', '804', '840', '2,019', '1,548',
       '1,209', '1,074', '1,051', '1,077', '1,031', '4,201', '607', '755',
       '3,

In [35]:
df['Over-Sample'].unique()

array(['', 'Urban', 'Toronto', 'Shanghai, Beijing, Guangzhou', 'Berlin',
       'Jakarta', 'Tehran',
       'Urban areas and ARMM (autonomous region of Muslim Mindanao)',
       'Bangkok', 'Dhaka', 'Beijing, Shanghai, Guangzhou', 'Paris',
       'Kuala Lumpur', 'Ulan Bator', 'Karachi, Urban',
       'Urban, Dagestan, Tatarstan (Muslim regions)', 'London', 'Tbilisi',
       'Nairobi', 'Albanians in Northwest and Skojpe', 'Urban oversample',
       'East Herzegovina and West Herzegovina',
       'Serbs in Serbian North and Serbian Enklaves',
       'Albanians in Northwest and Skopje',
       'Albanians in North, Cost and Podgorica', 'Muslims in Sandzak',
       'Croats in East and West Herzegovina',
       'Serbs in Serbian North and Serbian enclaves', 'Vienna',
       'Brussels', 'Sofia', 'Prague', 'Copenhagen', 'Helsinki',
       'Paris City', 'Budapest', 'Dublin City', 'Rome',
       'Albanians in Northwest', 'Amsterdam', 'Warsaw', 'Lisbon',
       'Bucharest', 'Bratislava', 'Ljubljan

In [None]:
df.to_csv("data/processed/gallup-interviews-info-2005-2025.csv", index=False)

In [39]:
resumen_df(df)


Forma del DataFrame: (2730, 9)

Tipos de datos:
Data Collection Year    object
Country                 object
Data Collection Date    object
Number of Interviews    object
Design Effect           object
Margin of Error         object
Mode of Interviewing    object
Languages               object
Over-Sample             object
dtype: object

Valores nulos:
Data Collection Year    0
Country                 0
Data Collection Date    0
Number of Interviews    0
Design Effect           0
Margin of Error         0
Mode of Interviewing    0
Languages               0
Over-Sample             0
dtype: int64

Valores duplicados: 0

⚠️ No hay columnas numéricas en el DataFrame.

Resumen estadístico (categóricas):


Unnamed: 0,count,unique,top,freq
Data Collection Year,2730,20,2012,195
Country,2730,482,Ukraine,20
Data Collection Date,2730,2480,"Jul, 2007",22
Number of Interviews,2730,258,1000,1587
Design Effect,2730,186,1.36,69
Margin of Error,2730,67,3.7,414
Mode of Interviewing,2730,20,Face-to-Face,981
Languages,2730,274,Spanish,333
Over-Sample,2730,54,,2593
