# Análisis de Dominios de Estudios: Mapping Review IA y ML en Educación Matemática K-12

**MQ7: ¿En qué dominios se centran los estudios?**

Este notebook analiza los dominios específicos y áreas temáticas en las que se centran los estudios sobre IA y ML en educación matemática K-12.

## 1. Configuración del Entorno

In [None]:
# Instalación de dependencias\n
!pip install pandas numpy matplotlib seaborn plotly nltk wordcloud

In [None]:
# Importación de librerías\n
import re
import warnings
from collections import Counter
from itertools import combinations

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from plotly.subplots import make_subplots
from wordcloud import WordCloud
# Silence every warning so library notices do not clutter the cell output.
warnings.filterwarnings('ignore')

# Download each NLTK resource only when it cannot be found locally.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Plot styling shared by every figure in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 12})

# Display all columns and untruncated cell contents when showing DataFrames.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

## 2. Carga de Datos desde GitHub

In [None]:
# Load the dataset from GitHub.
# IMPORTANT: replace the placeholder URL with your actual repository.
url = "https://raw.githubusercontent.com/TU_USUARIO/TU_REPOSITORIO/main/MappingReview.csv"
df = pd.read_csv(url, sep=';', encoding='utf-8')

# Report the size of what was loaded, then preview the first rows.
row_count, column_count = df.shape
print(f"Dataset cargado: {row_count} filas y {column_count} columnas")
print("\nPrimeras 5 filas:")
df.head()

## 3. Análisis de Dominios de Estudios (MQ7)

In [None]:
# Study domains and the (lowercase) keywords that signal each of them.
study_domains = {
    'Intelligent Tutoring Systems': ['intelligent tutoring system', 'its', 'tutoring system', 'adaptive tutoring'],
    'Chatbots & Conversational AI': ['chatbot', 'chatbots', 'conversational ai', 'dialogue system', 'conversational agent'],
    'Predictive Analytics': ['predictive analytics', 'prediction', 'predictive modeling', 'forecasting'],
    'Learning Analytics': ['learning analytics', 'educational data mining', 'data mining', 'analytics'],
    'Computer Vision': ['computer vision', 'image recognition', 'visual recognition', 'image processing'],
    'Natural Language Processing': ['natural language processing', 'nlp', 'text analysis', 'language processing'],
    'Personalized Learning': ['personalized learning', 'adaptive learning', 'individualized learning', 'customized learning'],
    'Assessment & Evaluation': ['assessment', 'evaluation', 'testing', 'measurement', 'performance evaluation'],
    'Gamification': ['gamification', 'game-based learning', 'serious games', 'educational games'],
    'Virtual/Augmented Reality': ['virtual reality', 'augmented reality', 'vr', 'ar', 'immersive learning'],
    'Robotics': ['robotics', 'educational robots', 'robot', 'robotic'],
    'Neural Networks': ['neural network', 'neural networks', 'deep learning', 'artificial neural network'],
    'Machine Learning': ['machine learning', 'ml', 'supervised learning', 'unsupervised learning'],
    'Data Science': ['data science', 'big data', 'data analysis', 'statistical analysis'],
    'STEM Education': ['stem', 'science', 'technology', 'engineering', 'mathematics']
}

# Keywords that are acronyms. They are matched in upper case against the
# original text, so that e.g. the pronoun "its" is not read as "ITS" and the
# verb "stem" is not read as "STEM".
_ACRONYM_KEYWORDS = frozenset({'its', 'nlp', 'vr', 'ar', 'ml', 'stem'})


def _build_keyword_pattern(keywords, flags=0):
    """Compile one regex matching any of *keywords* as a whole word or phrase.

    An optional trailing "s" is accepted so simple plurals ("robots") still
    match. Returns None when *keywords* is empty.
    """
    if not keywords:
        return None
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    return re.compile(rf'\b(?:{alternatives})s?\b', flags)


def _compile_domain_patterns(domains):
    """Map each domain name to the tuple of compiled patterns that detect it."""
    compiled = {}
    for domain, keywords in domains.items():
        acronyms = [kw.upper() for kw in keywords if kw in _ACRONYM_KEYWORDS]
        phrases = [kw for kw in keywords if kw not in _ACRONYM_KEYWORDS]
        candidates = (
            _build_keyword_pattern(phrases, re.IGNORECASE),
            _build_keyword_pattern(acronyms),  # case-sensitive on purpose
        )
        compiled[domain] = tuple(p for p in candidates if p is not None)
    return compiled


# Built once from study_domains; re-run this cell after editing the dictionary.
_DOMAIN_PATTERNS = _compile_domain_patterns(study_domains)


def identify_domains(text):
    """Return the study domains whose keywords occur in *text*.

    Keywords are matched as whole words/phrases rather than raw substrings, so
    short keywords such as 'ar' or 'its' no longer fire inside unrelated words
    ("learning", "limits"). Phrases are matched case-insensitively; acronyms
    must appear in upper case.

    Args:
        text: Title or abstract; any value for which ``pd.isna`` is true is
            treated as "no text".

    Returns:
        A list with each matching domain name once, in the order the domains
        are defined in ``study_domains``. Empty when nothing matches.
    """
    if pd.isna(text):
        return []

    text = str(text)
    return [
        domain
        for domain, patterns in _DOMAIN_PATTERNS.items()
        if any(pattern.search(text) for pattern in patterns)
    ]
# Tag every publication with the domains detected in its title and abstract.
for source_column in ('Title', 'Abstract'):
    df[f'{source_column}_Domains'] = df[source_column].apply(identify_domains)

print("Identificación de dominios completada")

# Number of rows whose domain list is non-empty, per text field.
titles_with_domains = int(df['Title_Domains'].map(len).gt(0).sum())
abstracts_with_domains = int(df['Abstract_Domains'].map(len).gt(0).sum())
print(f"Publicaciones con dominios en títulos: {titles_with_domains}")
print(f"Publicaciones con dominios en abstracts: {abstracts_with_domains}")

In [None]:
# Flatten the per-publication domain lists into one list per text field.
all_title_domains = [domain for domains in df['Title_Domains'] for domain in domains]
all_abstract_domains = [domain for domains in df['Abstract_Domains'] for domain in domains]

# Count how often each domain was detected.
title_domain_counts = Counter(all_title_domains)
abstract_domain_counts = Counter(all_abstract_domains)

# Print the ten most frequent domains for titles, then for abstracts.
for heading, counts in (
    ("=== DOMINIOS MÁS FRECUENTES EN TÍTULOS ===", title_domain_counts),
    ("\n=== DOMINIOS MÁS FRECUENTES EN ABSTRACTS ===", abstract_domain_counts),
):
    print(heading)
    for domain, count in counts.most_common(10):
        print(f"{domain}: {count} apariciones")

In [None]:
def _draw_domain_bars(ax, counts, title, color):
    """Draw a labelled bar chart of *counts* (domain -> frequency) on *ax*.

    Returns the bar container created by ``ax.bar``.
    """
    positions = range(len(counts))
    bars = ax.bar(positions, counts.values(), color=color, alpha=0.7)
    ax.set_xlabel('Dominios de Estudio', fontsize=14)
    ax.set_ylabel('Frecuencia', fontsize=14)
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.keys(), rotation=45, ha='right')
    ax.grid(True, alpha=0.3, axis='y')

    # Write each frequency just above its bar.
    for position, count in enumerate(counts.values()):
        ax.text(position, count + 0.5, str(count), ha='center', va='bottom', fontweight='bold')
    return bars


# Side-by-side bar charts: top 8 domains in titles (left) and abstracts (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

top_title_domains = dict(title_domain_counts.most_common(8))
bars1 = _draw_domain_bars(ax1, top_title_domains, 'Dominios Más Frecuentes en Títulos', 'lightblue')

top_abstract_domains = dict(abstract_domain_counts.most_common(8))
bars2 = _draw_domain_bars(ax2, top_abstract_domains, 'Dominios Más Frecuentes en Abstracts', 'lightcoral')

plt.tight_layout()
plt.show()

## 4. Análisis de Combinaciones de Dominios

In [None]:
# Domain combinations: one label per publication, made of every domain found
# in its title or abstract.
all_domains = []
for title_domains, abstract_domains in zip(df['Title_Domains'], df['Abstract_Domains']):
    # Merge as a set: a domain detected in both the title and the abstract
    # must appear only once in the label (otherwise "A+A" and "A" would be
    # counted as different combinations).
    publication_domains = set(title_domains) | set(abstract_domains)
    if publication_domains:
        all_domains.append('+'.join(sorted(publication_domains)))

domain_combination_counts = Counter(all_domains)

print("=== COMBINACIONES DE DOMINIOS MÁS FRECUENTES ===")
for combination, count in domain_combination_counts.most_common(15):
    print(f"{combination}: {count} publicaciones")

# Bar chart of the ten most frequent combinations.
top_combinations = dict(domain_combination_counts.most_common(10))

plt.figure(figsize=(15, 8))
bars = plt.bar(range(len(top_combinations)), top_combinations.values(), color='lightgreen', alpha=0.7)
plt.xlabel('Combinación de Dominios', fontsize=14)
plt.ylabel('Número de Publicaciones', fontsize=14)
plt.title('Combinaciones de Dominios Más Frecuentes', fontsize=16, fontweight='bold')
plt.xticks(range(len(top_combinations)), list(top_combinations.keys()), rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')

# Write each count just above its bar.
for position, count in enumerate(top_combinations.values()):
    plt.text(position, count + 0.5, str(count), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Análisis Temporal por Dominio

In [None]:
# Convert Year to numeric; values that cannot be parsed become NaN.
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Domain frequencies per publication year (title and abstract hits are both
# counted, so a domain found in both fields counts twice for that publication).
temporal_domains = {}

# Drop missing years *before* sorting: NaN compares false against every value,
# which would leave sorted() in an unspecified order.
for year in sorted(df['Year'].dropna().unique()):
    year_data = df[df['Year'] == year]
    year_domains = []

    for domain_column in ('Title_Domains', 'Abstract_Domains'):
        for domains in year_data[domain_column]:
            year_domains.extend(domains)

    # Key by int so the year is shown as "2020" instead of "2020.0" when the
    # column was coerced to float.
    temporal_domains[int(year)] = Counter(year_domains)

print("=== EVOLUCIÓN TEMPORAL DE DOMINIOS ===")
for year, domain_counts in temporal_domains.items():
    print(f"\nAño {year}:")
    for domain, count in domain_counts.most_common(5):
        print(f"  {domain}: {count} apariciones")

In [None]:
# Line chart: yearly frequency of the main domains.
main_domains = ['Machine Learning', 'Intelligent Tutoring Systems', 'Personalized Learning', 'Assessment & Evaluation', 'Learning Analytics']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

# The x axis is the same for every domain: all years seen, in ascending order.
years = sorted(temporal_domains)

plt.figure(figsize=(15, 8))
for domain, line_color in zip(main_domains, colors):
    # A domain that was not detected in a given year is plotted as 0.
    domain_counts = [temporal_domains[year].get(domain, 0) for year in years]
    plt.plot(years, domain_counts, marker='o', linewidth=2, label=domain, color=line_color)

plt.title('Evolución Temporal de Dominios Principales', fontsize=16, fontweight='bold')
plt.xlabel('Año', fontsize=14)
plt.ylabel('Frecuencia', fontsize=14)
plt.legend(title='Dominios', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Análisis de Dominios por Tipo de Publicación

In [None]:
# Domain frequencies per publication type (title and abstract hits combined).
domain_by_type = {}

# Skip missing publication types, as the temporal analysis does for missing
# years: comparing with NaN selects no rows and would only add an empty "nan"
# category to the report and the heatmap.
for pub_type in df['Type of Publication'].dropna().unique():
    type_data = df[df['Type of Publication'] == pub_type]
    type_domains = []

    for domain_column in ('Title_Domains', 'Abstract_Domains'):
        for domains in type_data[domain_column]:
            type_domains.extend(domains)

    domain_by_type[pub_type] = Counter(type_domains)

print("=== DOMINIOS POR TIPO DE PUBLICACIÓN ===")
for pub_type, domain_counts in domain_by_type.items():
    print(f"\n{pub_type}:")
    for domain, count in domain_counts.most_common(5):
        print(f"  {domain}: {count} apariciones")

# Heatmap: rows are domains, columns are publication types; a domain never
# seen for a type is shown as 0.
domain_type_matrix = pd.DataFrame(domain_by_type).fillna(0)

# Only plot when at least one domain was detected; there is nothing to draw
# from an empty matrix.
if not domain_type_matrix.empty:
    plt.figure(figsize=(12, 8))
    sns.heatmap(domain_type_matrix, annot=True, fmt='.0f', cmap='YlOrRd', cbar_kws={'label': 'Frecuencia'})
    plt.title('Dominios por Tipo de Publicación', fontsize=16, fontweight='bold')
    plt.xlabel('Tipo de Publicación', fontsize=14)
    plt.ylabel('Dominio de Estudio', fontsize=14)
    plt.tight_layout()
    plt.show()

## 7. Análisis de Dominios por Fuente

In [None]:
# Domain frequencies per source (title and abstract hits combined).
domain_by_source = {}

# Skip missing sources, as the temporal analysis does for missing years:
# comparing with NaN selects no rows and would only add an empty "nan" group.
for source in df['Source'].dropna().unique():
    source_data = df[df['Source'] == source]
    source_domains = []

    for domain_column in ('Title_Domains', 'Abstract_Domains'):
        for domains in source_data[domain_column]:
            source_domains.extend(domains)

    domain_by_source[source] = Counter(source_domains)

print("=== DOMINIOS POR FUENTE ===")
for source, domain_counts in domain_by_source.items():
    print(f"\n{source}:")
    for domain, count in domain_counts.most_common(5):
        print(f"  {domain}: {count} apariciones")

# Grouped bar chart restricted to a fixed set of domains.
top_domains = ['Machine Learning', 'Intelligent Tutoring Systems', 'Personalized Learning', 'Assessment & Evaluation']

# Long-format rows (one per source/domain pair); iterating the dictionary
# keeps the plotted sources identical to the ones reported above.
domain_source_data = [
    {'Source': source, 'Domain': domain, 'Count': domain_counts.get(domain, 0)}
    for source, domain_counts in domain_by_source.items()
    for domain in top_domains
]

domain_source_df = pd.DataFrame(domain_source_data)

# Without any source there are no 'Source'/'Count' columns to plot.
if not domain_source_df.empty:
    fig = px.bar(domain_source_df, x='Source', y='Count', color='Domain',
                 title='Dominios por Fuente de Publicación',
                 barmode='group')

    fig.update_layout(
        title_font_size=16,
        xaxis_title='Fuente',
        yaxis_title='Frecuencia'
    )

    fig.show()

## 8. Análisis de Co-ocurrencia de Dominios

In [None]:
# Co-occurrence of domains: how many publications mention each pair of
# domains (in the title, the abstract, or both).
domain_co_occurrence = Counter()

for title_domains, abstract_domains in zip(df['Title_Domains'], df['Abstract_Domains']):
    # De-duplicate per publication: a domain present in both fields must not
    # be paired with itself, and each pair counts once per publication.
    publication_domains = sorted(set(title_domains) | set(abstract_domains))
    # combinations() over the sorted list yields every unordered pair exactly
    # once, with its two members already in alphabetical order.
    domain_co_occurrence.update(combinations(publication_domains, 2))

# Top co-occurrences as a list of ((domain1, domain2), count) tuples.
top_domain_co_occurrences = domain_co_occurrence.most_common(15)

print("=== TOP 15 CO-OCURRENCIAS DE DOMINIOS ===")
for rank, ((domain1, domain2), count) in enumerate(top_domain_co_occurrences, 1):
    print(f"{rank:2d}. {domain1} + {domain2}: {count} co-ocurrencias")

# Horizontal bar chart of the top pairs (skipped when no pair was found).
if top_domain_co_occurrences:
    co_occurrence_df = pd.DataFrame(top_domain_co_occurrences, columns=['Domains', 'Count'])
    co_occurrence_df['Domain_Pair'] = [f"{d1} + {d2}" for (d1, d2) in co_occurrence_df['Domains']]

    plt.figure(figsize=(12, 8))
    bars = plt.barh(range(len(co_occurrence_df)), co_occurrence_df['Count'], color='lightblue', alpha=0.7)
    plt.yticks(range(len(co_occurrence_df)), co_occurrence_df['Domain_Pair'], fontsize=10)
    plt.xlabel('Número de Co-ocurrencias', fontsize=14)
    plt.title('Top 15 Co-ocurrencias de Dominios', fontsize=16, fontweight='bold')
    plt.grid(True, alpha=0.3, axis='x')

    # Write each count just to the right of its bar.
    for position, count in enumerate(co_occurrence_df['Count']):
        plt.text(count + 0.1, position, str(count), va='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

## 9. Resumen y Conclusiones

In [None]:
# Executive summary of the results computed in the previous sections.
print("=== RESUMEN EJECUTIVO ===\n")

print(f"📊 Total de dominios únicos identificados: {len(set(all_title_domains + all_abstract_domains))}")

# most_common(1) is an empty list when no domain was detected, so guard before
# indexing (same pattern as the combination / co-occurrence guards below).
if title_domain_counts:
    top_title_domain, top_title_count = title_domain_counts.most_common(1)[0]
    print(f"📝 Dominio más frecuente en títulos: {top_title_domain} ({top_title_count} apariciones)")
if abstract_domain_counts:
    top_abstract_domain, top_abstract_count = abstract_domain_counts.most_common(1)[0]
    print(f"📈 Dominio más frecuente en abstracts: {top_abstract_domain} ({top_abstract_count} apariciones)")

# Coverage: titles with at least one domain plus abstracts with at least one
# domain, over twice the number of publications, i.e. the average of the
# title coverage and the abstract coverage.
total_publications = len(df)
publications_with_domains = len(df[df['Title_Domains'].apply(len) > 0]) + len(df[df['Abstract_Domains'].apply(len) > 0])
if total_publications:
    coverage_percentage = (publications_with_domains / (total_publications * 2)) * 100
else:
    # Empty dataset: report 0% instead of dividing by zero.
    coverage_percentage = 0.0
print(f"🌐 Cobertura de dominios: {coverage_percentage:.1f}%")

# Most frequent domain combination.
if domain_combination_counts:
    best_combination = domain_combination_counts.most_common(1)[0]
    print(f"🏆 Mejor combinación de dominios: {best_combination[0]} ({best_combination[1]} publicaciones)")

# Most frequent co-occurring pair.
if top_domain_co_occurrences:
    most_common_co_occurrence = top_domain_co_occurrences[0]
    print(f"🔗 Co-ocurrencia más frecuente: {most_common_co_occurrence[0][0]} + {most_common_co_occurrence[0][1]} ({most_common_co_occurrence[1]} veces)")

# NOTE(review): the conclusions below are fixed text; they are not derived
# from the counts computed above and may not match the data — confirm them
# against the printed results before reporting.
print("\n=== CONCLUSIONES ===")
print("1. Los sistemas de tutoría inteligente dominan la investigación")
print("2. El aprendizaje personalizado es un área clave")
print("3. Hay una evolución temporal en los dominios de estudio")
print("4. Existen patrones de co-ocurrencia entre dominios")