In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🔍 04 - Insights Validation & Business Recommendations\n",
    "## Validação de Insights e Recomendações de Negócio - Voos Delhi-Mumbai\n",
    "\n",
    "**Objetivo**: Validar estatisticamente os insights descobertos e gerar recomendações acionáveis para o negócio.\n",
    "\n",
    "**Autor**: [Seu Nome]  \n",
    "**Data**: [AAAA-MM-DD]  \n",
    "**Versão**: 1.0\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup e Importações"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importações principais\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# Análise estatística\n",
    "from scipy import stats\n",
    "from scipy.stats import chi2_contingency, fisher_exact\n",
    "from statsmodels.stats.contingency_tables import mcnemar\n",
    "from statsmodels.stats.proportion import proportions_ztest\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.metrics import mean_absolute_error, r2_score\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Utilitários\n",
    "import json\n",
    "import os\n",
    "from datetime import datetime\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"Bibliotecas carregadas com sucesso!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Carregamento de Dados e Resultados Anteriores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the enriched flight data\n",
    "df = pd.read_csv('../data/processed/flights_with_statistical_analysis.csv')\n",
    "\n",
    "# Load the stored statistical results. The encoding is explicit so the read\n",
    "# does not depend on the platform's default locale encoding.\n",
    "with open('../data/processed/statistical_analysis_results.json', 'r', encoding='utf-8') as f:\n",
    "    stats_results = json.load(f)\n",
    "\n",
    "# Load the exploration summary\n",
    "with open('../data/processed/exploration_summary.json', 'r', encoding='utf-8') as f:\n",
    "    exploration_summary = json.load(f)\n",
    "\n",
    "print(f\"Dataset final: {df.shape[0]} linhas × {df.shape[1]} colunas\")\n",
    "print(f\"Clusters identificados: {stats_results['clustering_analysis']['optimal_clusters']}\")\n",
    "print(f\"Insights estatísticos disponíveis: {len(stats_results)} categorias\")\n",
    "\n",
    "# Basic integrity counters. 'data_types_consistency' holds the number of\n",
    "# object-dtype columns and 'numerical_columns' the number of numeric ones.\n",
    "data_quality = {\n",
    "    'missing_values': df.isnull().sum().sum(),\n",
    "    'duplicates': df.duplicated().sum(),\n",
    "    'data_types_consistency': len(df.select_dtypes(include=[object]).columns),\n",
    "    'numerical_columns': len(df.select_dtypes(include=[np.number]).columns)\n",
    "}\n",
    "\n",
    "print(f\"\\nQualidade dos dados:\")\n",
    "for key, value in data_quality.items():\n",
    "    print(f\"  {key}: {value}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Validação Estatística dos Principais Insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def validate_key_insights():\n",
    "    \"\"\"Statistically validate the four main pricing insights.\n",
    "\n",
    "    Works on the notebook-level DataFrame ``df`` (columns read: ``stops``,\n",
    "    ``price``, ``airline``, ``departure_time``, ``duration``). For every insight\n",
    "    it prints a short report and stores the metrics in the returned dict.\n",
    "\n",
    "    Returns:\n",
    "        dict: one entry per insight with the hypothesis text, the test\n",
    "        statistics, the effect size, and two plain ``bool`` flags\n",
    "        (``statistically_significant`` and ``practical_significance``).\n",
    "    \"\"\"\n",
    "\n",
    "    print(\"🔍 VALIDAÇÃO ESTATÍSTICA DOS PRINCIPAIS INSIGHTS\")\n",
    "    print(\"=\" * 60)\n",
    "\n",
    "    insights_validation = {}\n",
    "\n",
    "    # Quantities shared by both eta-squared computations. The total sum of\n",
    "    # squares is computed once and vectorised: a per-row Python loop that calls\n",
    "    # df['price'].mean() for every element is quadratic in the number of rows.\n",
    "    prices = df['price']\n",
    "    grand_mean = prices.mean()\n",
    "    ss_total = float(((prices - grand_mean) ** 2).sum())\n",
    "\n",
    "    def _eta_squared(groups):\n",
    "        # Share of the total price variance explained by the grouping\n",
    "        ss_between = sum(len(group) * (np.mean(group) - grand_mean) ** 2 for group in groups)\n",
    "        return float(ss_between / ss_total) if ss_total > 0 else float('nan')\n",
    "\n",
    "    # 1. Insight: direct flights are significantly more expensive\n",
    "    print(\"\\n1. VALIDAÇÃO: 'Voos diretos são mais caros que voos com parada'\")\n",
    "    direct_prices = df[df['stops'] == 'zero']['price']\n",
    "    stop_prices = df[df['stops'] == 'one']['price']\n",
    "\n",
    "    # Two-sample t-test, plus Cohen's d based on the pooled variance\n",
    "    _t_stat, p_value = stats.ttest_ind(direct_prices, stop_prices)\n",
    "    pooled_variance = (\n",
    "        (len(direct_prices) - 1) * direct_prices.var() + (len(stop_prices) - 1) * stop_prices.var()\n",
    "    ) / (len(direct_prices) + len(stop_prices) - 2)\n",
    "    effect_size = (direct_prices.mean() - stop_prices.mean()) / np.sqrt(pooled_variance)\n",
    "\n",
    "    price_premium = direct_prices.mean() - stop_prices.mean()\n",
    "    premium_pct = (price_premium / stop_prices.mean()) * 100\n",
    "\n",
    "    # Values are cast to built-in float/bool so the dict can be dumped to JSON\n",
    "    insights_validation['direct_flights_premium'] = {\n",
    "        'hypothesis': 'Voos diretos custam mais que voos com parada',\n",
    "        'p_value': float(p_value),\n",
    "        'effect_size': float(effect_size),\n",
    "        'price_difference': float(price_premium),\n",
    "        'percentage_premium': float(premium_pct),\n",
    "        'statistically_significant': bool(p_value < 0.05),\n",
    "        'practical_significance': bool(abs(effect_size) > 0.5)\n",
    "    }\n",
    "\n",
    "    print(f\"   Diferença média: ₹{price_premium:,.0f} ({premium_pct:.1f}%)\")\n",
    "    print(f\"   p-valor: {p_value:.6f}\")\n",
    "    print(f\"   Effect size (Cohen's d): {effect_size:.3f}\")\n",
    "    print(f\"   Estatisticamente significativo: {'✅ Sim' if p_value < 0.05 else '❌ Não'}\")\n",
    "    print(f\"   Praticamente relevante: {'✅ Sim' if abs(effect_size) > 0.5 else '❌ Não'}\")\n",
    "\n",
    "    # 2. Insight: airlines follow distinct pricing strategies\n",
    "    print(\"\\n2. VALIDAÇÃO: 'Airlines têm estratégias de preço significativamente diferentes'\")\n",
    "\n",
    "    # One-way ANOVA of price across airlines\n",
    "    airline_groups = [df[df['airline'] == airline]['price'].values for airline in df['airline'].unique()]\n",
    "    f_stat, f_p = stats.f_oneway(*airline_groups)\n",
    "\n",
    "    # Eta-squared is the effect size for the ANOVA\n",
    "    eta_squared = _eta_squared(airline_groups)\n",
    "\n",
    "    insights_validation['airline_pricing_strategies'] = {\n",
    "        'hypothesis': 'Airlines têm estratégias de preço distintas',\n",
    "        'f_statistic': float(f_stat),\n",
    "        'p_value': float(f_p),\n",
    "        'eta_squared': eta_squared,\n",
    "        'statistically_significant': bool(f_p < 0.05),\n",
    "        'practical_significance': bool(eta_squared > 0.14)  # Large effect\n",
    "    }\n",
    "\n",
    "    print(f\"   F-estatística: {f_stat:.4f}\")\n",
    "    print(f\"   p-valor: {f_p:.6f}\")\n",
    "    print(f\"   Eta-squared: {eta_squared:.3f}\")\n",
    "    print(f\"   Estatisticamente significativo: {'✅ Sim' if f_p < 0.05 else '❌ Não'}\")\n",
    "    print(f\"   Praticamente relevante: {'✅ Sim' if eta_squared > 0.14 else '❌ Não'}\")\n",
    "\n",
    "    # 3. Insight: departure time has a significant influence on price\n",
    "    print(\"\\n3. VALIDAÇÃO: 'Horários de partida influenciam os preços'\")\n",
    "\n",
    "    time_groups = [df[df['departure_time'] == time]['price'].values for time in df['departure_time'].unique()]\n",
    "    f_time, f_time_p = stats.f_oneway(*time_groups)\n",
    "\n",
    "    # Effect size for the departure-time grouping\n",
    "    eta_squared_time = _eta_squared(time_groups)\n",
    "\n",
    "    insights_validation['departure_time_effect'] = {\n",
    "        'hypothesis': 'Horários de partida afetam significativamente os preços',\n",
    "        'f_statistic': float(f_time),\n",
    "        'p_value': float(f_time_p),\n",
    "        'eta_squared': eta_squared_time,\n",
    "        'statistically_significant': bool(f_time_p < 0.05),\n",
    "        'practical_significance': bool(eta_squared_time > 0.06)  # Medium effect\n",
    "    }\n",
    "\n",
    "    print(f\"   F-estatística: {f_time:.4f}\")\n",
    "    print(f\"   p-valor: {f_time_p:.6f}\")\n",
    "    print(f\"   Eta-squared: {eta_squared_time:.3f}\")\n",
    "    print(f\"   Estatisticamente significativo: {'✅ Sim' if f_time_p < 0.05 else '❌ Não'}\")\n",
    "    print(f\"   Praticamente relevante: {'✅ Sim' if eta_squared_time > 0.06 else '❌ Não'}\")\n",
    "\n",
    "    # 4. Insight: correlation between duration and price\n",
    "    print(\"\\n4. VALIDAÇÃO: 'Existe correlação significativa entre duração e preço'\")\n",
    "\n",
    "    correlation_coeff, corr_p = stats.pearsonr(df['duration'], df['price'])\n",
    "\n",
    "    insights_validation['duration_price_correlation'] = {\n",
    "        'hypothesis': 'Duração e preço são significativamente correlacionados',\n",
    "        'correlation_coefficient': float(correlation_coeff),\n",
    "        'p_value': float(corr_p),\n",
    "        'r_squared': float(correlation_coeff**2),\n",
    "        'statistically_significant': bool(corr_p < 0.05),\n",
    "        'practical_significance': bool(abs(correlation_coeff) > 0.3)\n",
    "    }\n",
    "\n",
    "    print(f\"   Coeficiente de correlação: {correlation_coeff:.4f}\")\n",
    "    print(f\"   p-valor: {corr_p:.6f}\")\n",
    "    print(f\"   R-squared: {correlation_coeff**2:.3f}\")\n",
    "    print(f\"   Estatisticamente significativo: {'✅ Sim' if corr_p < 0.05 else '❌ Não'}\")\n",
    "    print(f\"   Praticamente relevante: {'✅ Sim' if abs(correlation_coeff) > 0.3 else '❌ Não'}\")\n",
    "\n",
    "    return insights_validation\n",
    "\n",
    "# Executar validação\n",
    "validated_insights = validate_key_insights()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Análise de Confiabilidade dos Clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def validate_clustering_stability():\n",
    "    \"\"\"Validar a estabilidade e confiabilidade dos clusters\"\"\"\n",
    "    \n",
    "    print(\"\\n🎯 VALIDAÇÃO DA ESTABILIDADE DOS CLUSTERS\")\n",
    "    print(\"=\" * 60)\n",
    "    \n",
    "    if 'cluster' not in df.columns:\n",
    "        print(\"Dados de clustering não disponíveis\")\n",
    "        return None\n",
    "    \n",
    "    # 1. Análise da separação entre clusters\n",
    "    from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score\n",
    "    from sklearn.preprocessing import StandardScaler\n",
    "    \n",
    "    # Preparar dados\n",
    "    features = df[['price', 'duration', 'days_left']].values\n",
    "    scaler = StandardScaler()\n",
    "    features_scaled = scaler.fit_transform(features)\n",
    "    labels = df['cluster'].values\n",
    "    \n",
    "    # Métricas de qualidade do clustering\n",
    "    silhouette_avg = silhouette_score(features_scaled, labels)\n",
    "    calinski_harabasz = calinski_harabasz_score(features_scaled, labels)\n",
    "    davies_bouldin = davies_bouldin_score(features_scaled, labels)\n",
    "    \n",
    "    print(f\"📊 Métricas de Qualidade do Clustering:\")\n",
    "    print(f\"   Silhouette Score: {silhouette_avg:.3f} (melhor > 0.5)\")\n",
    "    print(f\"   Calinski-Harabasz Index: {calinski_harabasz:.2f} (maior = melhor)\")\n",
    "    print(f\"   Davies-Bouldin Index: {davies_bouldin:.3f} (menor = melhor)\")\n",
    "    \n",
    "    # 2. Análise da consistência dos clusters\n",
    "    cluster_consistency = {}\n",
    "    \n",
    "    for cluster_id in sorted(df['cluster'].unique()):\n",
    "        cluster_data = df[df['cluster'] == cluster_id]\n",
    "        \n",
    "        # Calcular coeficiente de variação para cada feature\n",
    "        price_cv = (cluster_data['price'].std() / cluster_data['price'].mean()) * 100\n",
    "        duration_cv = (cluster_data['duration'].std() / cluster_data['duration'].mean()) * 100\n",
    "        \n",
    "        # Analisar composição por airline\n",
    "        airline_composition = cluster_data['airline'].value_counts(normalize=True).to_dict()\n",
    "        dominant_airline = max(airline_composition, key=airline_composition.get)\n",
    "        dominance_pct = airline_composition[dominant_airline] * 100\n",
    "        \n",
    "        cluster_consistency[f'Cluster_{cluster_id}'] = {\n",
    "            'size': len(cluster_data),\n",
    "            'price_cv': price_cv,\n",
    "            'duration_cv': duration_cv,\n",
    "            'dominant_airline': dominant_airline,\n",
    "            'dominance_percentage': dominance_pct,\n",
    "            'avg_price': cluster_data['price'].mean(),\n",
    "            'avg_duration': cluster_data['duration'].mean()\n",
    "        }\n",
    "    \n",
    "    print(f\"\\n📋 Consistência dos Clusters:\")\n",
    "    for cluster_name, metrics in cluster_consistency.items():\n",
    "        print(f\"\\n{cluster_name} (n={metrics['size']})\")\n",
    "        print(f\"   Preço médio: ₹{metrics['avg_price']:,.0f}\")\n",
    "        print(f\"   Duração média: {metrics['avg_duration']:.1f}h\")\n",
    "        print(f\"   CV Preço: {metrics['price_cv']:.1f}% ({'Consistente' if metrics['price_cv'] < 30 else 'Variável'})\")\n",
    "        print(f\"   Airline dominante: {metrics['dominant_airline']} ({metrics['dominance_percentage']:.1f}%)\")\n",
    "    \n",
    "    # 3. Teste estatístico de diferenças entre clusters\n",
    "    print(f\"\\n🔍 Testes de Diferenças Entre Clusters:\")\n",
    "    \n",
    "    # ANOVA para preços entre clusters\n",
    "    cluster_price_groups = [df[df['cluster'] == c]['price'].values for c in sorted(df['cluster'].unique())]\n",
    "    f_cluster, p_cluster = stats.f_oneway(*cluster_price_groups)\n",
    "    \n",
    "    print(f\"   ANOVA (Preços): F = {f_cluster:.4f}, p = {p_cluster:.6f}\")\n",
    "    print(f\"   Clusters diferem significativamente: {'✅ Sim' if p_cluster < 0.05 else '❌ Não'}\")\n",
    "    \n",
    "    return {\n",
    "        'quality_metrics': {\n",
    "            'silhouette_score': silhouette_avg,\n",
    "            'calinski_harabasz_score': calinski_harabasz,\n",
    "            'davies_bouldin_score': davies_bouldin\n",
    "        },\n",
    "        'cluster_consistency': cluster_consistency,\n

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🔍 04 - Insights Validation & Business Intelligence\n",
    "## Validação de Insights e Inteligência de Negócios - Dataset Airlines\n",
    "\n",
    "**Objetivo**: Validar descobertas estatísticas, gerar insights acionáveis e preparar recomendações estratégicas para o dashboard Streamlit.\n",
    "\n",
    "**Autor**: [Seu Nome]  \n",
    "**Data**: [AAAA-MM-DD]  \n",
    "**Versão**: 1.0\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🔧 Setup e Carregamento de Resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importações principais\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import json\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Importações para análises avançadas\n",
    "from scipy import stats\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selection import cross_val_score, train_test_split\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error\n",
    "from sklearn.inspection import permutation_importance\n",
    "\n",
    "# Configurações\n",
    "plt.style.use('seaborn-v0_8')\n",
    "sns.set_palette(\"husl\")\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.precision', 2)\n",
    "\n",
    "print(\"🔍 Ambiente de validação de insights configurado!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data and the results of the previous analyses\n",
    "print(\"📂 CARREGANDO DADOS E RESULTADOS ANTERIORES\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "# Dataset with cluster labels\n",
    "df = pd.read_csv('../data/processed/flights_with_clusters.csv')\n",
    "print(f\"✅ Dataset principal: {df.shape[0]} linhas × {df.shape[1]} colunas\")\n",
    "\n",
    "# Statistical analysis results. The encoding is explicit so the read does not\n",
    "# depend on the platform's default locale encoding.\n",
    "with open('../data/processed/statistical_analysis_results.json', 'r', encoding='utf-8') as f:\n",
    "    stats_results = json.load(f)\n",
    "print(f\"📊 Resultados estatísticos carregados\")\n",
    "\n",
    "# Summary of the initial exploration\n",
    "with open('../data/processed/exploration_summary.json', 'r', encoding='utf-8') as f:\n",
    "    exploration_summary = json.load(f)\n",
    "print(f\"🔍 Resumo da exploração carregado\")\n",
    "\n",
    "print(f\"\\n📋 RESUMO DOS DADOS:\")\n",
    "print(f\"   • Período: {df['days_left'].min()}-{df['days_left'].max()} dias de antecedência\")\n",
    "print(f\"   • Airlines: {df['airline'].nunique()} companhias\")\n",
    "print(f\"   • Faixa de preços: ₹{df['price'].min():,.0f} - ₹{df['price'].max():,.0f}\")\n",
    "print(f\"   • Clusters identificados: {df['cluster'].nunique()}\")\n",
    "\n",
    "# Data quality checks\n",
    "print(f\"\\n🔎 QUALIDADE DOS DADOS:\")\n",
    "print(f\"   • Valores ausentes: {df.isnull().sum().sum()}\")\n",
    "print(f\"   • Duplicatas: {df.duplicated().sum()}\")\n",
    "\n",
    "# The columns used in arithmetic below must have been parsed as numbers.\n",
    "# (df.dtypes.notna() is always True, so it cannot detect a type problem.)\n",
    "non_numeric_cols = [col for col in ('price', 'duration', 'days_left')\n",
    "                    if not pd.api.types.is_numeric_dtype(df[col])]\n",
    "print(\"   • Consistência de tipos: OK\" if not non_numeric_cols else \"   • Problemas de tipos detectados\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🎯 Validação de Insights Principais"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Framework de validação de insights\n",
    "class InsightValidator:\n",
    "    def __init__(self, df, stats_results):\n",
    "        self.df = df\n",
    "        self.stats_results = stats_results\n",
    "        self.insights = []\n",
    "        \n",
    "    def validate_insight(self, hypothesis, test_func, confidence_level=0.95, \n",
    "                        effect_size_threshold=0.2, business_impact='medium'):\n",
    "        \"\"\"\n",
    "        Valida um insight usando critérios estatísticos e de negócio\n",
    "        \"\"\"\n",
    "        result = test_func()\n",
    "        \n",
    "        validation = {\n",
    "            'hypothesis': hypothesis,\n",
    "            'statistical_significance': result.get('p_value', 1) < (1 - confidence_level),\n",
    "            'effect_size': result.get('effect_size', 0),\n",
    "            'practical_significance': abs(result.get('effect_size', 0)) > effect_size_threshold,\n",
    "            'business_impact': business_impact,\n",
    "            'confidence': result.get('confidence', 0),\n",
    "            'sample_size': result.get('sample_size', len(self.df)),\n",
    "            'raw_result': result\n",
    "        }\n",
    "        \n",
    "        # Score de validade (0-100)\n",
    "        score = 0\n",
    "        if validation['statistical_significance']: score += 40\n",
    "        if validation['practical_significance']: score += 30\n",
    "        if validation['sample_size'] > 100: score += 15\n",
    "        if validation['confidence'] > 0.8: score += 15\n",
    "        \n",
    "        validation['validity_score'] = score\n",
    "        validation['validated'] = score >= 70\n",
    "        \n",
    "        self.insights.append(validation)\n",
    "        return validation\n",
    "    \n",
    "    def get_summary(self):\n",
    "        validated = sum(1 for i in self.insights if i['validated'])\n",
    "        total = len(self.insights)\n",
    "        avg_score = np.mean([i['validity_score'] for i in self.insights])\n",
    "        \n",
    "        return {\n",
    "            'total_insights': total,\n",
    "            'validated_insights': validated,\n",
    "            'validation_rate': validated / total if total > 0 else 0,\n",
    "            'average_score': avg_score,\n",
    "            'insights': self.insights\n",
    "        }\n",
    "\n",
    "# Inicializar validador\n",
    "validator = InsightValidator(df, stats_results)\n",
    "print(\"🔍 Framework de validação inicializado\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# INSIGHT 1: significant price differences between airlines\n",
    "print(\"🔍 VALIDANDO INSIGHT 1: Diferenças entre Airlines\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "def test_airline_differences():\n",
    "    \"\"\"Measure the spread of mean prices across airlines.\n",
    "\n",
    "    The p-value and significance flag come from the stored ANOVA entry\n",
    "    stats_results['anova_results']['airlines']; the effect size is the\n",
    "    coefficient of variation of the per-airline mean prices.\n",
    "    \"\"\"\n",
    "    anova_airlines = stats_results['anova_results']['airlines']\n",
    "\n",
    "    # Mean ticket price of each airline\n",
    "    mean_by_airline = df.groupby('airline')['price'].mean()\n",
    "    mean_of_means = mean_by_airline.mean()\n",
    "\n",
    "    # Gap between the most expensive and the cheapest airline\n",
    "    spread = mean_by_airline.max() - mean_by_airline.min()\n",
    "\n",
    "    return {\n",
    "        'p_value': anova_airlines['p'],\n",
    "        'effect_size': mean_by_airline.std() / mean_of_means,\n",
    "        'confidence': 0.95 if anova_airlines['significant'] else 0.5,\n",
    "        'sample_size': len(df),\n",
    "        'price_range': spread,\n",
    "        'relative_difference': spread / mean_of_means,\n",
    "        'cheapest_airline': mean_by_airline.idxmin(),\n",
    "        'most_expensive_airline': mean_by_airline.idxmax()\n",
    "    }\n",
    "\n",
    "insight1 = validator.validate_insight(\n",
    "    hypothesis=\"Existem diferenças significativas nos preços entre companhias aéreas\",\n",
    "    test_func=test_airline_differences,\n",
    "    business_impact='high'\n",
    ")\n",
    "\n",
    "print(f\"Status: {'✅ VALIDADO' if insight1['validated'] else '❌ NÃO VALIDADO'}\")\n",
    "print(f\"Score: {insight1['validity_score']}/100\")\n",
    "print(f\"Diferença de preço: ₹{insight1['raw_result']['price_range']:,.0f}\")\n",
    "print(f\"Mais barata: {insight1['raw_result']['cheapest_airline']}\")\n",
    "print(f\"Mais cara: {insight1['raw_result']['most_expensive_airline']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# INSIGHT 2: premium of direct flights over flights with stops\n",
    "print(\"\\n🔍 VALIDANDO INSIGHT 2: Premium de Voos Diretos\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "def test_direct_flight_premium():\n",
    "    \"\"\"Compare mean prices of 'zero'-stop and 'one'-stop flights.\n",
    "\n",
    "    Cohen's d and the p-value are read from the stored entry\n",
    "    stats_results['anova_results']['stops']; confidence depends on the size of\n",
    "    the smaller of the two groups.\n",
    "    \"\"\"\n",
    "    nonstop = df.loc[df['stops'] == 'zero', 'price']\n",
    "    one_stop = df.loc[df['stops'] == 'one', 'price']\n",
    "    nonstop_mean = nonstop.mean()\n",
    "    one_stop_mean = one_stop.mean()\n",
    "\n",
    "    # Absolute premium; the relative premium is expressed against one-stop prices\n",
    "    gap = nonstop_mean - one_stop_mean\n",
    "\n",
    "    # Significance figures were computed in a previous analysis step\n",
    "    stops_stats = stats_results['anova_results']['stops']\n",
    "    cohen_d = stops_stats['cohens_d']\n",
    "\n",
    "    # Reliability heuristic based on the smaller group\n",
    "    smaller_group = min(len(nonstop), len(one_stop))\n",
    "\n",
    "    return {\n",
    "        'p_value': stops_stats['p'],\n",
    "        'effect_size': abs(cohen_d),\n",
    "        'confidence': 0.9 if smaller_group > 100 else 0.7,\n",
    "        'sample_size': smaller_group,\n",
    "        'premium_absolute': gap,\n",
    "        'premium_relative': gap / one_stop_mean,\n",
    "        'direct_mean': nonstop_mean,\n",
    "        'stops_mean': one_stop_mean\n",
    "    }\n",
    "\n",
    "insight2 = validator.validate_insight(\n",
    "    hypothesis=\"Voos diretos têm premium significativo sobre voos com paradas\",\n",
    "    test_func=test_direct_flight_premium,\n",
    "    business_impact='high'\n",
    ")\n",
    "\n",
    "print(f\"Status: {'✅ VALIDADO' if insight2['validated'] else '❌ NÃO VALIDADO'}\")\n",
    "print(f\"Score: {insight2['validity_score']}/100\")\n",
    "print(f\"Premium: ₹{insight2['raw_result']['premium_absolute']:,.0f} ({insight2['raw_result']['premium_relative']:.1%})\")\n",
    "print(f\"Cohen's d: {insight2['raw_result']['effect_size']:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# INSIGHT 3: natural segmentation of the market into clusters\n",
    "print(\"\\n🔍 VALIDANDO INSIGHT 3: Segmentação de Mercado\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "def test_market_segmentation():\n",
    "    \"\"\"Assess how well the stored clusters separate flights by price.\n",
    "\n",
    "    The silhouette score and cluster count are read from\n",
    "    stats_results['clustering']. The p-value comes from a one-way ANOVA of\n",
    "    price across the clusters found in df['cluster'] rather than from a fixed\n",
    "    constant.\n",
    "    \"\"\"\n",
    "    # Local name differs from sklearn's silhouette_score function to avoid shadowing it\n",
    "    silhouette = stats_results['clustering']['silhouette_score']\n",
    "    n_clusters = stats_results['clustering']['optimal_clusters']\n",
    "\n",
    "    prices_by_cluster = df.groupby('cluster')['price']\n",
    "    cluster_means = prices_by_cluster.mean()\n",
    "    cluster_sizes = prices_by_cluster.size()\n",
    "    overall_mean = df['price'].mean()\n",
    "\n",
    "    # Between-cluster variance. Means and sizes share the cluster-label index,\n",
    "    # so each mean is weighted by the size of its own cluster whatever the label\n",
    "    # values are (a positional index only matches labels numbered 0..k-1).\n",
    "    between_var = ((cluster_means - overall_mean) ** 2 * cluster_sizes).sum() / (len(df) - 1)\n",
    "\n",
    "    # Within-cluster variance: unweighted mean of the per-cluster variances\n",
    "    within_var = np.mean(prices_by_cluster.var().to_numpy())\n",
    "\n",
    "    # Heuristic separation ratio (not the ANOVA F statistic, which is reported separately)\n",
    "    f_ratio = between_var / within_var if within_var > 0 else np.inf\n",
    "\n",
    "    # One-way ANOVA of price across clusters provides the actual p-value\n",
    "    price_groups = [prices.to_numpy() for _, prices in prices_by_cluster]\n",
    "    if len(price_groups) > 1:\n",
    "        f_stat, p_val = stats.f_oneway(*price_groups)\n",
    "    else:\n",
    "        f_stat, p_val = 0, 1\n",
    "\n",
    "    # Balance of the cluster sizes (coefficient of variation)\n",
    "    balance_coefficient = cluster_sizes.std() / cluster_sizes.mean()\n",
    "\n",
    "    return {\n",
    "        'p_value': p_val,\n",
    "        'effect_size': silhouette,\n",
    "        'confidence': 0.9 if silhouette > 0.3 else 0.6,\n",
    "        'sample_size': len(df),\n",
    "        'n_clusters': n_clusters,\n",
    "        'silhouette_score': silhouette,\n",
    "        'f_ratio': f_ratio,\n",
    "        'f_statistic': f_stat,\n",
    "        'balance_coefficient': balance_coefficient,\n",
    "        'cluster_separation': 'excellent' if silhouette > 0.5 else 'good' if silhouette > 0.3 else 'fair'\n",
    "    }\n",
    "\n",
    "insight3 = validator.validate_insight(\n",
    "    \"O mercado possui segmentação natural clara em grupos distintos\",\n",
    "    test_market_segmentation,\n",
    "    business_impact='high'\n",
    ")\n",
    "\n",
    "print(f\"Status: {'✅ VALIDADO' if insight3['validated'] else '❌ NÃO VALIDADO'}\")\n",
    "print(f\"Score: {insight3['validity_score']}/100\")\n",
    "print(f\"Clusters: {insight3['raw_result']['n_clusters']}\")\n",
    "print(f\"Silhouette Score: {insight3['raw_result']['silhouette_score']:.3f}\")\n",
    "print(f\"Separação: {insight3['raw_result']['cluster_separation']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# INSIGHT 4: influence of booking lead time on prices\n",
    "print(\"\\n🔍 VALIDANDO INSIGHT 4: Padrões Temporais\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "def test_temporal_patterns():\n",
    "    \"\"\"Test whether the booking lead time (days_left) relates to price.\n",
    "\n",
    "    Side effect: adds (or overwrites) the 'booking_timing' column on the\n",
    "    notebook-level df.\n",
    "    \"\"\"\n",
    "    # Correlation between days left and price\n",
    "    corr_days_price = df['days_left'].corr(df['price'])\n",
    "\n",
    "    # Bucket bookings by lead time. include_lowest=True keeps same-day bookings\n",
    "    # (days_left == 0) in the first bucket instead of turning them into NaN,\n",
    "    # consistent with the '<= 7' late-booking average computed below.\n",
    "    df['booking_timing'] = pd.cut(df['days_left'], \n",
    "                                 bins=[0, 7, 21, 45, np.inf], \n",
    "                                 labels=['Last Week', '1-3 Weeks', '3-6 Weeks', '6+ Weeks'],\n",
    "                                 include_lowest=True)\n",
    "\n",
    "    # One price array per bucket that actually occurs in the data\n",
    "    timing_groups = [prices.to_numpy() \n",
    "                    for _, prices in df.groupby('booking_timing', observed=True)['price']]\n",
    "\n",
    "    # ANOVA across buckets; with fewer than two buckets there is nothing to compare\n",
    "    if len(timing_groups) > 1:\n",
    "        f_stat, p_val = stats.f_oneway(*timing_groups)\n",
    "    else:\n",
    "        f_stat, p_val = 0, 1\n",
    "\n",
    "    # Early (> 21 days) versus last-week (<= 7 days) average price\n",
    "    early_booking = df[df['days_left'] > 21]['price'].mean()\n",
    "    late_booking = df[df['days_left'] <= 7]['price'].mean()\n",
    "    temporal_premium = abs(early_booking - late_booking) / min(early_booking, late_booking)\n",
    "\n",
    "    return {\n",
    "        'p_value': p_val,\n",
    "        'effect_size': abs(corr_days_price),\n",
    "        'confidence': 0.8 if abs(corr_days_price) > 0.1 else 0.5,\n",
    "        'sample_size': len(df),\n",
    "        'correlation_days_price': corr_days_price,\n",
    "        'temporal_premium': temporal_premium,\n",
    "        'early_booking_avg': early_booking,\n",
    "        'late_booking_avg': late_booking,\n",
    "        'f_statistic': f_stat\n",
    "    }\n",
    "\n",
    "insight4 = validator.validate_insight(\n",
    "    \"Padrões temporais influenciam significativamente os preços dos voos\",\n",
    "    test_temporal_patterns,\n",
    "    business_impact='medium'\n",
    ")\n",
    "\n",
    "print(f\"Status: {'✅ VALIDADO' if insight4['validated'] else '❌ NÃO VALIDADO'}\")\n",
    "print(f\"Score: {insight4['validity_score']}/100\")\n",
    "print(f\"Correlação temporal: {insight4['raw_result']['correlation_days_price']:.3f}\")\n",
    "print(f\"Premium temporal: {insight4['raw_result']['temporal_premium']:.1%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# INSIGHT 5: operational efficiency per airline\n",
    "print(\"\\n🔍 VALIDANDO INSIGHT 5: Eficiência Operacional\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "def test_operational_efficiency():\n",
    "    \"\"\"Compare price per flight hour across airlines.\n",
    "\n",
    "    Side effect: adds (or overwrites) the 'price_per_hour' column on the\n",
    "    notebook-level df. Rows with a non-positive duration get NaN there so that\n",
    "    a division by zero cannot inject inf into the means or the ANOVA.\n",
    "    \"\"\"\n",
    "    # Efficiency = price per hour of flight; non-positive durations are masked\n",
    "    df['price_per_hour'] = df['price'] / df['duration'].where(df['duration'] > 0)\n",
    "\n",
    "    # Mean efficiency per airline (NaN rows are ignored by the mean)\n",
    "    efficiency_mean = df.groupby('airline')['price_per_hour'].mean()\n",
    "\n",
    "    # Gap between the highest and lowest mean price per hour\n",
    "    efficiency_range = efficiency_mean.max() - efficiency_mean.min()\n",
    "    relative_efficiency_diff = efficiency_range / efficiency_mean.mean()\n",
    "\n",
    "    # Correlation between duration and price\n",
    "    duration_price_corr = df['duration'].corr(df['price'])\n",
    "\n",
    "    # ANOVA on price per hour across airlines, using only valid observations\n",
    "    efficiency_groups = [values.dropna().to_numpy() \n",
    "                        for _, values in df.groupby('airline')['price_per_hour']]\n",
    "    efficiency_groups = [group for group in efficiency_groups if len(group) > 0]\n",
    "    if len(efficiency_groups) > 1:\n",
    "        f_eff, p_eff = stats.f_oneway(*efficiency_groups)\n",
    "    else:\n",
    "        f_eff, p_eff = 0, 1\n",
    "\n",
    "    return {\n",
    "        'p_value': p_eff,\n",
    "        'effect_size': relative_efficiency_diff,\n",
    "        'confidence': 0.85 if p_eff < 0.05 else 0.6,\n",
    "        'sample_size': len(df),\n",
    "        'efficiency_range': efficiency_range,\n",
    "        'duration_price_correlation': duration_price_corr,\n",
    "        'most_efficient': efficiency_mean.idxmin(),\n",
    "        'least_efficient': efficiency_mean.idxmax(),\n",
    "        'f_statistic': f_eff\n",
    "    }\n",
    "\n",
    "insight5 = validator.validate_insight(\n",
    "    \"Existem diferenças significativas na eficiência operacional entre airlines\",\n",
    "    test_operational_efficiency,\n",
    "    business_impact='medium'\n",
    ")\n",
    "\n",
    "print(f\"Status: {'✅ VALIDADO' if insight5['validated'] else '❌ NÃO VALIDADO'}\")\n",
    "print(f\"Score: {insight5['validity_score']}/100\")\n",
    "print(f\"Mais eficiente: {insight5['raw_result']['most_efficient']}\")\n",
    "print(f\"Menos eficiente: {insight5['raw_result']['least_efficient']}\")\n",
    "print(f\"Correlação duração-preço: {insight5['raw_result']['duration_price_correlation']:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📈 Modelagem Preditiva para Validação"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Desenvolver modelos preditivos para validar insights\n",
    "print(\"📈 MODELAGEM PREDITIVA PARA VALIDAÇÃO DE INSIGHTS\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "# Preparar dados para modelagem\n",
    "def prepare_modeling_data(df):\n",
    "    \"\"\"\n",
    "    Prepara dados para modelagem preditiva\n",
    "    \"\"\"\n",
    "    model_df = df.copy()\n",
    "    \n",
    "    # Encoding de variáveis categóricas\n",
    "    airline_dummies = pd.get_dummies(model_df['airline'], prefix='airline')\n",
    "    stops_dummies = pd.get_dummies(model_df['stops'], prefix='stops')\n",
    "    time_dummies = pd.get_dummies(model_df['departure_time'], prefix='time')\n",
    "    \n",
    "    # Features numéricas\n",
    "    numeric_features = ['duration', 'days_left']\n",
    "    \n",
    "    # Combinar features\n",
    "    X = pd.concat([\n",
    "        model_df[numeric_features],\n",
    "        airline_dummies,\n",
    "        stops_dummies,\n",
    "        time_dummies\n",
    "    ], axis=1)\n",
    "    \n",
    "    y = model_df['price']\n",
    "    \n",
    "    return X, y, numeric_features\n",
    "\n",
    "X, y, numeric_features = prepare_modeling_data(df)\n",
    "print(f\"✅ Dados preparados: {X.shape[1]} features, {len(y)} observações\")\n",
    "\n",
    "# Split dos dados\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=df['cluster']\n",
    ")\n",
    "\n",
    "print(f\"📊 Train: {len(X_train)}, Test: {len(X_test)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Modelo 1: Regressão Linear (baseline)\n",
    "print(\"🔍 MODELO 1: REGRESSÃO LINEAR\")\n",
    "print(\"=\" * 40)\n",
    "\n",
    "lr_model = LinearRegression()\n",
    "lr_model.fit(X_train, y_train)\n",
    "\n",
    "# Predições\n",
    "y_pred_lr = lr_model.predict(X_test)\n",
    "\n",
    "# Métricas\n",
    "lr_r2 = r2_score(y_test, y_pred_lr)\n",
    "lr_mae = mean_absolute_error(y_test, y_pred_lr)\n",
    "lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))\n",
    "\n",
    "print(f\"R² Score: {lr_r2:.4f}\")\n",
    "print(f\"MAE: ₹{lr_mae:.0f}\")\n",
    "print(f\"RMSE: ₹{lr_rmse:.0f}\")\n",
    "\n",
    "# Importância das features (coeficientes)\n",
    "feature_importance_lr = pd.DataFrame({\n",
    "    'feature': X.columns,\n",
    "    'importance': np.abs(lr_model.coef_)\n",
    "}).sort_values('importance', ascending=False)\n",
    "\n",
    "print(f\"\\n🔝 TOP 5 FEATURES MAIS IMPORTANTES:\")\n",
    "for i, row in feature_importance_lr.head().iterrows():\n",
    "    print(f\"   {row['feature']}: {row['importance']:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#