In [176]:
# Imports (nothing is installed in this cell)
import pandas as pd
import numpy as np
import pylab as plt   
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
import warnings
# Globally silence every Python warning so cell outputs stay clean.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings — consider filtering specific categories instead.
warnings.filterwarnings('ignore')   # suppress warning printouts

In [177]:
# Load the two UNICEF extracts (comma-separated, UTF-8 encoded) from the local data folder.
data = pd.read_csv(
    'data/fusion_MG_UNICEF_1.0_all.csv',
    sep=',',
    encoding='utf-8',
)
dataglob = pd.read_csv(
    'data/fusion_GLOBAL_DATAFLOW_UNICEF_1.0_all.csv',
    sep=',',
    encoding='utf-8',
)

In [178]:
# Summarise the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37699 entries, 0 to 37698
Data columns (total 18 columns):
 #   Column                                                                          Non-Null Count  Dtype  
---  ------                                                                          --------------  -----  
 0   DATAFLOW                                                                        37699 non-null  object 
 1   REF_AREA:Geographic area                                                        37699 non-null  object 
 2   INDICATOR:Indicator                                                             37699 non-null  object 
 3   AGE:Current age                                                                 37699 non-null  object 
 4   STAT_POP:Statistical Population                                                 37699 non-null  object 
 5   TIME_PERIOD:Time period                                                         37699 non-null  int64  
 6   OBS_VALUE:Obse

In [179]:
# Keep only the columns that have no missing values at all.
df = data.dropna(axis='columns')

In [180]:
# Inspect the first rows to verify the result
df.head()

Unnamed: 0,DATAFLOW,REF_AREA:Geographic area,INDICATOR:Indicator,AGE:Current age,STAT_POP:Statistical Population,TIME_PERIOD:Time period,OBS_VALUE:Observation Value,UNIT_MEASURE:Unit of measure,SOURCE_LINK:Citation of or link to the data source,DATA_SOURCE:Data Source,OBS_FOOTNOTE:Observation footnote
0,UNICEF:MG(1.0): Migration,BDI: Burundi,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,_T: Total,1990,333,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
1,UNICEF:MG(1.0): Migration,BDI: Burundi,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,_T: Total,1995,255,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
2,UNICEF:MG(1.0): Migration,BDI: Burundi,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,_T: Total,2000,126,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
3,UNICEF:MG(1.0): Migration,BDI: Burundi,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,_T: Total,2005,193,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
4,UNICEF:MG(1.0): Migration,BDI: Burundi,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,_T: Total,2010,247,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...


In [181]:
# Discard the aggregate rows: any row whose "AGE" value contains '_T: Total'
# (matched case-insensitively), keeping only age-specific observations.
df = df.loc[
    ~df['AGE:Current age'].str.contains('_T: Total', case=False)
]

In [182]:
# Remove the "DATAFLOW" column
df = df.drop(columns="DATAFLOW")

In [183]:
# Frequency of each distinct value in 'UNIT_MEASURE:Unit of measure'
unit_measure_counts = df.loc[:, 'UNIT_MEASURE:Unit of measure'].value_counts()
print(unit_measure_counts)

PS: Persons       8286
NUMBER: Number    6351
Name: UNIT_MEASURE:Unit of measure, dtype: int64


In [184]:
# Keep only the observations measured in persons ('PS: Persons').
df = df.loc[df['UNIT_MEASURE:Unit of measure'].eq('PS: Persons')]

# Preview the filtered frame
df.head()

Unnamed: 0,REF_AREA:Geographic area,INDICATOR:Indicator,AGE:Current age,STAT_POP:Statistical Population,TIME_PERIOD:Time period,OBS_VALUE:Observation Value,UNIT_MEASURE:Unit of measure,SOURCE_LINK:Citation of or link to the data source,DATA_SOURCE:Data Source,OBS_FOOTNOTE:Observation footnote
1742,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,1990,15,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
1743,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,1995,18,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
1744,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,2000,19,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
1745,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,2005,22,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...
1746,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,2010,25,PS: Persons,https://www.un.org/development/desa/pd/content...,United Nations Department of Economic and Soci...,235 countries/areas where migration data avail...


In [185]:
# Reshape the long-format table to wide format: one row per
# (area, indicator, age, population) combination and one column per time period.
# NOTE(review): this pivots the raw `data`, not the cleaned `df` — confirm that is intended.
data_pivot = (
    data
    .pivot(
        index=[
            'REF_AREA:Geographic area',
            'INDICATOR:Indicator',
            'AGE:Current age',
            'STAT_POP:Statistical Population',
        ],
        columns='TIME_PERIOD:Time period',
        values='OBS_VALUE:Observation Value',
    )
    .reset_index()  # turn the identifying index levels back into regular columns
)

# Preview the reshaped table
data_pivot.head()

TIME_PERIOD:Time period,REF_AREA:Geographic area,INDICATOR:Indicator,AGE:Current age,STAT_POP:Statistical Population,1990,1995,2000,2001,2002,2003,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,ABW: Aruba,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,3.0,4.0,5.0,,,,...,,,,3.0,,,,,11.0,
1,ABW: Aruba,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,SH_NAT_POP: Share of National Total Population,23.0,28.0,33.0,,,,...,,,,35.0,,,,,50.0,
2,ABW: Aruba,MG_INTNL_MG_CNTRY_DEST: International migrants...,_T: Total,_T: Total,14.0,22.0,30.0,,,,...,,,,36.0,,,,,54.0,
3,ABW: Aruba,"MG_RFGS_CNTRY_ASYLM: Refugees, by country of a...",_T: Total,_T: Total,,,,,,,...,,,,,,,,,,17000.0
4,ABW: Aruba,MG_RFGS_CNTRY_ASYLM_PER1000: Refugees by host ...,_T: Total,_T: Total,,,,,,,...,,,,,,,,,,160.0


In [186]:
# Drop the source-link column
df = df.drop(columns='SOURCE_LINK:Citation of or link to the data source')

In [187]:
# Drop the data-source column
df = df.drop(columns='DATA_SOURCE:Data Source')

In [188]:
# Drop the observation-footnote column
df = df.drop(columns='OBS_FOOTNOTE:Observation footnote')

In [189]:
# Drop the unit-of-measure column
df = df.drop(columns='UNIT_MEASURE:Unit of measure')

In [190]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,REF_AREA:Geographic area,INDICATOR:Indicator,AGE:Current age,STAT_POP:Statistical Population,TIME_PERIOD:Time period,OBS_VALUE:Observation Value
1742,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,1990,15
1743,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,1995,18
1744,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,2000,19
1745,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,2005,22
1746,AFG: Afghanistan,MG_INTNL_MG_CNTRY_DEST: International migrants...,Y0T17: Under 18 years old,_T: Total,2010,25


In [191]:
# Write the cleaned frame to a CSV file in the working directory, without the row index.
df.to_csv(path_or_buf='nuevo_archivo.csv', index=False)

In [None]:
from sklearn.linear_model import LinearRegression

# Keep only the rows whose age group is exactly 'Y0T17: Under 18 years old'
df_filtered = df[df['AGE:Current age'].eq('Y0T17: Under 18 years old')]


In [None]:
# Predict the 2022 value of every series with a per-series linear trend.
#
# `df_filtered` is in long format (one row per series and time period), so there are
# no per-year columns to slice by position. Each series is rebuilt by grouping on
# its identifying columns, and the observed value is regressed on the year.
columnas_serie = [
    'REF_AREA:Geographic area',
    'INDICATOR:Indicator',
    'AGE:Current age',
    'STAT_POP:Statistical Population',
]
columna_anio = 'TIME_PERIOD:Time period'
columna_valor = 'OBS_VALUE:Observation Value'
anio_objetivo = 2022


def predecir_tendencia(serie, anio):
    """Fit a straight line (value ~ year) to one series and evaluate it at `anio`.

    Parameters
    ----------
    serie : pandas.DataFrame
        Rows belonging to a single series; must contain the year and value columns.
    anio : int
        Year for which the value is predicted.

    Returns
    -------
    float
        Fitted value at `anio`, or NaN when the series has fewer than two
        distinct observed years (a trend cannot be estimated from one point).
    """
    # Coerce to numeric so stray non-numeric entries become NaN and are dropped
    obs = (
        serie[[columna_anio, columna_valor]]
        .apply(pd.to_numeric, errors='coerce')
        .dropna()
    )
    if obs[columna_anio].nunique() < 2:
        return np.nan

    modelo = LinearRegression()
    # scikit-learn expects a 2-D feature matrix: (n_samples, 1)
    modelo.fit(
        obs[[columna_anio]].to_numpy(dtype=float),
        obs[columna_valor].to_numpy(dtype=float),
    )
    return float(modelo.predict(np.array([[anio]], dtype=float))[0])


# One prediction per series, keyed by the tuple of identifying values
prediccion_por_serie = {
    clave: predecir_tendencia(grupo, anio_objetivo)
    for clave, grupo in df_filtered.groupby(columnas_serie, sort=False)
}

# One entry per row of df_filtered; every row of a series carries that series' prediction
predicciones_2022 = [
    prediccion_por_serie[clave]
    for clave in df_filtered[columnas_serie].itertuples(index=False, name=None)
]

# `assign` returns a new frame, avoiding chained assignment on a slice of `df`
df_filtered = df_filtered.assign(Prediccion_2022=predicciones_2022)

# Show a preview instead of dumping the whole frame
df_filtered.head()