In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType

In [16]:
# Crear una sesión de Spark
spark = SparkSession.builder.appName("LifeExpectancyPrediction").getOrCreate()

In [17]:
# Cargar los datos de entrenamiento en un DataFrame
data = spark.read.format("parquet").option("header", "true").load("data")

In [18]:
print(data.columns)

['Year', 'Country Code', 'Country Name', 'life_expatancy', 'life_expatancy_male', 'life_expatancy_female', 'ratio_mort_neonatal', 'ratio_mort_one_year', 'ratio_mort_five_year', 'ratio_maternal_mort', 'ratio_mort_in_year', 'ratio_poblation_0_14_year', 'ratio_poblation_15_64_year', 'ratio_poblation_more_64_year', 'ratio_population_growth', 'ratio_mortality_CVD_cancer_diabetes_CRD', 'ratio_infectious_diseases', 'porc_prev_malnutrition', 'doctor_1000_people', 'bed_1000_people', 'porc_water_service', 'porc_sanitation_services', 'porc_electricity', 'porc_internet', 'porc_secundary_school_compl', 'porc_literacy', 'porc_unemployment', 'porc_below_poverty_line', 'PIB_growth_per_capita']


In [19]:
data.printSchema

<bound method DataFrame.printSchema of DataFrame[Year: string, Country Code: string, Country Name: string, life_expatancy: string, life_expatancy_male: string, life_expatancy_female: string, ratio_mort_neonatal: string, ratio_mort_one_year: string, ratio_mort_five_year: string, ratio_maternal_mort: string, ratio_mort_in_year: string, ratio_poblation_0_14_year: string, ratio_poblation_15_64_year: string, ratio_poblation_more_64_year: string, ratio_population_growth: string, ratio_mortality_CVD_cancer_diabetes_CRD: string, ratio_infectious_diseases: string, porc_prev_malnutrition: string, doctor_1000_people: string, bed_1000_people: string, porc_water_service: string, porc_sanitation_services: string, porc_electricity: string, porc_internet: string, porc_secundary_school_compl: string, porc_literacy: string, porc_unemployment: string, porc_below_poverty_line: string, PIB_growth_per_capita: string]>

In [20]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType

# Supongamos que tienes un DataFrame llamado "data" con las columnas "col1", "col2" y "col3" que son de tipo string
columns_to_convert = ['life_expatancy', 'life_expatancy_male', 'life_expatancy_female', 'ratio_mort_neonatal', 'ratio_mort_one_year', 'ratio_mort_five_year', 'ratio_maternal_mort', 'ratio_mort_in_year', 'ratio_poblation_0_14_year', 'ratio_poblation_15_64_year', 'ratio_poblation_more_64_year', 'ratio_population_growth', 'ratio_mortality_CVD_cancer_diabetes_CRD', 'ratio_infectious_diseases', 'porc_prev_malnutrition', 'doctor_1000_people', 'bed_1000_people', 'porc_water_service', 'porc_sanitation_services', 'porc_electricity', 'porc_internet', 'porc_secundary_school_compl', 'porc_literacy', 'porc_unemployment', 'porc_below_poverty_line', 'PIB_growth_per_capita']

for column in columns_to_convert:
    data = data.withColumn(column, col(column).cast(FloatType()))

In [21]:
data.printSchema

<bound method DataFrame.printSchema of DataFrame[Year: string, Country Code: string, Country Name: string, life_expatancy: float, life_expatancy_male: float, life_expatancy_female: float, ratio_mort_neonatal: float, ratio_mort_one_year: float, ratio_mort_five_year: float, ratio_maternal_mort: float, ratio_mort_in_year: float, ratio_poblation_0_14_year: float, ratio_poblation_15_64_year: float, ratio_poblation_more_64_year: float, ratio_population_growth: float, ratio_mortality_CVD_cancer_diabetes_CRD: float, ratio_infectious_diseases: float, porc_prev_malnutrition: float, doctor_1000_people: float, bed_1000_people: float, porc_water_service: float, porc_sanitation_services: float, porc_electricity: float, porc_internet: float, porc_secundary_school_compl: float, porc_literacy: float, porc_unemployment: float, porc_below_poverty_line: float, PIB_growth_per_capita: float]>

In [22]:
# Convertir las columnas seleccionadas a tipo Float
columns_to_convert = ["ratio_mort_neonatal", "ratio_mort_one_year", "ratio_mort_five_year", 'ratio_maternal_mort', 'ratio_mort_in_year']
for column in columns_to_convert:
    data = data.withColumn(column, col(column).cast(FloatType()))

In [23]:
# Convertir la variable categórica "país" a una representación numérica
indexer = StringIndexer(inputCol="Country Code", outputCol="country_index")
data = indexer.fit(data).transform(data)

In [24]:
# Crear el objeto OneHotEncoder
encoder = OneHotEncoder(inputCols=["country_index"], outputCols=["country_encoded"])

In [25]:
# Ajustar y transformar los datos
model = encoder.fit(data)
data = model.transform(data)

In [26]:
# Seleccionar las características y la variable objetivo
feature_columns = ["country_encoded", "ratio_mort_neonatal", "ratio_mort_one_year", "ratio_mort_five_year", 'ratio_maternal_mort', 'ratio_mort_in_year']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features", handleInvalid="keep")
data = assembler.transform(data)

In [27]:
# Seleccionar las columnas finales
data = data.select("Country Name", "features", "life_expatancy").withColumnRenamed("life_expatancy", "label")

In [28]:
# Mostrar modelo machine learning
data.select("Country Name","label").limit(70).pandas_api()



Unnamed: 0,Country Name,label
0,Haití,61.741001
1,Panamá,75.543999
2,Trinidad y Tobago,71.307999
3,Micronesia (Estados Federados de),66.859001
4,Jamaica,71.273003
5,Venezuela,72.161003
6,Kiribati,61.422001
7,Dominica,69.851997
8,Belice,69.887001
9,Suriname,72.241997
