# Tarea Apache Spark
>Importa las librerías necesarias dónde sea necesario


In [1]:
import pandas as pd 
from pyspark.sql import SparkSession

### SparkSession
>Crea un SparkSession para comenzar la tarea

In [3]:
spark = SparkSession.builder.appName("tarea").getOrCreate()

### Crear un DataFrame
>Lee el csv datosTarea.csv, mételo a un DF y muéstralo.

In [6]:
dfcsv = spark.read.csv('datosTarea.csv', header=True, inferSchema=True)
dfcsv.show()

+-----+---------------+--------------------+--------------------+--------------------+--------------------+-------+--------------------+-------------------+--------+-----------+------+
|Index|Organization Id|                Name|             Website|             Country|         Description|Founded|            Industry|Number of employees|Networth|stock_price|profit|
+-----+---------------+--------------------+--------------------+--------------------+--------------------+-------+--------------------+-------------------+--------+-----------+------+
|    1|FAB0d41d5b5d22c|         Ferrell LLC|  https://price.net/|    Papua New Guinea|Horizontal empowe...|   1990|            Plastics|               3498|  402269|         33| 12125|
|    2|6A7EdDEA9FaDC52|Mckinney, Riley a...|http://www.hall-b...|             Finland|User-centric syst...|   2015|Glass / Ceramics ...|               4952|  569480|         49| 12001|
|    3|0bFED1ADAE4bcC1|          Hester Ltd|http://sullivan-r...|          

### Filtro de datos
>Consigue todas las empresas que empiecen con 'M' y tengan entre 4000 y 7000 empleados. Sólo muestra los nombres y el número de empleados.

In [7]:
from pyspark.sql.functions import col

df_filtered = dfcsv.filter(
    (col('Name').startswith('M')) & 
    (col('Number of employees').between(4000, 7000))
)

df_result = df_filtered.select('Name', 'Number of employees')
df_result.show()

+--------------------+-------------------+
|                Name|Number of employees|
+--------------------+-------------------+
|Mckinney, Riley a...|               4952|
|       Mcintosh-Mora|               4389|
|     Mckenzie-Melton|               4589|
|          Massey LLC|               5004|
|        Mays-Preston|               5786|
+--------------------+-------------------+



>Consigue todos los países que no inicien con las letras 'b', 's' y 'm', pero que tampoco tengan un netword mayor a 500000. Muestra el nombre de la compañía, el país y el networth.

In [8]:
df_filtered = dfcsv.filter(
    (~col('Country').startswith('b')) & 
    (~col('Country').startswith('s')) & 
    (~col('Country').startswith('m')) & 
    (col('Networth') <= 500000)
)

df_result = df_filtered.select('Name', 'Country', 'Networth')
df_result.show()

+--------------------+--------------------+--------+
|                Name|             Country|Networth|
+--------------------+--------------------+--------+
|         Ferrell LLC|    Papua New Guinea|  402269|
|      Holder-Sellers|        Turkmenistan|  105914|
|Keller, Campos an...|             Liberia|  329130|
|      Pacheco-Spears|              Sweden|   88435|
|         Harrell LLC|          Guadeloupe|  251274|
|         Jenkins Inc|        South Africa|  139725|
|Dickson, Richmond...|      Czech Republic|  359030|
|        Prince-David|    Christmas Island|  120289|
|         Rivas Group|           Australia|  477824|
|Sloan, Mays and W...|                Chad|   41975|
|Glass, Barrera an...|     Kyrgyz Republic|  300150|
|          Pineda-Cox|             Bolivia|  150880|
|Baker, Mccann and...|               Kenya|  188370|
|            Hahn PLC|             Belarus|  427224|
|Valentine, Fergus...|              Jersey|  412274|
|           Walls LLC|          Cape Verde|  1

### Funciones
Crea una función con @pandas_udf que que le reste a los profits la media en cada renglón. Crea una nueva columna que muestre los resultados.

In [11]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql import types as T

@pandas_udf(T.DoubleType())
def adjust_profits(profit):
    return profit - profit.mean()

dfcsv = dfcsv.withColumn('profits_adjusted', adjust_profits(dfcsv['Profit']))
df = dfcsv.select('Name', 'profits_adjusted')
df.show()

                                                                                

+--------------------+----------------+
|                Name|profits_adjusted|
+--------------------+----------------+
|         Ferrell LLC|          -194.0|
|Mckinney, Riley a...|          -318.0|
|          Hester Ltd|          4802.0|
|      Holder-Sellers|         -4119.0|
|         Mayer Group|         -3720.0|
|      Henry-Thompson|         -6930.0|
|      Hansen-Everett|         -6619.0|
|       Mcintosh-Mora|         -6489.0|
|            Carr Inc|          3467.0|
|          Gaines Inc|          2516.0|
|          Kidd Group|          3105.0|
|        Crane-Clarke|         -4022.0|
|Keller, Campos an...|          4724.0|
|         Glover-Pope|          5490.0|
|      Pacheco-Spears|           358.0|
|         Hodge-Ayers|          3478.0|
|Bowers, Guerra an...|           211.0|
|     Mckenzie-Melton|          4639.0|
|         Branch-Mann|         -3574.0|
|      Weiss and Sons|          5760.0|
+--------------------+----------------+
only showing top 20 rows



### Grouping data
>Agrupa por industry y muestra cuáles son las empresas con el profit más alto. Muestra los primeros tres.

In [22]:
from pyspark.sql.functions import col, max

df_grouped = dfcsv.groupBy('Industry').agg(max('Profit').alias('max_profit'))
df_grouped = df_grouped.withColumnRenamed('Industry', 'Industry1')

df_joined = dfcsv.join(df_grouped, (dfcsv.Industry == df_grouped.Industry1) & (dfcsv.profit == df_grouped.max_profit))

df_result = df_joined.select('Industry', 'max_profit')
df_result.orderBy(col('max_profit').desc()).show(3)

+--------------------+----------+
|            Industry|max_profit|
+--------------------+----------+
|  Legislative Office|     19363|
|Museums / Institu...|     19079|
|             Farming|     18850|
+--------------------+----------+
only showing top 3 rows



>Agrupa por industry y calcula el promedio de empleados que tienen

In [24]:
from pyspark.sql.functions import avg

df_grouped = dfcsv.groupBy('Industry').agg(avg('Number of employees').alias('avg_empleados'))
df_grouped.show()

+--------------------+-----------------+
|            Industry|    avg_empleados|
+--------------------+-----------------+
|Primary / Seconda...|6457.666666666667|
|     Broadcast Media|           2589.0|
|           Wholesale|           5010.0|
|Investment Manage...|           3133.5|
|    Food / Beverages|           9011.0|
|  Gambling / Casinos|           4873.0|
|Logistics / Procu...|           4155.0|
|            Maritime|            769.0|
|            Wireless|           6146.0|
|Education Management|            339.0|
|       Arts / Crafts|           2800.0|
|           Insurance|           1215.0|
|  Financial Services|           5157.0|
|Business Supplies...|           9097.0|
|Consumer Electronics|           5022.0|
|       Public Safety|           5287.0|
|Information Techn...|           3934.0|
|Civic / Social Or...|           2442.0|
|      Consumer Goods|           9069.0|
|Glass / Ceramics ...|           4952.0|
+--------------------+-----------------+
only showing top

### SQL
>Usando Spark SQL, obtén cuántas empresas se fundaron despúes del 2000.

In [25]:
dfcsv.createOrReplaceTempView('companies')

result = spark.sql("""
SELECT COUNT(*) as count
FROM companies
WHERE Founded > 2000
""")

result.show()

+-----+
|count|
+-----+
|   38|
+-----+



### ML Regresión Lineal
>Con número de empleados, networth y stock price, obtén una predicción del profit a través de una regresión lineal.

In [36]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=["Number of employees","Networth", "stock_price"],outputCol="Independent Features")
output=featureassembler.transform(dfcsv)
df_result = output.select('Name', 'Number of employees', 'Networth', 'stock_price', 'Independent Features')
df_result.show()

finalized_data=output.select("Independent Features","Networth")

from pyspark.ml.regression import LinearRegression

train_data,test_data=finalized_data.randomSplit([0.75, 0.25])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='Networth')
regressor=regressor.fit(train_data)

predictions = regressor.transform(test_data)

pred_results=regressor.evaluate(test_data)

predictions = pred_results.predictions
predictions.show()

+--------------------+-------------------+--------+-----------+--------------------+
|                Name|Number of employees|Networth|stock_price|Independent Features|
+--------------------+-------------------+--------+-----------+--------------------+
|         Ferrell LLC|               3498|  402269|         33|[3498.0,402269.0,...|
|Mckinney, Riley a...|               4952|  569480|         49|[4952.0,569480.0,...|
|          Hester Ltd|               5287|  608005|         26|[5287.0,608005.0,...|
|      Holder-Sellers|                921|  105914|         23|[921.0,105914.0,2...|
|         Mayer Group|               7870|  905049|         31|[7870.0,905049.0,...|
|      Henry-Thompson|               4914|  565110|         49|[4914.0,565110.0,...|
|      Hansen-Everett|               7832|  900679|         13|[7832.0,900679.0,...|
|       Mcintosh-Mora|               4389|  504734|         19|[4389.0,504734.0,...|
|            Carr Inc|               8167|  939204|         16|[8

23/12/09 16:37:40 WARN Instrumentation: [95829425] regParam is zero, which might cause numerical instability and overfitting.


+--------------------+--------+------------------+
|Independent Features|Networth|        prediction|
+--------------------+--------+------------------+
|[602.0,69230.0,31.0]|   69230|  69229.9999999994|
|[1046.0,120289.0,...|  120289|120288.99999999933|
|[1215.0,139725.0,...|  139725|139724.99999999919|
|[1678.0,192969.0,...|  192969|192968.99999999956|
|[1746.0,200789.0,...|  200789| 200788.9999999999|
|[2988.0,343620.0,...|  343620| 343619.9999999995|
|[3122.0,359030.0,...|  359030|359029.99999999936|
|[3715.0,427224.0,...|  427224| 427223.9999999993|
|[3816.0,438839.0,...|  438839|438839.00000000023|
|[3934.0,452409.0,...|  452409|452408.99999999924|
|[4155.0,477824.0,...|  477824| 477823.9999999998|
|[4942.0,568330.0,...|  568330| 568330.0000000003|
|[5130.0,589950.0,...|  589950| 589949.9999999992|
|[5571.0,640665.0,...|  640665| 640664.9999999994|
|[6135.0,705525.0,...|  705525| 705525.0000000003|
|[6146.0,706790.0,...|  706790| 706789.9999999993|
|[6168.0,709320.0,...|  709320|

>Una vez que obtengas los resultados, a través del api de pandas, conviértelo en un pandas on spark DataFrame y pásalo a csv.

In [45]:
pandas_df = predictions.toPandas()

pandas_df.to_csv('predictions.csv', index = False)