In [2]:
import pyspark
# Reuse the already-running SparkContext when this cell is executed again:
# calling the SparkContext constructor twice in the same kernel raises
# "Cannot run multiple SparkContexts at once". A new local context using all
# available cores ('local[*]') is created only when none exists yet.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [3]:
from pyspark.sql.types import StructType, DateType, IntegerType, StringType, DoubleType, BooleanType

In [4]:
from pyspark.sql.functions import datediff

In [5]:
# SQL entry point bound to the SparkContext created above; used below to read
# the CSV files and to run SQL queries.
sqlContext = pyspark.SQLContext(sparkContext=sc)

Date – The date of the file in yyyy-mm-dd format.

Serial Number – The manufacturer-assigned serial number of the drive.

Model – The manufacturer-assigned model number of the drive.

Capacity – The drive capacity in bytes.

Failure – Contains a “0” if the drive is OK. Contains a “1” if this is the last day the drive was operational before failing.


In [6]:
# Explicit schema applied when reading the CSV files; every column is
# declared nullable.
schema = (
    StructType()
    .add("Date", DateType(), nullable=True)
    .add("Serial_Number", StringType(), nullable=True)
    .add("Model", StringType(), nullable=True)
    .add("Capacity", DoubleType(), nullable=True)
    .add("Failure", IntegerType(), nullable=True)
)

In [7]:
# Configure a CSV reader (header option enabled, comma delimiter) that applies
# the explicit schema, then load everything matched by the glob path.
csv_reader = (
    sqlContext.read
    .option("header", "True")
    .option("delimiter", ",")
    .schema(schema)
)
dia = csv_reader.csv("../bigdata/**")

In [8]:
# Print the schema of the loaded DataFrame (column names, types, nullability).
dia.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Serial_Number: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Capacity: double (nullable = true)
 |-- Failure: integer (nullable = true)



In [9]:
# Expose the DataFrame to SQL under the name "diskData".
# createOrReplaceTempView supersedes the deprecated registerTempTable and
# replaces any existing view of the same name, so the cell can be re-run.
dia.createOrReplaceTempView("diskData")

In [10]:
# List the tables/views known to this SQL context to confirm the
# registration above.
registered_tables = sqlContext.sql('show tables')
registered_tables.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        | diskdata|       true|
+--------+---------+-----------+



Cantidad de discos por modelo que existen.

In [None]:
# Number of distinct drives per model, largest first. A drive is identified
# by its serial number, so distinct serials are counted within each model
# (grouping by Model, Serial_Number would instead return the number of rows
# recorded for each individual drive).
# The result is collapsed to a single partition and written as parquet,
# overwriting any previous output at that path.
sqlContext.sql('Select Count(DISTINCT Serial_Number) as Cuenta, Model from diskData group by Model order by Cuenta desc')\
          .coalesce(1)\
          .write.format("parquet")\
          .save("output/parquet/transacciones", mode="OVERWRITE")

In [16]:
# Per model: average, over its drives, of the number of days between each
# drive's first and last recorded Date, cast to two decimal places.
avg_lifespan_query = (
    'Select CAST(AVG(Diff) AS DECIMAL(10,2)), Model '
    'from (Select DATEDIFF(Max(Date), Min(Date)) as Diff, Serial_Number, Model '
    'from diskData group by Serial_Number, Model order by Model) As TableB '
    'Group by Model'
)
sqlContext.sql(avg_lifespan_query).show()

+--------------------------------+--------------------+
|CAST(avg(Diff) AS DECIMAL(10,2))|               Model|
+--------------------------------+--------------------+
|                         1353.36|         ST9250315AS|
|                         1575.19|         ST4000DM000|
|                          299.85|        WDC WD30EZRX|
|                          327.00|      WDC WD5000AAJS|
|                          793.39|       ST12000NM0007|
|                          733.59|         ST8000DM005|
|                          777.20|          ST320LT007|
|                         1021.00|         ST3500320AS|
|                          850.20| TOSHIBA MQ01ABF050M|
|                          951.67|      WDC WD5002ABYS|
|                         1152.00|      WDC WD2500BEVT|
|                         1142.29|        ST8000NM0055|
|                         1487.26|Hitachi HDS5C3030...|
|                          313.75|         ST2000DM001|
|                          568.50|Seagate BarraC

Cuántos días duran los discos duros según su modelo en promedio

Cantidad de GB por modelo de disco duro

In [None]:
Modelo de disco más confiable por año

Modelo de disco menos confiable por año

In [None]:
# Row count for each (Model, Serial_Number) group, largest first, showing up
# to 150 rows. No filter is applied, so the query has no WHERE clause (a
# WHERE keyword without a predicate does not parse).
# NOTE(review): DISTINCT collapses groups that share the same (Cuenta, Model)
# pair — confirm that is intended.
sqlContext.sql('select DISTINCT count(1) as Cuenta, Model from diskData group by Model, Serial_Number order by Cuenta desc').show(150)

In [None]:
# Number of rows recorded per serial number, highest count first
# (up to 150 rows displayed).
rows_per_serial = sqlContext.sql(
    'select Serial_Number, count(1) from diskData group by Serial_Number order by count(1) desc'
)
rows_per_serial.show(150)

In [None]:
# Rows flagged as failures (Failure = 1) whose Date falls in February 2020.
failures_feb_2020 = sqlContext.sql(
    'select Date, Serial_Number, Model, Capacity, Failure from diskData '
    'where Failure = 1 and month(Date) = 2 and year(Date)=2020'
)
failures_feb_2020.show()

In [None]:
# Keep only the date and serial-number columns of the loaded data.
selectedData = dia.select(["Date", "Serial_Number"])

In [None]:
# Write the selection as CSV under 'test2.csv'. The default save mode raises
# an error when the path already exists, which breaks a second run of the
# notebook; overwrite instead, matching the parquet export earlier on.
selectedData.write.csv('test2.csv', mode="overwrite")