In [1]:
!pip install -q cassandra-driver

In [3]:
from cassandra.cluster import Cluster

In [9]:
cluster = Cluster(['localhost'])

In [17]:
session = cluster.connect('projeto02')

In [18]:
linhas = session.execute('select * from TB_CONSULTA1 limit 5;')
for linha in linhas:
    print(linha)

In [19]:
cassandra_keyspace = "projeto02"

In [29]:
#Dataset de clientes
df_clientes = spark.read.csv("/home/felipe/Projeto2/dados/Clientes.csv", header = True, inferSchema = True)

In [30]:
df_clientes.show(5)

+----------+-------------------+--------------------+---------------+-----------+--------------------+--------------------+-----------------+------------+--------------------+--------+------+-----+
|ID_Cliente|               Nome|            Endereco|Data_Nascimento|     Cidade|       Data_Cadastro|               Email|         Latitude|   Longitude|               Senha|   Fonte|Estado|  Zip|
+----------+-------------------+--------------------+---------------+-----------+--------------------+--------------------+-----------------+------------+--------------------+--------+------+-----+
|         1|       Hudson Borer|9611-9809 West Ro...|     1986-12-12| Wood River|2017-10-06 17:04:...|borer-hudson@test...|40.71314890000001| -98.5259864|ccca881f-3e4b-4e5...| Twitter|    SP|68883|
|         2|Domenica Williamson|      101 4th Street|     1967-06-10|  Searsboro|2018-04-09 03:40:...|williamson-domeni...|       41.5813224| -92.6991321|eafc45bf-cf8e-4c9...|Afiliado|    SC|50242|
|         

In [32]:
#Dataset de pedidos
df_pedidos = spark.read.csv("/home/felipe/Projeto2/dados/Pedidos.csv", header = True, inferSchema = True)

In [33]:
df_pedidos.show(5)

+---------+--------------------+-----------------+----------+----------+------------------+-------+------------------+----------+
|ID_Pedido|         Data_Pedido|         Desconto|ID_Produto|Quantidade|          Subtotal|Imposto|             Total|ID_Cliente|
+---------+--------------------+-----------------+----------+----------+------------------+-------+------------------+----------+
|        1|2019-02-11 14:10:...|             null|        14|         2|37.648145389078365|   2.07|39.718145389078366|         1|
|        2|2018-05-14 23:34:...|             null|       123|         3|110.93145648834248|    6.1| 117.0376564084763|         1|
|        3|2019-12-06 13:52:...|6.416679208849759|       105|         2|52.723521442619514|    2.9| 55.62208681964182|         1|
|        4|2019-08-22 08:00:...|             null|        94|         6|109.21864156655383|   6.01| 115.2207354961295|         1|
|        5|2018-10-09 19:04:...|             null|       132|         5|127.88197029833711

In [None]:
##Juncao de tabelas

In [42]:
#Converte dados da tabela clientes
for i in df_clientes.columns:
    df_clientes = df_clientes.withColumnRenamed(i,i.lower())

In [43]:
#Converte os dados da tabela pedidos
for i in df_pedidos.columns:
    df_pedidos = df_pedidos.withColumnRenamed(i, i.lower())

In [45]:
#Realiza Inner Join das tabelas clientes e pedidos
df_join = df_pedidos.join(df_clientes, df_pedidos.id_cliente==df_clientes.id_cliente, how = 'inner')

In [46]:
#Schema
df_join.printSchema()

root
 |-- id_pedido: integer (nullable = true)
 |-- data_pedido: timestamp (nullable = true)
 |-- desconto: double (nullable = true)
 |-- id_produto: integer (nullable = true)
 |-- quantidade: integer (nullable = true)
 |-- subtotal: double (nullable = true)
 |-- imposto: double (nullable = true)
 |-- total: double (nullable = true)
 |-- id_cliente: integer (nullable = true)
 |-- id_cliente: integer (nullable = true)
 |-- nome: string (nullable = true)
 |-- endereco: string (nullable = true)
 |-- data_nascimento: string (nullable = true)
 |-- cidade: string (nullable = true)
 |-- data_cadastro: timestamp (nullable = true)
 |-- email: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- senha: string (nullable = true)
 |-- fonte: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- zip: integer (nullable = true)



In [47]:
#Agregacoes
from pyspark.sql.functions import col

In [48]:
#Agregação de consulta pela fonte agrupando pela somatória do total de vendas
df_consulta1 = df_join.groupBy('fonte') \
.agg({'total':'sum'}).select('fonte', col('sum(total)').alias('total_vendas'))

In [50]:
df_consulta1.show(5)

[Stage 19:>                                                         (0 + 1) / 1]

+--------+-----------------+
|   fonte|     total_vendas|
+--------+-----------------+
| Twitter|319448.7269814239|
|Afiliado|297605.2749875903|
|  Google|325183.5607881129|
|Organico|319637.0497447002|
|Facebook|333453.5126582293|
+--------+-----------------+



                                                                                

In [67]:
import uuid

In [77]:
#Inserindo registro
session.execute(
    """
    INSERT INTO TB_CONSULTA1(ID, FONTE, TOTAL_VENDAS)
    VALUES(%s, %s, %s)
    """,
    (uuid.uuid1(), "Twitter",319448.72)
)

<cassandra.cluster.ResultSet at 0x7fe00a887c40>

In [78]:
#Consultando a tabela
linhas = session.execute('select * from TB_CONSULTA1')
for linha in linhas:
    print(linha)

Row(id=UUID('021ce460-7bf9-11ec-af24-080027e8eeed'), fonte='Twitter', total_vendas=319448.72)


In [85]:
#Coletando linhas do dataframe
rows = df_consulta1.collect()

#loop para imprimir linhas
for row in rows:
    print(row.fonte)
    print(row.total_vendas)

Twitter
319448.7269814239
Afiliado
297605.2749875903
Google
325183.5607881129
Organico
319637.0497447002
Facebook
333453.5126582293


In [86]:
#Inserindo registro
for i in rows:
    session.execute(
    """
    INSERT INTO TB_CONSULTA1(ID, FONTE, TOTAL_VENDAS)
    VALUES(%s, %s, %s)
    """,
    (uuid.uuid1(), i.fonte, i.total_vendas)
    )

In [87]:
#Consultando a tabela
linhas = session.execute('select * from TB_CONSULTA1')
for linha in linhas:
    print(linha)

Row(id=UUID('f88b5c5c-7bf9-11ec-8bd6-080027e8eeed'), fonte='Afiliado', total_vendas=297605.2749875903)
Row(id=UUID('f88ecef4-7bf9-11ec-81de-080027e8eeed'), fonte='Facebook', total_vendas=333453.5126582293)
Row(id=UUID('f88e35f8-7bf9-11ec-89c0-080027e8eeed'), fonte='Organico', total_vendas=319637.0497447002)
Row(id=UUID('f8884013-7bf9-11ec-9eed-080027e8eeed'), fonte='Twitter', total_vendas=319448.7269814239)
Row(id=UUID('f88c5cf3-7bf9-11ec-b47e-080027e8eeed'), fonte='Google', total_vendas=325183.5607881129)


In [53]:
#Agregação de consulta pelo estado agrupando pela somatória do total de vendas
df_consulta2 = df_join.groupBy('estado') \
.agg({'total':'sum'}).select('estado', col('sum(total)').alias('total_vendas'))

In [88]:
df_consulta2.show(5)

+------+------------------+
|estado|      total_vendas|
+------+------------------+
|    SC|  78672.3796432496|
|    PI|60885.920881314254|
|    AM| 32415.70034927819|
|    NJ| 6768.687499559493|
|    RR| 35598.05678620004|
+------+------------------+
only showing top 5 rows



In [99]:
#Coletando todas as linhas do dataframe
rows2 = df_consulta2.collect()

In [100]:
#Inserindo registro
for i in rows2:
    session.execute(
    """
    INSERT INTO TB_CONSULTA2(ID, ESTADO, TOTAL_VENDAS)
    VALUES(%s, %s, %s)
    """,
    (uuid.uuid1(), i.estado, i.total_vendas)
    )

In [103]:
#Consultando a tabela
linhas = session.execute('select * from TB_CONSULTA2')
for linha in linhas:
    print(linha)

Row(id=UUID('f68cecf9-7bfb-11ec-90f0-080027e8eeed'), estado='NM', total_vendas=12484.900805231957)
Row(id=UUID('f675c560-7bfb-11ec-955d-080027e8eeed'), estado='NC', total_vendas=48834.20587334776)
Row(id=UUID('1423b94c-7bfc-11ec-8a7b-080027e8eeed'), estado='AM', total_vendas=32415.70034927819)
Row(id=UUID('f6794369-7bfb-11ec-8950-080027e8eeed'), estado='MD', total_vendas=7031.354914905968)
Row(id=UUID('1435a033-7bfc-11ec-84c3-080027e8eeed'), estado='TN', total_vendas=37629.65108567451)
Row(id=UUID('f6a665b9-7bfb-11ec-8337-080027e8eeed'), estado='UT', total_vendas=7451.060411854994)
Row(id=UUID('14444080-7bfc-11ec-b76a-080027e8eeed'), estado='AK', total_vendas=36984.23330876673)
Row(id=UUID('f68e7bf2-7bfb-11ec-8a00-080027e8eeed'), estado='PA', total_vendas=35051.89459979438)
Row(id=UUID('f68610d3-7bfb-11ec-bbeb-080027e8eeed'), estado='MS', total_vendas=72978.18665670906)
Row(id=UUID('f6749bb8-7bfb-11ec-98aa-080027e8eeed'), estado='CT', total_vendas=2206.771524765544)
Row(id=UUID('f69022

In [63]:
#Aregação de consulta por média aritmética de desconto
df_consulta3 = df_join.groupBy('estado', 'data_pedido') \
.agg({'desconto':'avg'}).select('estado', 'data_pedido', col('avg(desconto)').alias('media_desconto'))

In [64]:
df_consulta3.show(15)

+------+-----------+-----------------+
|estado|data_pedido|   media_desconto|
+------+-----------+-----------------+
|    SC| 2018-05-28|             null|
|    DF| 2019-11-15|             null|
|    SP| 2018-01-18|             null|
|    MA| 2018-07-06|             null|
|    MA| 2020-01-31|             null|
|    MS| 2017-10-01|8.351780505676057|
|    MT| 2017-03-04|7.672988389721389|
|    KY| 2018-08-17|             null|
|    BA| 2018-03-25|3.115455689329357|
|    IN| 2018-03-19|             null|
|    AM| 2019-03-13|             null|
|    UT| 2018-11-30|             null|
|    MT| 2018-04-11|             null|
|    IN| 2019-02-28|             null|
|    DF| 2018-11-02|             null|
+------+-----------+-----------------+
only showing top 15 rows



In [58]:
#Limpeza de dados
from pyspark.sql.functions import to_date

In [60]:
#Ajusta coluna de data
df_join=df_join.withColumn('data_pedido', to_date(df_join.data_pedido, 'yyyy-mm-dd'))

In [61]:
df_join.show(5)

+---------+-----------+-----------------+----------+----------+------------------+-------+------------------+----------+----------+------------+--------------------+---------------+----------+--------------------+--------------------+-----------------+-----------+--------------------+-------+------+-----+
|id_pedido|data_pedido|         desconto|id_produto|quantidade|          subtotal|imposto|             total|id_cliente|id_cliente|        nome|            endereco|data_nascimento|    cidade|       data_cadastro|               email|         latitude|  longitude|               senha|  fonte|estado|  zip|
+---------+-----------+-----------------+----------+----------+------------------+-------+------------------+----------+----------+------------+--------------------+---------------+----------+--------------------+--------------------+-----------------+-----------+--------------------+-------+------+-----+
|        1| 2019-02-11|             null|        14|         2|37.6481453890783

In [104]:
#Carregando consulta 3
df_consulta3.show(5)

+------+-----------+--------------+
|estado|data_pedido|media_desconto|
+------+-----------+--------------+
|    SC| 2018-05-28|          null|
|    DF| 2019-11-15|          null|
|    SP| 2018-01-18|          null|
|    MA| 2018-07-06|          null|
|    MA| 2020-01-31|          null|
+------+-----------+--------------+
only showing top 5 rows



In [105]:
#Coletando todas as linhas do dataframe
rows3 = df_consulta3.collect()

In [106]:
#Inserindo registro
for i in rows3:
    
    if i.media_desconto==None:  
        session.execute(
            """
            INSERT INTO TB_CONSULTA3(ID, ESTADO, DATA_PEDIDO, MEDIA_DESCONTO)
            VALUES(%s, %s, %s, %s)
            """,
            (uuid.uuid1(), i.estado, i.data_pedido, 0))
    else:
        
        session.execute(
            """
            INSERT INTO TB_CONSULTA3(ID, ESTADO, DATA_PEDIDO, MEDIA_DESCONTO)
            VALUES(%s, %s, %s, %s)
            """,
            (uuid.uuid1(), i.estado, i.data_pedido, i.media_desconto))

In [107]:
#Consultando a tabela
linhas = session.execute('select * from TB_CONSULTA3 limit 25;')
for linha in linhas:
    print(linha)

Row(id=UUID('95936261-7bff-11ec-9cd4-080027e8eeed'), data_pedido=Date(17177), estado='AP', media_desconto=0.0)
Row(id=UUID('961dfbd0-7bff-11ec-8b2f-080027e8eeed'), data_pedido=Date(17462), estado='NC', media_desconto=8.324442762605162)
Row(id=UUID('956885ca-7bff-11ec-8d4d-080027e8eeed'), data_pedido=Date(17505), estado='RR', media_desconto=0.0)
Row(id=UUID('96ff8841-7bff-11ec-870e-080027e8eeed'), data_pedido=Date(17591), estado='MT', media_desconto=0.0)
Row(id=UUID('94961112-7bff-11ec-968e-080027e8eeed'), data_pedido=Date(17466), estado='PB', media_desconto=0.0)
Row(id=UUID('8f85ce43-7bff-11ec-98f5-080027e8eeed'), data_pedido=Date(18241), estado='PB', media_desconto=0.0)
Row(id=UUID('954b4870-7bff-11ec-816c-080027e8eeed'), data_pedido=Date(18366), estado='RN', media_desconto=0.0)
Row(id=UUID('980a2334-7bff-11ec-8bd3-080027e8eeed'), data_pedido=Date(17347), estado='RR', media_desconto=0.0)
Row(id=UUID('9565e494-7bff-11ec-924b-080027e8eeed'), data_pedido=Date(17724), estado='PA', media_d

In [None]:
#Fim