# Analysis orders
- Distribuição dos pedidos ao longo do tempo
- Correlação entre preço
- Frete e avaliações
- Análise de geolocalização dos clientes
- Previsão de atraso nas entregas

In [3]:
# Creating Session and import libs
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, count

# Create a Spark Session
spark = SparkSession.builder.appName("Order olist").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/27 09:32:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Analyzing the order over time

In [7]:
# Load the CSV files into PySpark DataFrames
orders = spark.read.csv('data/olist_orders_dataset.csv', header=True, inferSchema=True)

# Convert order_purchase_timestamp to date
orders = orders.withColumn('order_date', to_date('order_purchase_timestamp'))

# Group by order_date and count the number of orders
order_distribution = orders.groupBy('order_date').agg(count('order_id').alias('order_count'))

# Show the result
order_distribution.show()



+----------+-----------+
|order_date|order_count|
+----------+-----------+
|2017-09-11|        180|
|2018-08-10|        256|
|2018-05-28|        143|
|2017-08-11|        141|
|2018-06-06|        227|
|2018-03-17|        180|
|2017-01-06|          4|
|2018-08-08|        316|
|2018-06-26|        243|
|2018-08-11|        188|
|2016-10-03|          8|
|2018-06-30|        124|
|2017-01-27|         62|
|2017-02-26|         46|
|2017-09-28|        143|
|2017-01-24|         40|
|2017-09-29|        121|
|2017-06-29|        114|
|2017-07-31|        148|
|2018-03-23|        221|
+----------+-----------+
only showing top 20 rows



                                                                                

## Correlation between price, shipping and reviews

In [12]:
# Load the CSV files into PySpark DataFrames
order_itens = spark.read.csv('data/olist_order_items_dataset.csv', header=True, inferSchema=True)
reviews = spark.read.csv('data/olist_order_reviews_dataset.csv', header=True, inferSchema=True)

# Join order_items and reviews on order_id
joined_df = order_items.join(reviews, order_items.order_id == reviews.order_id, 'left')

# Select relevant columns
columns = ['price','freight_value','review_score']
correlation_df = joined_df.select(columns)

# calculation correlation between price, freight_value, and review_score
correlation_matrix = correlation_df.toPandas().corr()

# Show the correlation matrix
print(correlation_matrix)

                  price  freight_value  review_score
price          1.000000       0.414349     -0.003941
freight_value  0.414349       1.000000     -0.036179
review_score  -0.003941      -0.036179      1.000000


## Customer geolocation analysis

In [None]:
import pandas as pd
import folium

# Load the CSV files into Pandas DataFrames
customers = pd.read_csv('data/olist_customers_dataset.csv')
geolocation = pd.read.csv('data/olist_geolocation_dataset.csv')


