In [0]:
%sql
-- Preview the source service-history table before deriving the working copy.

SELECT * FROM workspace.sc_gold.historico_de_servicos


In [0]:
%sql
-- Build the working copy that holds only the three columns used by the analysis.
-- Names are fully qualified with the `workspace` catalog so this cell writes the
-- same table that the other cells read, whatever the session's current catalog is.
-- CREATE OR REPLACE swaps the table in one statement, so there is no moment where
-- the table is missing (as there is between a DROP and a CREATE).
CREATE OR REPLACE TABLE workspace.sc_gold.historico_de_servicos_2 AS
SELECT viatura, descricao_servico_pos_venda, tipo_de_servico
FROM workspace.sc_gold.historico_de_servicos;

In [0]:
%sql
-- Preview the three-column working copy.

SELECT * FROM workspace.sc_gold.historico_de_servicos_2

In [0]:
%sql
-- Overwrite the working table with itself, keeping only the rows that have a
-- service description (descricao_servico_pos_venda IS NOT NULL).
CREATE OR REPLACE TABLE workspace.sc_gold.historico_de_servicos_2 AS
SELECT *
FROM workspace.sc_gold.historico_de_servicos_2
WHERE descricao_servico_pos_venda IS NOT NULL


In [0]:
%sql
-- Overwrite the working table with itself again, this time keeping only the
-- rows that reference a vehicle (viatura IS NOT NULL).
CREATE OR REPLACE TABLE workspace.sc_gold.historico_de_servicos_2 AS
SELECT *
FROM workspace.sc_gold.historico_de_servicos_2
WHERE viatura IS NOT NULL

In [0]:
# Load the filtered working table as a Spark DataFrame.
df = spark.table("workspace.sc_gold.historico_de_servicos_2")


In [0]:
from pyspark.sql import functions as F

table_name = "workspace.sc_gold.historico_de_servicos_2"

# Load the table
df = spark.table(table_name)

# Percentage of NULLs per column, computed in a single aggregation pass.
# Averaging a 0/1 "is null" flag gives the null fraction directly, so no
# separate row count is needed; on an empty table avg() yields NULL instead
# of dividing by a zero row count.
null_percentages = df.select([
    (F.avg(F.col(c).isNull().cast("int")) * 100).alias(c)
    for c in df.columns
])

display(null_percentages)

In [0]:
%sql
-- List every distinct service description together with the total number of
-- distinct descriptions. This is a single statement on purpose: when a SQL cell
-- holds several statements only the last result is displayed, so the separate
-- COUNT query was never shown.
-- COUNT(col) OVER () ignores NULLs, matching COUNT(DISTINCT col).
SELECT
  descricao_servico_pos_venda,
  COUNT(descricao_servico_pos_venda) OVER () AS distinct_count
FROM (
  SELECT DISTINCT descricao_servico_pos_venda
  FROM workspace.sc_gold.historico_de_servicos_2
) AS distinct_descriptions
ORDER BY descricao_servico_pos_venda;


In [0]:
%sql
-- Preview the vehicles table.
-- NOTE(review): the catalog is not specified here, so this resolves against
-- the session's current catalog — presumably `workspace`; confirm.
SELECT * FROM sc_gold.viaturas_2

In [0]:
%sql
-- Attach the service history to every vehicle and persist the result.
-- LEFT JOIN keeps vehicles that have no history row; for those, every column
-- coming from `h` is NULL.
-- All table names are qualified with the `workspace` catalog so the cell does
-- not depend on the session's current catalog.
CREATE OR REPLACE TABLE workspace.sc_gold.join_viatura_historico AS
SELECT
  v.*,
  h.*
FROM workspace.sc_gold.viaturas_2 AS v
LEFT JOIN workspace.sc_gold.historico_de_servicos_2 AS h
  ON v.id = h.viatura;


In [0]:
%sql
-- Preview the joined vehicles + service-history table.
SELECT * FROM workspace.sc_gold.join_viatura_historico

In [0]:
from pyspark.sql import functions as F

# Load the new joined table
table_name = "workspace.sc_gold.join_viatura_historico"
df = spark.table(table_name)

# Percentage of NULLs per column, computed in a single aggregation pass.
# Averaging a 0/1 "is null" flag gives the null fraction directly, so no
# separate row count is needed; on an empty table avg() yields NULL instead
# of dividing by a zero row count.
null_percentages = df.select([
    (F.avg(F.col(c).isNull().cast("int")) * 100).alias(c)
    for c in df.columns
])

display(null_percentages)


In [0]:
%sql
-- Overwrite the joined table with itself, keeping only rows where `viatura`
-- is set.
-- NOTE(review): `viatura` presumably comes from the history side of the LEFT
-- JOIN, so this drops vehicles that had no matching history row — confirm.
CREATE OR REPLACE TABLE workspace.sc_gold.join_viatura_historico AS
SELECT *
FROM workspace.sc_gold.join_viatura_historico
WHERE viatura IS NOT NULL;


In [0]:
# Load the joined table (already filtered on `viatura`) into a Spark DataFrame
df = spark.table("workspace.sc_gold.join_viatura_historico")

# Preview the data
display(df)


In [0]:
# Keep only the columns needed for the basket analysis and drop rows where any
# of them is missing. The result gets its own name (`df_clean`) instead of
# overwriting `df`, so the cell can be re-run and `df` still holds the full
# joined table.
df_clean = (
    df.select("viatura", "modelo", "tipo_de_servico")
      .dropna(subset=["viatura", "modelo", "tipo_de_servico"])
)

# Convert to pandas for the Apriori analysis (mlxtend works on pandas).
# Previously this line read `df_clean` before anything had defined it, which
# raised NameError on a fresh run.
pdf = df_clean.toPandas()


In [0]:
# STEP 3: Build the "basket" — one row per modelo, holding the list of every
# service type recorded for that modelo (the grouping key is `modelo`, not the
# individual vehicle).
basket = pdf.groupby(["modelo"])["tipo_de_servico"].apply(list).reset_index()

basket.head()


In [0]:
%pip install mlxtend
# NOTE(review): the version is not pinned, and this install runs after data has
# already been loaded into Python variables. Installing with %pip may reset the
# notebook's Python state on Databricks — TODO confirm, and consider moving the
# install to the top of the notebook with a pinned version.


In [0]:
# STEP 4: Turn the per-modelo lists of service types into a boolean matrix
# (one column per service type, one row per modelo).
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

transactions = basket["tipo_de_servico"]

te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

basket_encoded = pd.DataFrame(te_ary, columns=te.columns_)

basket_encoded.head()


In [0]:
from mlxtend.frequent_patterns import apriori, association_rules

# Find frequent service combinations across the per-modelo baskets
frequent_itemsets = apriori(basket_encoded, min_support=0.05, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Convert the frozensets to plain strings so display() can render them.
# Items are sorted before joining: a frozenset has no stable iteration order,
# so an unsorted join can label the same itemset "A, B" in one run and "B, A"
# in another, which breaks any later grouping or pivot on these columns.
for side in ("antecedents", "consequents"):
    rules[side] = rules[side].apply(lambda items: ", ".join(sorted(items)))

# Strongest associations first
rules = rules.sort_values(by="lift", ascending=False)
display(rules)


In [0]:
# Install libs (run once per cluster attach)
# NOTE(review): mlxtend is already installed and imported by earlier cells, and
# the version is not pinned — confirm whether this second install is needed.
%pip install mlxtend

# Imports used by the Apriori cells that follow.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [0]:
# Read the joined table from the metastore
df = spark.table("workspace.sc_gold.join_viatura_historico")

# Project to the three analysis columns and discard rows missing any of them
analysis_columns = ["viatura", "modelo", "tipo_de_servico"]
df_clean = df.select(*analysis_columns).dropna(subset=analysis_columns)

# mlxtend operates on pandas objects, so collect the cleaned rows to the driver
pdf = df_clean.toPandas()

pdf.head()


In [0]:
# One row per modelo; the `items` column holds the list of service types
# recorded for that modelo.
services_per_model = pdf.groupby(["modelo"])["tipo_de_servico"].apply(list)
basket_services_by_model = services_per_model.reset_index(name="items")

basket_services_by_model.head()


In [0]:
# One-hot encode the per-modelo baskets
model_transactions = basket_services_by_model["items"]
te = TransactionEncoder()
te_ary = te.fit(model_transactions).transform(model_transactions)
basket_encoded_A = pd.DataFrame(te_ary, columns=te.columns_)

# Tunable thresholds
MIN_SUPPORT = 0.05     # try 0.02 if you need more rules
MIN_LIFT    = 1.0

# Mine frequent itemsets, derive rules, strongest lift first
fi_A = apriori(basket_encoded_A, min_support=MIN_SUPPORT, use_colnames=True)
rules_A = association_rules(fi_A, metric="lift", min_threshold=MIN_LIFT)
rules_A = rules_A.sort_values("lift", ascending=False)

# Render each frozenset as a sorted, comma-separated string so the frame can
# be displayed
for side in ("antecedents", "consequents"):
    rules_A[side] = rules_A[side].map(lambda items: ", ".join(sorted(items)))

display(rules_A)


In [0]:
# One basket per viatura, collecting every modelo value and every service type
# seen on that vehicle's rows. Repeated values are kept as they are.
grouped_by_vehicle = pdf.groupby("viatura")[["modelo", "tipo_de_servico"]]
basket_by_viatura = grouped_by_vehicle.agg(list).reset_index()

# A basket's items are its modelo value(s) followed by its service types,
# all converted to strings.
basket_by_viatura["items"] = [
    [str(modelo) for modelo in modelos] + [str(servico) for servico in servicos]
    for modelos, servicos in zip(
        basket_by_viatura["modelo"], basket_by_viatura["tipo_de_servico"]
    )
]

basket_by_viatura[["viatura", "items"]].head()


In [0]:
# One-hot encode the per-vehicle baskets (models + service types)
te2 = TransactionEncoder()
te2_ary = te2.fit(basket_by_viatura["items"]).transform(basket_by_viatura["items"])
basket_encoded_B = pd.DataFrame(te2_ary, columns=te2.columns_)

# Parameters (you can use different ones here if you want)
MIN_SUPPORT_B = 0.02   # models + services → typically need a slightly lower support
MIN_LIFT_B    = 1.0

# Frequent itemsets and rules
fi_B = apriori(basket_encoded_B, min_support=MIN_SUPPORT_B, use_colnames=True)
rules_B = association_rules(fi_B, metric="lift", min_threshold=MIN_LIFT_B)

# Convert sets to sorted, comma-separated strings for display
rules_B["antecedents"] = rules_B["antecedents"].apply(lambda s: ", ".join(sorted(s)))
rules_B["consequents"] = rules_B["consequents"].apply(lambda s: ", ".join(sorted(s)))

# Keep only rules that mention a model on at least one side.
# Adjust the model list to your naming (matching is case-insensitive).
# The group is non-capturing, (?:...): with a capturing group, str.contains()
# warns that the pattern "has match groups" even though only a yes/no match
# is wanted here.
model_pattern = r"(?:i10|i20|i30|tucson|accent|elantra|sonata|creta|venue|kona|atos|avante|ix35)"
is_model_rule = (
    rules_B["antecedents"].str.contains(model_pattern, case=False) |
    rules_B["consequents"].str.contains(model_pattern, case=False)
)
rules_B = rules_B[is_model_rule]

# Sort strongest first
rules_B = rules_B.sort_values("lift", ascending=False)

display(rules_B)


In [0]:
import matplotlib.pyplot as plt

# Model names to look for in the rule text (matching is case-insensitive)
MODEL_NAMES = "TUCSON|I20|I30|CRETA|IX35"

# Keep rules whose antecedent mentions a model and whose consequent does not
# (i.e. model → service). The source is `rules_B`, the only rule set built from
# baskets that contain model names; `rules` was mined from service types only,
# so filtering it for model names could never match anything.
antecedent_has_model = rules_B["antecedents"].str.contains(MODEL_NAMES, case=False)
consequent_has_model = rules_B["consequents"].str.contains(MODEL_NAMES, case=False)
rules_filtered = rules_B[antecedent_has_model & ~consequent_has_model]

# Take top 10 by lift
top_rules = rules_filtered.nlargest(10, "lift")

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_rules["antecedents"] + " → " + top_rules["consequents"], top_rules["lift"])
ax.set_xlabel("Lift (strength of relationship)")
ax.set_title("Top 10 Model → Service Associations")
ax.invert_yaxis()  # strongest rule at the top
plt.show()


In [0]:
# Scatter of support against confidence for the filtered model → service rules
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(rules_filtered["support"], rules_filtered["confidence"], alpha=0.7)
ax.set_xlabel("Support (frequency)")
ax.set_ylabel("Confidence (reliability)")
ax.set_title("Model → Service Rule Strength")
ax.grid(True)
plt.show()


In [0]:
import seaborn as sns

# Heatmap of lift: one row per antecedent, one column per consequent.
# seaborn cannot draw a heatmap from an empty table, so when no rule survived
# the filter the cell reports that instead of raising.
if rules_filtered.empty:
    print("No model → service rules to plot; skipping the heatmap.")
else:
    # Create a pivot table of lift
    pivot = rules_filtered.pivot_table(index='antecedents', columns='consequents', values='lift')

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
    ax.set_title("Lift by Model and Service Type")
    ax.set_ylabel("Model")
    ax.set_xlabel("Service Type")
    plt.show()


In [0]:
# Read the joined table
df = spark.table("workspace.sc_gold.join_viatura_historico")

# Project to the analysis columns, drop incomplete rows, collect to pandas
wanted_columns = ["viatura", "modelo", "tipo_de_servico"]
df_clean = df.select(*wanted_columns).dropna(subset=wanted_columns)
pdf = df_clean.toPandas()

# One basket per viatura: lists of the modelo and service-type values on its rows
baskets = (
    pdf.groupby("viatura")[["modelo", "tipo_de_servico"]]
       .agg(list)
       .reset_index()
)

# Items for Apriori are the de-duplicated service types only
baskets["services"] = baskets["tipo_de_servico"].map(lambda servicos: list(set(servicos)))

# The model stays as metadata: take the first modelo value of each vehicle
baskets["modelo"] = baskets["modelo"].map(lambda modelos: modelos[0] if modelos else None)

baskets.head()


In [0]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Tunable thresholds for the per-model analysis
MODEL_MIN_VEHICLES = 5     # models with fewer vehicles than this are skipped
MODEL_MIN_SUPPORT = 0.05
MODEL_MIN_LIFT = 1.0

# Mine service → service rules separately for each model.
# Loop variables use their own names so the earlier `rules`, `frequent_itemsets`
# and `te` objects are not overwritten by the last model processed.
model_rules_list = []

for model_name, subset in baskets.groupby("modelo"):
    if len(subset) < MODEL_MIN_VEHICLES:
        continue  # skip models with too few records

    encoder = TransactionEncoder()
    encoded = encoder.fit(subset["services"]).transform(subset["services"])
    df_encoded = pd.DataFrame(encoded, columns=encoder.columns_)

    # Apriori on services only
    model_itemsets = apriori(df_encoded, min_support=MODEL_MIN_SUPPORT, use_colnames=True)
    if model_itemsets.empty:
        continue  # association_rules() raises on an empty itemset table

    model_rules = association_rules(model_itemsets, metric="lift", min_threshold=MODEL_MIN_LIFT)
    if model_rules.empty:
        continue  # nothing to report for this model

    model_rules["modelo"] = model_name
    model_rules_list.append(model_rules)

if model_rules_list:
    # Combine all models
    model_service_rules = pd.concat(model_rules_list, ignore_index=True)

    # Render itemsets as sorted, comma-separated strings (stable across runs)
    for side in ("antecedents", "consequents"):
        model_service_rules[side] = model_service_rules[side].apply(
            lambda items: ", ".join(sorted(items))
        )

    display(model_service_rules.sort_values("lift", ascending=False))
else:
    # pd.concat() raises on an empty list, so report the situation instead
    model_service_rules = pd.DataFrame()
    print("No model produced any association rule with the current thresholds.")
