In [None]:
import sklearn
import datetime
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib as m
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Plot formatting
plt.style.use('fivethirtyeight')
# Set the default figure size through rcParams. Calling plt.figure() here would only
# create (and display) an empty figure; it would not size the plots drawn in later cells.
plt.rcParams['figure.figsize'] = (15, 6)
%matplotlib inline

In [None]:
# Load the order data from the CSV file and preview the first rows
df_food_delivery = pd.read_csv("dados/dataset.csv", encoding = 'utf-8')
df_food_delivery.head()

In [None]:
# Count the missing values in each column
df_food_delivery.isna().sum()

In [None]:
# Data type of each column
df_food_delivery.dtypes

In [None]:
# Cast the 'localidade' column to object dtype.
# NOTE(review): presumably because it is an identifier rather than a quantity - confirm.
# It is still part of the frame that is standardized and clustered further down.
df_food_delivery['localidade'] = df_food_delivery['localidade'].astype('object')

In [None]:
# Check the column types again after the cast
df_food_delivery.dtypes

In [None]:
# Number of non-null values per column in the raw data
df_food_delivery.count()

In [None]:
# Pivot the data: one row per transaction id, one column per item name,
# with the item quantity as the cell value.
# NOTE(review): the aggregation is pivot_table's default ('mean'), only spelled out here;
# confirm that 'sum' is not what is wanted if an item can repeat within a transaction.
df_pivot = df_food_delivery.pivot_table(
    values='quantidade_item',
    index=['id_transacao'],
    columns=['nome_item'],
    aggfunc='mean',
)

In [None]:
# Fill the nulls created by the pivot with 0 and turn the transaction id back into a
# regular column, then merge in each transaction's 'localidade' and 'horario_pedido'.
df_pivot = df_pivot.fillna(0).reset_index()
# Join explicitly on the transaction id instead of relying on whichever column names
# happen to be shared. The right-hand frame may hold several rows per transaction,
# in which case the merged rows repeat.
df_pivot = df_pivot.merge(
    df_food_delivery[['id_transacao', 'localidade', 'horario_pedido']],
    on='id_transacao',
    how='inner',
)

In [None]:
# Non-null value count per column after the merge
df_pivot.count()

In [None]:
# Remove rows that are exact duplicates of another row
df_pivot = df_pivot.drop_duplicates()

In [None]:
# Non-null value count per column after removing duplicates
df_pivot.count()

In [None]:
# Parse the order time into datetimes so the weekend flag can be derived from it
df_pivot['horario_pedido'] = pd.to_datetime(df_pivot['horario_pedido'])

# Helper that tells whether a date falls on a weekend
def e_fim_de_semana(data):
    """Return True when ``data`` is a Saturday or a Sunday, False otherwise.

    ``data`` must provide a ``weekday()`` method that numbers the days from
    Monday = 0 to Sunday = 6 (as ``datetime.date`` does).
    """
    dia_da_semana = data.weekday()
    # 5 is Saturday and 6 is Sunday
    return dia_da_semana == 5 or dia_da_semana == 6

# Create the 'fim_de_semana' column: True for orders placed on a Saturday (5) or Sunday (6).
# The vectorized .dt accessor replaces a Python-level call for every row.
df_pivot['fim_de_semana'] = df_pivot['horario_pedido'].dt.weekday.isin([5, 6])

In [None]:
# Non-null value count per column, now including the weekend flag
df_pivot.count()

In [None]:
# Remove the columns that will not be used by the model.
# The transaction id is dropped by name rather than by slicing positions 1..7, so the
# result does not silently depend on how many columns df_pivot has or on their order.
df_limpo = df_pivot.drop(columns=['id_transacao'])

In [None]:
# The raw order time is not a model feature; only the weekend flag derived from it is kept
df_final = df_limpo.drop(columns='horario_pedido')

In [None]:
# Store the weekend flag as an integer in a new 'fds' column
df_final['fds'] = df_final['fim_de_semana'].astype(int)

In [None]:
# Drop the original flag column now that 'fds' carries the same information as an integer
df_final = df_final.drop(columns='fim_de_semana')

In [None]:
# Preview the feature table
df_final.head()


In [None]:
# Non-null value count per feature
df_final.count()

In [None]:
# One histogram (with a KDE curve) per feature, stacked vertically.
# The number of subplot rows follows the number of columns instead of being fixed at 6,
# so the cell keeps working if a feature is added or removed.
colunas = df_final.columns
plt.figure(figsize=(7,7))
for posicao, coluna in enumerate(colunas, start=1):
    plt.subplot(len(colunas), 1, posicao)
    sns.histplot(df_final[coluna], kde=True)
plt.tight_layout();

In [None]:
# Correlation matrix of the features, drawn as an annotated heatmap
correlacoes = df_final.corr()
f, ax = plt.subplots(figsize=(7,7))
# Draw on the axes created above instead of relying on the implicit current axes
sns.heatmap(data=correlacoes, annot=True, ax=ax)

In [None]:
# Standardization: fit the scaler on the feature table and transform it
scaler = StandardScaler()
df_final_scaled = scaler.fit_transform(df_final)
df_final_scaled

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record each model's inertia
# (within-cluster sum of squares).
# random_state makes the curve reproducible between runs; n_init is passed explicitly
# because its default is not the same in every scikit-learn release.
wcss = []
range_values = range(1, 10)
for i in range_values:
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(df_final_scaled)
    wcss.append(kmeans.inertia_)


In [None]:
# Elbow curve: WCSS against the number of clusters.
# wcss[0] belongs to k = 1, so the x values are given explicitly; plotting wcss on its
# own would place the points at 0..8 instead of at the cluster counts 1..9.
plt.plot(range(1, len(wcss) + 1), wcss, 'bx-')
plt.xlabel('Clusters')
plt.ylabel('wcss');

In [None]:
# First model: 4 clusters on the standardized features.
# random_state keeps the cluster assignment (and label numbering) stable between runs;
# n_init is passed explicitly because its default is not the same in every
# scikit-learn release.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(df_final_scaled)
labels = kmeans.labels_

In [None]:
# Show the cluster label assigned to each row and how many labels there are
labels, len(labels)

In [None]:
# Number of rows assigned to each cluster
np.unique(labels, return_counts=True)

In [None]:
# Cluster centres of the first model, labelled with the feature names.
# The model was fitted on the standardized data, so these values are in standardized units.
cluster_centers = pd.DataFrame(data= kmeans.cluster_centers_, columns=df_final.columns)
cluster_centers

In [None]:
# Inspect the type of cluster_centers.
# NOTE(review): looks like a leftover development check - confirm it is still needed.
type(cluster_centers)

In [None]:
# Convert the cluster centres back to the original feature units.
# The centres are read from the fitted model rather than from the `cluster_centers`
# variable this cell overwrites, so re-running the cell does not apply the inverse
# transform a second time to values that are already unscaled.
cluster_centers = pd.DataFrame(
    data=scaler.inverse_transform(kmeans.cluster_centers_),
    columns=df_final.columns,
)
cluster_centers

In [None]:
# End of test 1

In [None]:
# Second test: same feature table without the 'localidade' and 'fds' columns.
# drop() returns a new DataFrame, so df_final itself is left unchanged.
# The former bare `df_final2.reset_index()` call was removed: its result was discarded,
# so it had no effect on df_final2.
df_final2 = df_final.drop(columns=['localidade', 'fds'])
df_final2

In [None]:
# Summary statistics of the features used in the second test
df_final2.describe()

In [None]:
# Standardize the reduced feature table with its own scaler
scaler2 = StandardScaler()
df_final_scaled2 = scaler2.fit_transform(df_final2)
df_final_scaled2

In [None]:
# Second model: 3 clusters on the standardized reduced feature table.
# random_state keeps the cluster assignment (and label numbering) stable between runs;
# n_init is passed explicitly because its default is not the same in every
# scikit-learn release.
kmeans2 = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans2.fit(df_final_scaled2)
labels2 = kmeans2.labels_

In [None]:
# Show the cluster label assigned to each row by the second model and how many there are
labels2, len(labels2)

In [None]:
# Cluster centres of the second model, labelled with the feature names.
# The model was fitted on the standardized data, so these values are in standardized units.
cluster_centers2 = pd.DataFrame(data= kmeans2.cluster_centers_, columns=df_final2.columns)
cluster_centers2

In [None]:
# Second model's cluster centres expressed in the original feature units
cluster_centers3 = pd.DataFrame(
    data=scaler2.inverse_transform(cluster_centers2),
    columns=df_final2.columns,
)
cluster_centers3



In [None]:
# Attach the second model's cluster label to each row.
# NOTE(review): this adds a column to df_final2 in place, and df_final2 is the frame
# scaler2 is fitted on in an earlier cell; re-running that cell after this one would
# feed 'cluster' in as a feature.
df_final2['cluster'] = labels2


In [None]:
# Preview the table with the new 'cluster' column
df_final2.head()

In [None]:
# Second name for the same DataFrame object (no copy is made); used as the box plot data
box = df_final2

In [None]:
# Distribution of the 'bebida' values within each cluster
fig = plt.figure(figsize=(8, 5))
sns.boxplot(data=box, x='cluster', y='bebida');

In [None]:
# Distribution of the 'pizza' values within each cluster
fig = plt.figure(figsize=(8, 5))
sns.boxplot(data=box, x='cluster', y='pizza');

In [None]:
# Distribution of the 'sobremesa' values within each cluster
fig = plt.figure(figsize=(8, 5))
sns.boxplot(data=box, x='cluster', y='sobremesa');

In [None]:
# Distribution of the 'salada' values within each cluster
fig = plt.figure(figsize=(8, 5))
sns.boxplot(data=box, x='cluster', y='salada');

In [None]:
# Another name for the same DataFrame object (no copy is made), displayed as the cell output
arquivo = df_final2
arquivo

In [None]:
# Number of rows assigned to each cluster by the second model
np.unique(labels2, return_counts=True)