In [1]:
import random
import re
from pathlib import Path

import pandas as pd
import plotly.express as px
from fuzzywuzzy import process



In [2]:
# Load the raw coffee dataset from a CSV located in the notebook's working directory.
coffee_raw_df = pd.read_csv('./coffee_df.csv')

In [3]:
def printDataFrameInfo(df: pd.DataFrame)->None:
    """
    Print a quick overview of a DataFrame.

    Writes the ``df.info()`` summary followed by the first five rows
    (``df.head()``), each section framed by dashed separator lines.
    """
    separator = "--------------------"
    print(separator)
    df.info()
    # Two separator lines in a row divide the info section from the preview.
    print(separator, separator, sep="\n")
    print(df.head())
    print(separator)


def convert_to_int_if_possible(col):
    """
    Convert a column to integers when every value is a whole number.

    Parameters
    ----------
    col : pd.Series
        Column to inspect.

    Returns
    -------
    pd.Series
        ``col.astype(int)`` when all values are whole-number floats;
        otherwise the original column, unchanged. Columns that are already
        integer-typed are returned as-is, so calling this twice on the same
        column (e.g. when a notebook cell is re-run) is safe.
    """
    kind = col.dtype.kind

    # Already integers: nothing to do (float.is_integer would raise on ints).
    if kind in "iu":
        return col

    # Missing values cannot be represented in a plain int column.
    if col.isna().any():
        return col

    if kind == "f":
        # Vectorised whole-number check; inf % 1 is NaN, so inf fails the test.
        all_whole = ((col % 1) == 0).all()
    else:
        # Non-numeric dtype: check element by element and tolerate non-floats.
        all_whole = col.apply(
            lambda value: isinstance(value, float) and value.is_integer()
        ).all()

    return col.astype(int) if all_whole else col

## 1- Explorando os dados e primeiras impressões 

In [4]:
# First look at the raw data: column dtypes, non-null counts and a sample of rows.
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   slug         2282 non-null   object 
 1   all_text     2282 non-null   object 
 2   rating       2282 non-null   int64  
 3   roaster      2282 non-null   object 
 4   name         2282 non-null   object 
 5   location     2281 non-null   object 
 6   origin       2282 non-null   object 
 7   roast        2229 non-null   object 
 8   est_price    2277 non-null   object 
 9   review_date  2282 non-null   object 
 10  agtron       2282 non-null   object 
 11  aroma        2255 non-null   float64
 12  acid         1947 non-null   float64
 13  body         2279 non-null   float64
 14  flavor       2279 non-null   float64
 15  aftertaste   2279 non-null   float64
 16  with_milk    356 non-null    float64
 17  desc_1       2282 non-null   object 
 18  desc_2       2282 non-null 

- ```all_text``` é uma coluna que aparenta ter todas as informações em modo texto, no caso, não é relevante para este propósito
- ```aroma```, ```acid```, ```body```, ```aftertaste```, ```flavor``` - são colunas de avaliação 0-5 de propriedades do café
- ```with_milk``` é uma avaliação que apresenta muitos dados nulos, e também não achei interessante incluir na comparação, visto que não tomo café com leite :)
- ```agtron``` é uma coluna meramente informativa, a respeito da coloração do café torrado - pode ser útil para compor uma página
- ```location``` aparenta ser uma coluna de onde o café foi torrado e embalado. As informações estão bem bagunçadas, misturando cidades, estados, províncias e países
- ```origin``` aparenta ser uma coluna de onde o café foi produzido. As informações estão bem bagunçadas, misturando cidades, estados, províncias e países
- ```est_price``` é um dado interessante para incluir, porém preciso normalizar, visto que possuem moedas diferentes e referências de peso diferentes.
- ```roast``` aparenta ter dados bem normalizados, poderei usar como categorização
- ```desc_x``` aparenta ser uma descrição do café, em modo texto, pode ser útil para compor uma página
- ```rating``` uma classificação de 0 a 100, com números inteiros
- ```slug``` contém o link da review de onde os dados foram retirados, útil para redirecionar para o site original
- ```review_date``` é a data em que a review foi publicada, não vejo necessidade de trabalhar com essa informação, que não seja apenas exibi-la
- ```name``` diz respeito ao nome do café
- ```roaster``` diz respeito ao nome da empresa que fez a torrefação


## 2- Limpeza e tratamento dos dados
### 2.1 - Plano de ação

- Dropar colunas: ```all_text```, ```with_milk```
- Dropar nulos
- Verificar se existem valores com vírgula, ou apenas inteiros: ```aroma```, ```acid```, ```body```, ```aftertaste```, ```flavor```
- Verificar únicos em ```roast``` e usar como categoria
- Normalizar: ```location```, ```origin```, ```est_price```

In [5]:
# Drop the columns that are not needed for this analysis.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
coffee_raw_df = coffee_raw_df.drop(columns=['all_text', 'with_milk'], errors='ignore')
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   slug         2282 non-null   object 
 1   rating       2282 non-null   int64  
 2   roaster      2282 non-null   object 
 3   name         2282 non-null   object 
 4   location     2281 non-null   object 
 5   origin       2282 non-null   object 
 6   roast        2229 non-null   object 
 7   est_price    2277 non-null   object 
 8   review_date  2282 non-null   object 
 9   agtron       2282 non-null   object 
 10  aroma        2255 non-null   float64
 11  acid         1947 non-null   float64
 12  body         2279 non-null   float64
 13  flavor       2279 non-null   float64
 14  aftertaste   2279 non-null   float64
 15  desc_1       2282 non-null   object 
 16  desc_2       2282 non-null   object 
 17  desc_3       2280 non-null   object 
dtypes: float64(5), int64(1), ob

In [6]:
# Drop every row that has a null in any of the remaining columns.
# NOTE(review): per the info output this keeps 1908 of 2282 rows; most of the
# loss comes from `acid`, which has only 1947 non-null values.
coffee_raw_df.dropna(inplace=True)
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1908 entries, 2 to 2280
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   slug         1908 non-null   object 
 1   rating       1908 non-null   int64  
 2   roaster      1908 non-null   object 
 3   name         1908 non-null   object 
 4   location     1908 non-null   object 
 5   origin       1908 non-null   object 
 6   roast        1908 non-null   object 
 7   est_price    1908 non-null   object 
 8   review_date  1908 non-null   object 
 9   agtron       1908 non-null   object 
 10  aroma        1908 non-null   float64
 11  acid         1908 non-null   float64
 12  body         1908 non-null   float64
 13  flavor       1908 non-null   float64
 14  aftertaste   1908 non-null   float64
 15  desc_1       1908 non-null   object 
 16  desc_2       1908 non-null   object 
 17  desc_3       1908 non-null   object 
dtypes: float64(5), int64(1), object(

In [7]:
# Convert the score columns to integers when every value is a whole number
columns_to_check = ['aroma', 'acid', 'body', 'aftertaste', 'flavor']
for col in columns_to_check:
    # Columns containing any fractional value are left as they are.
    coffee_raw_df[col] = convert_to_int_if_possible(coffee_raw_df[col])
    
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1908 entries, 2 to 2280
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   slug         1908 non-null   object
 1   rating       1908 non-null   int64 
 2   roaster      1908 non-null   object
 3   name         1908 non-null   object
 4   location     1908 non-null   object
 5   origin       1908 non-null   object
 6   roast        1908 non-null   object
 7   est_price    1908 non-null   object
 8   review_date  1908 non-null   object
 9   agtron       1908 non-null   object
 10  aroma        1908 non-null   int64 
 11  acid         1908 non-null   int64 
 12  body         1908 non-null   int64 
 13  flavor       1908 non-null   int64 
 14  aftertaste   1908 non-null   int64 
 15  desc_1       1908 non-null   object
 16  desc_2       1908 non-null   object
 17  desc_3       1908 non-null   object
dtypes: int64(6), object(12)
memory usage: 283.2+ KB
----

In [8]:
# Inspect the distinct values of `roast` to confirm it is usable as a category
print(coffee_raw_df['roast'].unique())

['Medium-Light' 'Medium' 'Light' 'Very Dark' 'Medium-Dark' 'Dark']


In [9]:
# Store `roast` as a pandas categorical column.
# NOTE(review): the category is unordered; an ordered dtype may be preferable
# if roast levels are ever compared or sorted — confirm intent.
coffee_raw_df['roast'] = coffee_raw_df['roast'].astype('category')
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1908 entries, 2 to 2280
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   slug         1908 non-null   object  
 1   rating       1908 non-null   int64   
 2   roaster      1908 non-null   object  
 3   name         1908 non-null   object  
 4   location     1908 non-null   object  
 5   origin       1908 non-null   object  
 6   roast        1908 non-null   category
 7   est_price    1908 non-null   object  
 8   review_date  1908 non-null   object  
 9   agtron       1908 non-null   object  
 10  aroma        1908 non-null   int64   
 11  acid         1908 non-null   int64   
 12  body         1908 non-null   int64   
 13  flavor       1908 non-null   int64   
 14  aftertaste   1908 non-null   int64   
 15  desc_1       1908 non-null   object  
 16  desc_2       1908 non-null   object  
 17  desc_3       1908 non-null   object  
dtypes: category(

### Limpando ```location``` e ```origin```

In [10]:
## CLEANING location and origin
# Frequency of each raw value, to see how the two columns are formatted.

print(coffee_raw_df["location"].value_counts())
print('------------------------')
print(coffee_raw_df["origin"].value_counts())

location
Madison, Wisconsin                        137
Chia-Yi, Taiwan                           108
Minneapolis, Minnesota                    107
San Diego, California                     100
Taipei, Taiwan                             85
                                         ... 
Glendora, California                        1
Songshan District, Taipei City, Taiwan      1
Kountze, Texas                              1
Savannah, Georgia                           1
Bigfork, Montana                            1
Name: count, Length: 272, dtype: int64
------------------------
origin
Yirgacheffe growing region, southern Ethiopia                  77
Guji Zone, Oromia Region, southern Ethiopia                    58
Nyeri growing region, south-central Kenya                      53
Boquete growing region, western Panama                         34
Sidamo (also Sidama) growing region, south-central Ethiopia    32
                                                               ..
Kona growing regio

#### Conclusões
- Percebi que ambas estão no seguinte padrão:
    - Se há o delimitador ";" quer dizer que são duas regiões diferentes, como se os dados estivessem em uma lista. Ex: Brazil;Ecuador;Ethiopia
    - Se há o delimitador "," quer dizer que há uma subregião (menor) e uma região (maior). Ex: Cidade, Estado ou Província, País.
    - Há a combinação de ambos os casos. Ex: Brazil; Oromia Region, Ethiopia
- Quero manter apenas os países criando duas colunas novas ```location_country``` e ```origin_country```

In [11]:
def extract_country(location_string:str)->str:
    """
    Extract the broadest region (the last comma-separated part) of each entry.

    The input is treated as a ';'-separated list of places, where each place
    is written from most specific to most general, e.g.
    ``"Brazil; Oromia Region, Ethiopia"`` -> ``"Brazil; Ethiopia"``.

    Empty entries (for instance from a trailing ';' or ',') are skipped so
    they do not show up as blank "countries" in the result.

    Parameters
    ----------
    location_string : str
        Raw location/origin text.

    Returns
    -------
    str
        The extracted parts joined with ``'; '``.
    """
    countries = []

    for region in location_string.split(';'):
        # Keep only the non-empty comma-separated parts of this entry.
        parts = [part.strip() for part in region.split(',')]
        non_empty_parts = [part for part in parts if part]
        if non_empty_parts:
            # The last part is the most general one (the country).
            countries.append(non_empty_parts[-1])

    return '; '.join(countries)

# Derive the country-level columns from the raw free-text columns;
# the original `location` and `origin` columns are kept untouched here.
coffee_raw_df['location_country'] = coffee_raw_df['location'].apply(extract_country)
coffee_raw_df['origin_country'] = coffee_raw_df['origin'].apply(extract_country)

printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1908 entries, 2 to 2280
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   slug              1908 non-null   object  
 1   rating            1908 non-null   int64   
 2   roaster           1908 non-null   object  
 3   name              1908 non-null   object  
 4   location          1908 non-null   object  
 5   origin            1908 non-null   object  
 6   roast             1908 non-null   category
 7   est_price         1908 non-null   object  
 8   review_date       1908 non-null   object  
 9   agtron            1908 non-null   object  
 10  aroma             1908 non-null   int64   
 11  acid              1908 non-null   int64   
 12  body              1908 non-null   int64   
 13  flavor            1908 non-null   int64   
 14  aftertaste        1908 non-null   int64   
 15  desc_1            1908 non-null   object  
 16  desc_2  

### Conclusões
- Percebi que:
    - Muitos países acompanham termos de direção como southern $Country ou south-central $Country
    - Apesar de termos muitos nomes de países, alguns estão escritos incorretamente
    - Muitos "países" são estados, províncias dos EUA, Canadá, México e etc...
- Logo:
    - Vou fazer uma lista de palavras relacionadas à direção e remover das strings
    - Padronizar os nomes dos países, e usar a biblioteca FuzzyWuzzy para encontrar possíveis erros de digitação e corrigi-los
    - Fazer uma lista de estados e províncias dos EUA, México, Canadá e Taiwan para substituir pelo nome do país apenas

In [12]:
# Directional qualifiers that prefix a country name (e.g. "southern Ethiopia").
_DIRECTION_KEYWORDS = (
    'southern',
    'northern',
    'eastern',
    'western',
    'south-central',
    'north-central',
    'west-central',
    'east-central',
    'north-eastern',
    'south-eastern',
    'north-western',
    'south-western',
    'northeastern',
    'southeastern',
    'southwestern',
    'northwest',
    'northeast',
    'southeast',
    'southwest',
    'northwestern',
    'central',
    'south-',
    'north-',
    'west-',
    'east-',
    'far',
)

# One alternation, longest keyword first, so "north-eastern" is removed as a
# whole instead of "eastern" being stripped first and leaving "north-" behind.
# Lookarounds are used instead of \b because \b never matches between a
# trailing hyphen and a following space (which made 'south-' etc. dead entries).
_DIRECTION_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_DIRECTION_KEYWORDS, key=len, reverse=True)
    )
    + r')(?!\w)',
    flags=re.IGNORECASE,
)


def remove_directional_terms(location_string:str)->str:
    """
    Strip quotes, parentheses and directional qualifiers from country names.

    The input is a ';'-separated list of names. For each name, straight and
    curly double quotes, straight single quotes and parentheses are removed,
    then any directional keyword (matched case-insensitively as a whole term)
    is removed and the remaining whitespace is collapsed.

    Parameters
    ----------
    location_string : str
        ';'-separated country names, possibly with qualifiers such as
        "southern" or "north-eastern".

    Returns
    -------
    str
        The cleaned names joined with ``';'``.
    """
    countries_fixed = []

    for country in location_string.split(';'):
        country = re.sub(r'[“”"\'()]', '', country)
        # Replace with a space so neighbouring words never get glued together,
        # then normalise runs of whitespace and trim the ends.
        country = _DIRECTION_PATTERN.sub(' ', country)
        countries_fixed.append(' '.join(country.split()))

    return ';'.join(countries_fixed)

# Clean both country columns in place, then list the distinct values that
# remain so leftover misspellings or non-country names can be spotted.
coffee_raw_df['location_country'] = coffee_raw_df['location_country'].apply(remove_directional_terms)
coffee_raw_df['origin_country'] = coffee_raw_df['origin_country'].apply(remove_directional_terms)
printDataFrameInfo(coffee_raw_df)
print()
print( coffee_raw_df['origin_country'].unique())
print()
print( coffee_raw_df['location_country'].unique())

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1908 entries, 2 to 2280
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   slug              1908 non-null   object  
 1   rating            1908 non-null   int64   
 2   roaster           1908 non-null   object  
 3   name              1908 non-null   object  
 4   location          1908 non-null   object  
 5   origin            1908 non-null   object  
 6   roast             1908 non-null   category
 7   est_price         1908 non-null   object  
 8   review_date       1908 non-null   object  
 9   agtron            1908 non-null   object  
 10  aroma             1908 non-null   int64   
 11  acid              1908 non-null   int64   
 12  body              1908 non-null   int64   
 13  flavor            1908 non-null   int64   
 14  aftertaste        1908 non-null   int64   
 15  desc_1            1908 non-null   object  
 16  desc_2  

In [13]:
# Ajustes Manuais

# Corrections for misspelled or variant country/region names.
# Keys are matched as substrings by normalize_origin_manually.
name_mapping = {
    "Hawai'i": "Hawaii",
    "Hawai’i": "Hawaii",
    "Big Island of Hawai'i": "Hawaii",
    "Big Island of Hawai’i": "Hawaii",
    "DRC Congo": "Democratic Republic of the Congo",
    "Democratic Republic of Congo": "Democratic Republic of the Congo",
    "Columbia": "Colombia",
    "Costa RIca": "Costa Rica",
    "East Malaysia": "Malaysia",
    "west- Colombia": "Colombia",
    "south- Ecuador": "Ecuador",
    "west- Bolivia": "Bolivia",
    "the Democratic Republic of the Congo": "Democratic Republic of the Congo",
    # Straight apostrophes are stripped by remove_directional_terms before this
    # mapping is applied, so "Big Island of Hawai'i" arrives in this form.
    # Kept last so it also catches the result of the "Hawai’i" replacement.
    "Big Island of Hawaii": "Hawaii",
}


# Mapping from state/province (or country) names to a country name.
# Used by normalize_location_manually as an exact-match lookup; values not
# listed here are left unchanged by that function.
# NOTE(review): not every US state is listed (e.g. there is no 'Utah' key) —
# presumably only values seen in the data were added; confirm.
state_to_country_mapping = {
    # US states (and D.C.)
    'Montana': 'United States',
    'Washington': 'United States',
    'Virginia': 'United States',
    'Connecticut': 'United States',
    'California': 'United States',
    'Wisconsin': 'United States',
    'Kansas': 'United States',
    'Minnesota': 'United States',
    'Florida': 'United States',
    'Hawaii': 'United States',
    'Wyoming': 'United States',
    'Oregon': 'United States',
    'Colorado': 'United States',
    'Maine': 'United States',
    'Kentucky': 'United States',
    'New Hampshire': 'United States',
    'New Jersey': 'United States',
    'Massachusetts': 'United States',
    'Illinois': 'United States',
    'Texas': 'United States',
    'Pennsylvania': 'United States',
    'North Carolina': 'United States',
    'New Mexico': 'United States',
    'Idaho': 'United States',
    'Ohio': 'United States',
    'Tennessee': 'United States',
    'Oklahoma': 'United States',
    'New York': 'United States',
    'Vermont': 'United States',
    'Georgia': 'United States',
    'Michigan': 'United States',
    'Louisiana': 'United States',
    'Mississippi': 'United States',
    'Alabama': 'United States',
    'D.C.': 'United States',
    'Maryland': 'United States',
    'Nevada': 'United States',
    'Iowa': 'United States',
    'Missouri': 'United States',
    'Alaska': 'United States',
    'Arizona': 'United States',
    # Countries: identity entries, except 'England' -> 'United Kingdom'
    'Canada': 'Canada',
    'Taiwan': 'Taiwan',
    'China': 'China',
    'Australia': 'Australia',
    'South Korea': 'South Korea',
    'United Arab Emirates': 'United Arab Emirates',
    'Uganda': 'Uganda',
    'Mexico': 'Mexico',
    'Indonesia': 'Indonesia',
    'Japan': 'Japan',
    'Peru': 'Peru',
    'Honduras': 'Honduras',
    'Colombia': 'Colombia',
    'England': 'United Kingdom',
    'Guatemala': 'Guatemala',
    'Kenya': 'Kenya',
    # Misspelled or irregular variants, all mapped to the United States
    'Californiaa': 'United States',
    'Calfornia': 'United States',
    'Washingto': 'United States',
    'MInnesota': 'United States',
    'Los Angeles': 'United States',
    'Branford Connecticut': 'United States'
}

def normalize_origin_manually(location, mapping=None):
    """
    Replace known misspellings/variants inside an origin string.

    Every key of the mapping that occurs as a substring of ``location`` is
    replaced by its value. Keys are applied longest first so that a short key
    (e.g. "Hawai’i") cannot rewrite part of a longer one
    (e.g. "Big Island of Hawai’i") before the longer key gets a chance to match.

    Parameters
    ----------
    location : str
        Origin text, possibly a ';'-separated list of names.
    mapping : dict[str, str], optional
        Substring -> replacement table. Defaults to the module-level
        ``name_mapping``.

    Returns
    -------
    str
        The stripped string with all known variants replaced.
    """
    if mapping is None:
        mapping = name_mapping

    location = location.strip()

    # sorted() is stable, so keys of equal length keep their dict order.
    for key in sorted(mapping, key=len, reverse=True):
        if key in location:
            location = location.replace(key, mapping[key])

    return location

def normalize_location_manually(location:str)->str:
    """
    Map a state/province name to its country via ``state_to_country_mapping``.

    The lookup is an exact match on the whole string; a value with no entry
    in the mapping is returned unchanged.
    """
    if location in state_to_country_mapping:
        return state_to_country_mapping[location]
    return location

# Apply the manual fixes: substring corrections for origins, exact-match
# state -> country lookup for roaster locations. Then list the distinct
# values again to review what is still not normalized.
coffee_raw_df['origin_country'] = coffee_raw_df['origin_country'].apply(normalize_origin_manually)
coffee_raw_df['location_country'] = coffee_raw_df['location_country'].apply(normalize_location_manually)

printDataFrameInfo(coffee_raw_df)
print()
print( coffee_raw_df['origin_country'].unique())
print()
print( coffee_raw_df['location_country'].unique())

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1908 entries, 2 to 2280
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   slug              1908 non-null   object  
 1   rating            1908 non-null   int64   
 2   roaster           1908 non-null   object  
 3   name              1908 non-null   object  
 4   location          1908 non-null   object  
 5   origin            1908 non-null   object  
 6   roast             1908 non-null   category
 7   est_price         1908 non-null   object  
 8   review_date       1908 non-null   object  
 9   agtron            1908 non-null   object  
 10  aroma             1908 non-null   int64   
 11  acid              1908 non-null   int64   
 12  body              1908 non-null   int64   
 13  flavor            1908 non-null   int64   
 14  aftertaste        1908 non-null   int64   
 15  desc_1            1908 non-null   object  
 16  desc_2  

In [14]:

# coffee_raw_df['countries_separated'] = coffee_raw_df['origin_country'].apply(lambda x: x.split(';') if x else [])
# df_uni = coffee_raw_df.explode('countries_separated')

### Normalizando ```est_price```

In [15]:
# Normalizando est_price para USD/100g

def _parse_price_number(text: str) -> float:
    """
    Parse a numeric token that may use ',' as thousands or decimal separator.

    A comma followed by exactly three digits (e.g. "1,200" or "12,500.50") is
    treated as a thousands separator; any other comma (e.g. "18,50") is treated
    as a decimal mark. Raises ValueError when the token is not a number.
    """
    text = text.strip()
    if re.fullmatch(r'\d{1,3}(?:,\d{3})+(?:\.\d+)?', text):
        return float(text.replace(",", ""))
    return float(text.replace(",", "."))


def extract_price_info(price_info:str)->pd.Series:
    """
    Split a price string such as ``"NT $1,200/227 grams"`` into its parts.

    The text before the first '/' holds an optional currency code and the
    price; the text after it holds the weight and its unit. When no currency
    code is present the currency is taken to be "USD".

    Parameters
    ----------
    price_info : str
        Raw price text.

    Returns
    -------
    pd.Series
        ``[currency, price, weight, weight_measurement_unit]``. If the text
        is not a string or cannot be parsed, all four values are ``None`` so
        the row can be removed with ``dropna``.
    """
    unparsed = pd.Series([None, None, None, None])

    if not isinstance(price_info, str):
        return unparsed

    parts = price_info.split("/")
    try:
        weight_tokens = parts[1].split(" ")
        # The unit may carry trailing markers such as ';' or '*'.
        weight_measurement_unit = weight_tokens[1].replace(";","").replace("*","")
        weight = _parse_price_number(weight_tokens[0])

        currency_value = parts[0].split(" ")
        if len(currency_value) == 2:
            currency = currency_value[0].replace("$","")
            price = _parse_price_number(currency_value[1].replace("$",""))
        elif len(currency_value) == 1:
            currency = "USD"
            price = _parse_price_number(currency_value[0].replace("$",""))
        else:
            # Unexpected layout on the price side: treat as unparseable.
            return unparsed
    except (IndexError, ValueError):
        # Missing '/' or unit (IndexError) or non-numeric price/weight (ValueError).
        return unparsed

    return pd.Series([currency, price, weight, weight_measurement_unit])


# Expand est_price into four columns (one per element of the returned Series).
coffee_raw_df[['currency', 'price', 'weight', 'weight_measurement_unit']] = coffee_raw_df['est_price'].apply(extract_price_info)
# Rows whose price could not be parsed contain None values and are dropped here.
coffee_raw_df.dropna(inplace=True)
print(coffee_raw_df['currency'].unique())
print(coffee_raw_df['weight_measurement_unit'].unique())

['USD' 'CAD' 'NT' 'HKD' 'AUD' 'KRW' 'AED' 'NTD' 'IDR' 'US' '¥']
['ounces' 'grams' 'capsules' 'pounds' 'sticks']


In [16]:
# Keep only listings priced by weight: 'sticks' and 'capsules' cannot be
# converted to grams. The explicit .copy() makes the filtered frame
# independent, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
non_weight_units = ['sticks', 'capsules']
coffee_raw_df = coffee_raw_df[~coffee_raw_df['weight_measurement_unit'].isin(non_weight_units)].copy()
print(coffee_raw_df['currency'].unique())
print(coffee_raw_df['weight_measurement_unit'].unique())

['USD' 'CAD' 'NT' 'HKD' 'AUD' 'KRW' 'AED' 'NTD' 'IDR' 'US' '¥']
['ounces' 'grams' 'pounds']


In [17]:
# Multipliers used to convert a price in the given currency to USD
# (price * rate), keyed by the currency labels extracted from est_price.
# 'US' and 'NT' are alternate spellings of 'USD' and 'NTD' found in the data.
# NOTE(review): these are hard-coded constants; their source and date are not
# recorded here — confirm before relying on the converted prices.
# NOTE(review): '¥' is presumably Japanese yen given the 0.0068 rate — confirm.
conversion_rates = {
    'USD': 1,
    'CAD': 0.75,
    'NT': 0.032,
    'HKD': 0.13,
    'AUD': 0.65,
    'KRW': 0.00075,
    'AED': 0.27,
    'NTD': 0.032,
    'IDR': 0.000065,
    'US': 1,
    '¥': 0.0068
}

# Convert a row's weight to grams
def convert_weight_to_grams(row):
    """
    Return the row's ``weight`` expressed in grams.

    'ounces' and 'pounds' are converted with fixed factors; any other unit
    is returned unchanged (i.e. it is taken to be grams already).
    """
    grams_per_unit = {'ounces': 28.3495, 'pounds': 453.592}
    amount = row['weight']
    factor = grams_per_unit.get(row['weight_measurement_unit'])
    if factor is None:
        return amount
    return amount * factor

# Convert a row's price to USD
def convert_price_to_usd(row, rates=None):
    """
    Return the row's ``price`` converted to USD.

    Parameters
    ----------
    row : mapping
        Must provide ``'price'`` and ``'currency'``.
    rates : dict, optional
        Currency label -> USD multiplier. Defaults to the module-level
        ``conversion_rates``.

    Returns
    -------
    float or None
        ``price * rate``, or ``None`` when the currency has no known rate.
        Returning None (instead of silently assuming a rate of 1) lets the
        following ``dropna`` remove rows that cannot be converted, rather
        than mixing unconverted foreign prices into the USD column.
    """
    if rates is None:
        rates = conversion_rates

    conversion_rate = rates.get(row['currency'])
    if conversion_rate is None:
        return None
    return row['price'] * conversion_rate

def normalize_price_per_100g(row):
    """
    Return the row's price in USD per 100 g.

    Reads ``weight_in_grams`` and ``price_in_usd`` from the row. Returns
    ``None`` when the weight is not greater than zero.
    """
    grams = row['weight_in_grams']
    usd = row['price_in_usd']

    # Guard against division by zero / meaningless weights.
    if not grams > 0:
        return None
    return (usd / grams) * 100


# Order matters: price_per_100g reads the two columns computed just before it.
coffee_raw_df['weight_in_grams'] = coffee_raw_df.apply(convert_weight_to_grams, axis=1)
coffee_raw_df['price_in_usd'] = coffee_raw_df.apply(convert_price_to_usd, axis=1)
coffee_raw_df['price_per_100g'] = coffee_raw_df.apply(normalize_price_per_100g, axis=1)
# Removes rows for which normalize_price_per_100g returned None (weight <= 0).
coffee_raw_df.dropna(inplace=True)
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1864 entries, 2 to 2280
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   slug                     1864 non-null   object  
 1   rating                   1864 non-null   int64   
 2   roaster                  1864 non-null   object  
 3   name                     1864 non-null   object  
 4   location                 1864 non-null   object  
 5   origin                   1864 non-null   object  
 6   roast                    1864 non-null   category
 7   est_price                1864 non-null   object  
 8   review_date              1864 non-null   object  
 9   agtron                   1864 non-null   object  
 10  aroma                    1864 non-null   int64   
 11  acid                     1864 non-null   int64   
 12  body                     1864 non-null   int64   
 13  flavor                   1864 non-null   int64 

In [18]:
# Drop the raw and intermediate columns now that the normalized ones exist.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
coffee_raw_df = coffee_raw_df.drop(
    columns=['location', 'origin', 'price_in_usd', 'weight_in_grams', 'weight_measurement_unit', 'weight', 'price', 'currency', 'est_price'],
    errors='ignore',
)
printDataFrameInfo(coffee_raw_df)

--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1864 entries, 2 to 2280
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   slug              1864 non-null   object  
 1   rating            1864 non-null   int64   
 2   roaster           1864 non-null   object  
 3   name              1864 non-null   object  
 4   roast             1864 non-null   category
 5   review_date       1864 non-null   object  
 6   agtron            1864 non-null   object  
 7   aroma             1864 non-null   int64   
 8   acid              1864 non-null   int64   
 9   body              1864 non-null   int64   
 10  flavor            1864 non-null   int64   
 11  aftertaste        1864 non-null   int64   
 12  desc_1            1864 non-null   object  
 13  desc_2            1864 non-null   object  
 14  desc_3            1864 non-null   object  
 15  location_country  1864 non-null   object  
 16  origin_c

In [19]:
# Write the cleaned dataset. The target directory is created first because
# to_csv fails when the parent directory does not exist.
clean_csv_path = Path('data/clean_coffee_data.csv')
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
coffee_raw_df.to_csv(clean_csv_path, index=False)

## Testando Algumas Visualizações

In [21]:
# Radar charts of the sensory scores for two randomly chosen coffees.
radar_columns = ['aroma', 'acid', 'body', 'flavor', 'aftertaste']
radar_scores = coffee_raw_df[radar_columns]

# Pick two random row POSITIONS. After the dropna/filter steps the index
# labels are no longer contiguous, so positions must be used with .iloc;
# using them as labels with .loc raises KeyError for dropped labels and can
# never reach labels >= len(coffee_raw_df).
random_index_1 = random.randint(0, len(coffee_raw_df) - 1)
random_index_2 = random.randint(0, len(coffee_raw_df) - 1)

# Select the score values of each chosen row
row_1 = radar_scores.iloc[random_index_1]
row_2 = radar_scores.iloc[random_index_2]

# Radar chart for the first row (the title shows the row's index label)
fig_1 = px.line_polar(r=row_1.values, theta=row_1.index, line_close=True, 
                      title=f'Gráfico de Radar - Linha {row_1.name}')
fig_1.show()

# Radar chart for the second row
fig_2 = px.line_polar(r=row_2.values, theta=row_2.index, line_close=True, 
                      title=f'Gráfico de Radar - Linha {row_2.name}')
fig_2.show()

In [22]:
# Histogram of the overall ratings
fig_hist = px.histogram(coffee_raw_df, x='rating', nbins=20, title='Histograma de Notas (Ratings)')
fig_hist.show()

In [23]:
# Scatter plot of normalized price (USD per 100 g) against rating
fig_scatter = px.scatter(coffee_raw_df, x='price_per_100g', y='rating', 
                         title='Dispersão: Preço vs Nota (Rating)',
                         labels={'price_per_100g': 'Preço por 100g (USD)', 'rating': 'Nota (Rating)'})
fig_scatter.show()