In [1]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

#-------------------------------------
from datetime import datetime

In [2]:
df = pd.read_csv('./files/df_arabica_clean.csv')

In [3]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,ID,Country of Origin,Farm Name,Lot Number,Mill,ICO Number,Company,Altitude,Region,Producer,Number of Bags,Bag Weight,In-Country Partner,Harvest Year,Grading Date,Owner,Variety,Status,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Expiration,Certification Body,Certification Address,Certification Contact
0,0,0,Colombia,Finca El Paraiso,CQU2022015,Finca El Paraiso,,Coffee Quality Union,1700-1930,"Piendamo,Cauca",Diego Samuel Bermudez,1,35 kg,Japan Coffee Exchange,2021 / 2022,"September 21st, 2022",Coffee Quality Union,Castillo,Completed,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,"September 21st, 2023",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901


In [4]:
df.columns

Index(['Unnamed: 0', 'ID', 'Country of Origin', 'Farm Name', 'Lot Number',
       'Mill', 'ICO Number', 'Company', 'Altitude', 'Region', 'Producer',
       'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Harvest Year',
       'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method',
       'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects',
       'Total Cup Points', 'Moisture Percentage', 'Category One Defects',
       'Quakers', 'Color', 'Category Two Defects', 'Expiration',
       'Certification Body', 'Certification Address', 'Certification Contact'],
      dtype='object')

In [5]:
df_c = df.drop(columns=['Unnamed: 0','Lot Number','ICO Number','Number of Bags', 'Bag Weight','Grading Date','Status','Expiration','Certification Address', 'Certification Contact'])

In [6]:
df_c.head(1)

Unnamed: 0,ID,Country of Origin,Farm Name,Mill,Company,Altitude,Region,Producer,In-Country Partner,Harvest Year,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Certification Body
0,0,Colombia,Finca El Paraiso,Finca El Paraiso,Coffee Quality Union,1700-1930,"Piendamo,Cauca",Diego Samuel Bermudez,Japan Coffee Exchange,2021 / 2022,Coffee Quality Union,Castillo,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,Japan Coffee Exchange


In [7]:
df_c.shape

(207, 31)

In [8]:
#df['Farm Name'].unique()

In [9]:
patron_regex = '[^\\u4e00-\\u9fff]'

In [10]:
#pip install pypinyin 
from pypinyin import lazy_pinyin

In [11]:
def limpiar_nom(nombre):
    if pd.isna(nombre):  # Manejar valores nulos
        return nombre
    
    # Caso 1: Solo caracteres latinos y espacios (ASCII)
    if nombre.isascii():  # Si todos los caracteres son ASCII
        return nombre
    
    # Caso 2: Mixto (latinos y chinos): eliminar caracteres chinos
    contiene_latinos = any(char.isascii() for char in nombre)
    contiene_chinos = any('\u4e00' <= char <= '\u9fff' for char in nombre)
    
    if contiene_latinos and contiene_chinos:
        # Eliminar caracteres chinos, mantener el resto
        return ''.join(char for char in nombre if not ('\u4e00' <= char <= '\u9fff'))
    
    # Caso 3: Solo caracteres chinos, convertir a pinyin
    if contiene_chinos and not contiene_latinos:
        # Convertir caracteres chinos a pinyin
        return ''.join(lazy_pinyin(nombre))
    
    # Si no entra en ningún caso, devolver el valor original
    return nombre

In [12]:
df_c['Region'] = df_c['Region'].apply(limpiar_nom)

In [13]:
grupos_mixtos = df_c[['Farm Name','Mill', 'Company', 'Region', 'Producer', 'In-Country Partner','Owner','Certification Body']]

In [14]:
for columna in grupos_mixtos:
    df_c[columna] = df_c[columna].apply(lambda x: limpiar_nom(x) if pd.notnull(x) else x)
    print(f'Estos son los valores únicos de la columna {columna.upper()}\n: {df_c[columna].unique()}\n')
    print('--------------------\n')

Estos son los valores únicos de la columna FARM NAME
: ['Finca El Paraiso' 'Royal Bean Geisha Estate' 'OKLAO coffee farms'
 'La Cumbre' 'Finca Santuario' 'La Colina' ' Melastoma Coffee Estate'
 ' Chi Tsai Liu Li Ecological Farm' ' Liang Xuan Coffee Farm'
 'MASHIMA AMCOS' 'TADE GG' 'Karen Acajabon Coffee Farm'
 ' Goodfun Coffee Farm' 'Gelana Geisha' 'La Gaitania'
 'qingyekafeizhuangyuan' 'Halo Bariti Cooprative' 'dongbikafeizhuangyuan'
 'yingtaoguogukengkafeizhuangyuan' 'BURKA' 'Finca Vista Hermosa'
 'Uncle Chung.s Coffee Farm' 'Hom Doi' 'El Diamante'
 'yongshunkafeizhuangyuan' 'Fazenda Recreio' 'ZouZhouYuan'
 'songyuekafeizhuangyuan' 'tafuyayoujinongyuan' 'Hokukano Ranch'
 'Siang-Ting Organic Farm' '(Jinzan Coffee Estate)' 'Eshetu farm'
 'Uraga Bisrat washing station' 'YHAENU PLC FARM'
 'zhuowushankafeinongchang' ' YU SIANG Coffee Estate' 'Small Holder'
 'huangtingkafeizhuangyuan'
 '（）Agriculture Production and Marketing Groups of Hualien Shlin township special crop (coffee) 1st class'

In [15]:
df_c.head()

Unnamed: 0,ID,Country of Origin,Farm Name,Mill,Company,Altitude,Region,Producer,In-Country Partner,Harvest Year,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Certification Body
0,0,Colombia,Finca El Paraiso,Finca El Paraiso,Coffee Quality Union,1700-1930,"Piendamo,Cauca",Diego Samuel Bermudez,Japan Coffee Exchange,2021 / 2022,Coffee Quality Union,Castillo,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,Japan Coffee Exchange
1,1,Taiwan,Royal Bean Geisha Estate,Royal Bean Geisha Estate,Taiwan Coffee Laboratory,1200,Chiayi,cengfusen,Taiwan Coffee Laboratory,2021 / 2022,Taiwan Coffee Laboratory,Gesha,Washed / Wet,8.5,8.5,7.92,8.0,7.92,8.25,10.0,10.0,10.0,8.5,0.0,87.58,10.5,0,0,blue-green,0,Taiwan Coffee Laboratory
2,2,Laos,OKLAO coffee farms,oklao coffee processing plant,Taiwan Coffee Laboratory,1300,Laos Borofen Plateau,WU TAO CHI,Taiwan Coffee Laboratory,2021 / 2022,Taiwan Coffee Laboratory,Java,Semi Washed,8.33,8.42,8.08,8.17,7.92,8.17,10.0,10.0,10.0,8.33,0.0,87.42,10.4,0,0,yellowish,2,Taiwan Coffee Laboratory
3,3,Costa Rica,La Cumbre,La Montana Tarrazu MIll,Coffee Quality Union,1900,"Los Santos,Tarrazu",Santa Maria de Dota,Japan Coffee Exchange,2022,Coffee Quality Union,Gesha,Washed / Wet,8.08,8.17,8.17,8.25,8.17,8.08,10.0,10.0,10.0,8.25,0.0,87.17,11.8,0,0,green,0,Japan Coffee Exchange
4,4,Colombia,Finca Santuario,Finca Santuario,Coffee Quality Union,1850-2100,"Popayan,Cauca",Camilo Merizalde,Japan Coffee Exchange,2022,Coffee Quality Union,Red Bourbon,"Honey,Mossto",8.33,8.33,8.08,8.25,7.92,7.92,10.0,10.0,10.0,8.25,0.0,87.08,11.6,0,2,yellow-green,2,Japan Coffee Exchange


In [16]:
import re
def limpiar_simb(text):
    if pd.isnull(text): # Maneja valores nulos 
        return text 
    return re.sub(r'[()]', '', text).strip().title() # Elimina solo los paréntesis 
# Aplicar filtro a todas las columnas que comparten características 


In [17]:
for columna in grupos_mixtos:
    df_c[columna] = df_c[columna].apply(lambda x: limpiar_simb(x) if pd.notnull(x) else x)
    print(f'Estos son los valores únicos de la columna {columna.upper()}\n: {df_c[columna].unique()}\n')
    print('--------------------\n')

Estos son los valores únicos de la columna FARM NAME
: ['Finca El Paraiso' 'Royal Bean Geisha Estate' 'Oklao Coffee Farms'
 'La Cumbre' 'Finca Santuario' 'La Colina' 'Melastoma Coffee Estate'
 'Chi Tsai Liu Li Ecological Farm' 'Liang Xuan Coffee Farm'
 'Mashima Amcos' 'Tade Gg' 'Karen Acajabon Coffee Farm'
 'Goodfun Coffee Farm' 'Gelana Geisha' 'La Gaitania'
 'Qingyekafeizhuangyuan' 'Halo Bariti Cooprative' 'Dongbikafeizhuangyuan'
 'Yingtaoguogukengkafeizhuangyuan' 'Burka' 'Finca Vista Hermosa'
 'Uncle Chung.S Coffee Farm' 'Hom Doi' 'El Diamante'
 'Yongshunkafeizhuangyuan' 'Fazenda Recreio' 'Zouzhouyuan'
 'Songyuekafeizhuangyuan' 'Tafuyayoujinongyuan' 'Hokukano Ranch'
 'Siang-Ting Organic Farm' 'Jinzan Coffee Estate' 'Eshetu Farm'
 'Uraga Bisrat Washing Station' 'Yhaenu Plc Farm'
 'Zhuowushankafeinongchang' 'Yu Siang Coffee Estate' 'Small Holder'
 'Huangtingkafeizhuangyuan'
 '（）Agriculture Production And Marketing Groups Of Hualien Shlin Township Special Crop Coffee 1St Class'
 'Bayiga

In [18]:
df_c['Region'] = df_c['Region'].str.capitalize()

In [19]:
df_c.head()

Unnamed: 0,ID,Country of Origin,Farm Name,Mill,Company,Altitude,Region,Producer,In-Country Partner,Harvest Year,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Certification Body
0,0,Colombia,Finca El Paraiso,Finca El Paraiso,Coffee Quality Union,1700-1930,"Piendamo,cauca",Diego Samuel Bermudez,Japan Coffee Exchange,2021 / 2022,Coffee Quality Union,Castillo,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,Japan Coffee Exchange
1,1,Taiwan,Royal Bean Geisha Estate,Royal Bean Geisha Estate,Taiwan Coffee Laboratory,1200,Chiayi,Cengfusen,Taiwan Coffee Laboratory,2021 / 2022,Taiwan Coffee Laboratory,Gesha,Washed / Wet,8.5,8.5,7.92,8.0,7.92,8.25,10.0,10.0,10.0,8.5,0.0,87.58,10.5,0,0,blue-green,0,Taiwan Coffee Laboratory
2,2,Laos,Oklao Coffee Farms,Oklao Coffee Processing Plant,Taiwan Coffee Laboratory,1300,Laos borofen plateau,Wu Tao Chi,Taiwan Coffee Laboratory,2021 / 2022,Taiwan Coffee Laboratory,Java,Semi Washed,8.33,8.42,8.08,8.17,7.92,8.17,10.0,10.0,10.0,8.33,0.0,87.42,10.4,0,0,yellowish,2,Taiwan Coffee Laboratory
3,3,Costa Rica,La Cumbre,La Montana Tarrazu Mill,Coffee Quality Union,1900,"Los santos,tarrazu",Santa Maria De Dota,Japan Coffee Exchange,2022,Coffee Quality Union,Gesha,Washed / Wet,8.08,8.17,8.17,8.25,8.17,8.08,10.0,10.0,10.0,8.25,0.0,87.17,11.8,0,0,green,0,Japan Coffee Exchange
4,4,Colombia,Finca Santuario,Finca Santuario,Coffee Quality Union,1850-2100,"Popayan,cauca",Camilo Merizalde,Japan Coffee Exchange,2022,Coffee Quality Union,Red Bourbon,"Honey,Mossto",8.33,8.33,8.08,8.25,7.92,7.92,10.0,10.0,10.0,8.25,0.0,87.08,11.6,0,2,yellow-green,2,Japan Coffee Exchange


In [20]:
#eliminar espacios antes de títulos
#eliminar paréntesis 
#revisar: '\t\t', 'N / A', '-'
#harvest year: eliminar el más antiguo
#gestión nulos en Procesing Method
#unificación de caracteres: 
    #capital: Country, Region, Producer, In-Country Partner, Owner, Variety, Certification Body
    #minus: Processing Methof, Color 


In [21]:
#nos quedamos con la altitud más alta porque es la más influye en la calidad del café y sus aromas
df_c['Altitude'] = df_c['Altitude'].apply(
    lambda x: str(x).split('-')[1].strip() if isinstance(x, str) and '-' in x else x)

In [22]:
#nos quedamos con el año más reciente porque es el relevante en el estudio
df_c['Harvest Year'] = df_c['Harvest Year'].apply(
    lambda x: str(x).split(' / ')[1].strip() if isinstance(x, str) and ' / ' in x else x)

In [23]:
#nos quedamos con la provincia por simplificar las gráficas 
df_c['Region'] = df_c['Region'].apply(
    lambda x: str(x).split(',')[1].strip() if isinstance(x, str) and ',' in x else x).str.title()

In [24]:
df_c['Variety'].unique()

array(['Castillo', 'Gesha', 'Java', 'Red Bourbon', 'Sl34+Gesha', 'SL34',
       'Bourbon', 'Ethiopian Heirlooms', 'Caturra',
       'Wolishalo,Kurume,Dega', 'Typica', 'Catimor',
       'Castillo Paraguaycito', nan, 'SL28', 'SL14', 'Catuai',
       'Yellow Bourbon', 'Catrenic', 'unknown', 'Pacamara',
       'Castillo and Colombia blend', 'Jember,TIM-TIM,Ateng',
       'BOURBON, CATURRA Y CATIMOR', 'Bourbon Sidra', 'Sarchimor',
       'Catimor,Catuai,Caturra,Bourbon', 'Parainema', 'SHG',
       'Typica + SL34',
       'MARSELLESA, CATUAI, CATURRA & MARSELLESA, ANACAFE 14, CATUAI',
       'Mundo Novo', 'Red Bourbon,Caturra', 'Lempira', 'Typica Gesha',
       'Gayo', 'Bourbon, Catimor, Caturra, Typica', 'unknow',
       'Maragogype', 'Caturra-Catuai', 'SL28,SL34,Ruiru11',
       'Yellow Catuai', 'Catucai', 'Santander',
       'Typica Bourbon Caturra Catimor', 'Caturra,Colombia,Castillo',
       'Castillo,Caturra,Bourbon', 'Pacas', 'Catuai and Mundo Novo'],
      dtype=object)

In [25]:
df_c.head()

Unnamed: 0,ID,Country of Origin,Farm Name,Mill,Company,Altitude,Region,Producer,In-Country Partner,Harvest Year,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Certification Body
0,0,Colombia,Finca El Paraiso,Finca El Paraiso,Coffee Quality Union,1930,Cauca,Diego Samuel Bermudez,Japan Coffee Exchange,2022,Coffee Quality Union,Castillo,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,Japan Coffee Exchange
1,1,Taiwan,Royal Bean Geisha Estate,Royal Bean Geisha Estate,Taiwan Coffee Laboratory,1200,Chiayi,Cengfusen,Taiwan Coffee Laboratory,2022,Taiwan Coffee Laboratory,Gesha,Washed / Wet,8.5,8.5,7.92,8.0,7.92,8.25,10.0,10.0,10.0,8.5,0.0,87.58,10.5,0,0,blue-green,0,Taiwan Coffee Laboratory
2,2,Laos,Oklao Coffee Farms,Oklao Coffee Processing Plant,Taiwan Coffee Laboratory,1300,Laos Borofen Plateau,Wu Tao Chi,Taiwan Coffee Laboratory,2022,Taiwan Coffee Laboratory,Java,Semi Washed,8.33,8.42,8.08,8.17,7.92,8.17,10.0,10.0,10.0,8.33,0.0,87.42,10.4,0,0,yellowish,2,Taiwan Coffee Laboratory
3,3,Costa Rica,La Cumbre,La Montana Tarrazu Mill,Coffee Quality Union,1900,Tarrazu,Santa Maria De Dota,Japan Coffee Exchange,2022,Coffee Quality Union,Gesha,Washed / Wet,8.08,8.17,8.17,8.25,8.17,8.08,10.0,10.0,10.0,8.25,0.0,87.17,11.8,0,0,green,0,Japan Coffee Exchange
4,4,Colombia,Finca Santuario,Finca Santuario,Coffee Quality Union,2100,Cauca,Camilo Merizalde,Japan Coffee Exchange,2022,Coffee Quality Union,Red Bourbon,"Honey,Mossto",8.33,8.33,8.08,8.25,7.92,7.92,10.0,10.0,10.0,8.25,0.0,87.08,11.6,0,2,yellow-green,2,Japan Coffee Exchange


In [26]:
df_c['Variety'] = df_c['Variety'].apply(lambda x: str(x).split(',')[0].strip() if isinstance(x, str) and ',' in x else x).apply(lambda x: str(x).split('+') if isinstance(x, str) and '+' in x else x).str.title()

In [27]:
df_c['Variety'].unique()

array(['Castillo', 'Gesha', 'Java', 'Red Bourbon', nan, 'Sl34', 'Bourbon',
       'Ethiopian Heirlooms', 'Caturra', 'Wolishalo', 'Typica', 'Catimor',
       'Castillo Paraguaycito', 'Sl28', 'Sl14', 'Catuai',
       'Yellow Bourbon', 'Catrenic', 'Unknown', 'Pacamara',
       'Castillo And Colombia Blend', 'Jember', 'Bourbon Sidra',
       'Sarchimor', 'Parainema', 'Shg', 'Marsellesa', 'Mundo Novo',
       'Lempira', 'Typica Gesha', 'Gayo', 'Unknow', 'Maragogype',
       'Caturra-Catuai', 'Yellow Catuai', 'Catucai', 'Santander',
       'Typica Bourbon Caturra Catimor', 'Pacas', 'Catuai And Mundo Novo'],
      dtype=object)

In [28]:
df_c[df_c.isnull().any(axis=1)]

Unnamed: 0,ID,Country of Origin,Farm Name,Mill,Company,Altitude,Region,Producer,In-Country Partner,Harvest Year,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Certification Body
7,7,Taiwan,Chi Tsai Liu Li Ecological Farm,Qicailiulikafeizhuangyuan,Taiwan Coffee Laboratory,1200.0,Chiayi,Zhuangjiarong,Taiwan Coffee Laboratory,2022,Taiwan Coffee Laboratory,,Natural / Dry,8.25,8.25,8.17,8.0,7.92,8.08,10.0,10.0,10.0,8.08,0.0,86.75,10.0,0,0,yellow green,1,Taiwan Coffee Laboratory
23,23,Colombia,El Diamante,El Diamante,Coffee Quality Union,1350.0,Quindio,Farm Bedoya Arango,Japan Coffee Exchange,2022,Coffee Quality Union,Castillo Paraguaycito,,8.08,8.0,7.83,8.17,7.75,7.83,10.0,10.0,10.0,8.0,0.0,85.67,11.3,0,0,brownish,2,Japan Coffee Exchange
25,25,Brazil,Fazenda Recreio,Dry Mill,"Wit Wealth, Investments & Trust",1250.0,Região Vulcânica,Diogo T. Dias De Macedo,Brazil Specialty Coffee Association,2022,Andre Clark,,Pulped natural / honey,7.83,8.08,7.83,7.92,8.0,7.83,10.0,10.0,10.0,8.0,0.0,85.5,11.3,2,0,green,3,Brazil Specialty Coffee Association
44,44,Taiwan,Igfarm,Igfarm,Taiwan Coffee Laboratory,150.0,Shoufeng Township,Liu Meng Tsung,Taiwan Coffee Laboratory,2022,Liu Meng Tsung,Typica,,7.92,8.0,7.92,8.0,7.58,7.75,10.0,10.0,10.0,7.83,0.0,85.0,10.9,0,0,green,0,Taiwan Coffee Laboratory
51,51,Indonesia,Pt Indo Cafco North Sumatera,Pt Indo Cafco,Interamerican Coffee,1200.0,Aceh Tengah,Pt Indo Cafco North Sumatera,Nkg Quality Service A Division Of Bernhard Rot...,2022,Charles Umeano,Catimor,,7.83,7.92,7.75,7.83,7.83,7.83,10.0,10.0,10.0,7.83,0.0,84.83,11.9,0,3,bluish-green,2,Nkg Quality Service A Division Of Bernhard Rot...
74,74,Guatemala,Finca Alta Luz,,"Retrillas Del Pacifico, S.A.",1400.0,Huehuetenango,Maria De Los Angeles Perez,Asociacion Nacional Del Café,2023,Yesica Alejandra Martìnez Vàsquez,Bourbon,Washed / Wet,7.67,7.83,7.75,7.75,7.83,7.75,10.0,10.0,10.0,7.75,0.0,84.33,9.2,0,1,green,5,Asociacion Nacional Del Café
97,97,Colombia,,"Racafé & Cia S.C.A, Km 5 Vía Pereira",Marubeni Corporation,1411.0,Pereira,Racafe & Cia Sca,Japan Coffee Exchange,2023,Ikuto Uehara,,Washed / Wet,7.67,7.75,7.67,7.75,7.67,7.67,10.0,10.0,10.0,7.67,0.0,83.83,11.9,0,1,green,1,Japan Coffee Exchange
98,98,Guatemala,Finca Alta Luz,,"Retrillas Del Pacifico, S. A.",1400.0,Huehuetenango,Maria De Los Angeles Perez,Asociacion Nacional Del Café,2022,Angelica Paola Citan Lopez,Bourbon,Natural / Dry,7.58,7.83,7.67,7.67,7.67,7.67,10.0,10.0,10.0,7.75,0.0,83.83,10.8,0,0,green,4,Asociacion Nacional Del Café
105,105,Colombia,,,Coffee Quality Institute,,,,Barista And Coffee Academy Of Asia,2022,Coffee Quality Institute,,,7.83,7.75,7.5,7.58,7.67,7.67,10.0,10.0,10.0,7.67,0.0,83.67,12.4,1,0,greenish,9,Barista And Coffee Academy Of Asia
119,119,Taiwan,Xiangxiangjiuyikafeizhuangyuan,Xiangxiangjiuyikafeizhuangyuan,Taiwan Coffee Laboratory,1100.0,Chiayi,Wuzhaoyun,Taiwan Coffee Laboratory,2022,Wu Jhao Yun,,Washed / Wet,7.75,7.92,7.67,7.92,7.75,7.83,8.67,10.0,10.0,7.92,0.0,83.42,10.1,0,0,blue-green,0,Taiwan Coffee Laboratory


In [39]:
df_c.loc[::97]

Unnamed: 0,ID,Country of Origin,Farm Name,Mill,Company,Altitude,Region,Producer,In-Country Partner,Harvest Year,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Certification Body
0,0,Colombia,Finca El Paraiso,Finca El Paraiso,Coffee Quality Union,1930,Cauca,Diego Samuel Bermudez,Japan Coffee Exchange,2022,Coffee Quality Union,Castillo,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,Japan Coffee Exchange
97,97,Colombia,,"Racafé & Cia S.C.A, Km 5 Vía Pereira",Marubeni Corporation,1411,Pereira,Racafe & Cia Sca,Japan Coffee Exchange,2023,Ikuto Uehara,,Washed / Wet,7.67,7.75,7.67,7.75,7.67,7.67,10.0,10.0,10.0,7.67,0.0,83.83,11.9,0,1,green,1,Japan Coffee Exchange
194,194,Brazil,Various Farm,Dry Mill Of Cocapil,Marubeni Corporation,1200,Alta Mogiana-Ibiraci,Cocapil,Japan Coffee Exchange,2023,Ikuto Uehara,,Natural / Dry,7.42,7.42,7.25,7.25,7.17,7.25,10.0,10.0,10.0,7.25,0.0,81.0,10.6,0,2,greenish,1,Japan Coffee Exchange


In [29]:
df_c['Color'].unique()

array(['green', 'blue-green', 'yellowish', 'yellow-green', 'yellow green',
       'greenish', 'brownish', 'yellow- green', 'browish-green',
       'bluish-green', 'pale yellow', 'yello-green'], dtype=object)

In [32]:
dicc_color = {'yellow green':'yellow-green','yellow- green':'yellow-green','yello-green':'yellow-green','pale yellow':'pale-yellow'}
df_c['Color'] = df_c['Color'].map(dicc_color)
df_c['Color'].unique()


array([nan], dtype=object)

In [30]:
df_c['Color'] = df_c['Color'].replace('yellow green','yellow-green').replace('yellow- green','yellow-green').replace('yello-green','yellow-green').replace('pale yellow','pale-yellow')
df_c['Color'].unique()

array(['green', 'blue-green', 'yellowish', 'yellow-green', 'greenish',
       'brownish', 'browish-green', 'bluish-green', 'pale-yellow'],
      dtype=object)