# Data Preprocessing

## Data Source: [Agricultura in Argentina (Kaggle)](https://www.kaggle.com/calvom/agricultura)

Columns:
* ID Provincia          = province_id
* Provincia             = province
* ID Departamento       = department_id
* Departamento          = department
* Id Cultivo	        = cultivation_id
* Cultivo	            = cultivation
* ID Campaña            = year_id (identifier of the campaign / growing season)
* Campana               = year (campaign / growing season, e.g. 1969/70)
* Temperatura (Ce)      = average_temperature (in degrees Celsius)
* Sup. Sembrada (Ha)    = area_sowed (hectares)
* Sup. Cosechada (Ha)   = area_harvested (hectares)
* Producción (Tn)       = production (in tons)
* Rendimiento (Kg/Ha)   = performance (kg / ha)
* Calidad               = quality

### Agricultura

In [10]:
import pandas as pd

# Location of the raw Kaggle export, relative to this notebook.
RAW_CSV_PATH = '../data/raw/agricultura.csv'

# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column's dtype is inferred from the whole column.
df = pd.read_csv(RAW_CSV_PATH, sep=',', low_memory=False)

# Preview the first rows, still carrying the original Spanish column names.
df.head()

Unnamed: 0,ID Provincia,Provincia,ID Departamento,Departamento,Id Cultivo,Cultivo,ID Campa�a,Campana,Temperatura (Ce),Sup. Sembrada (Ha),Sup. Cosechada (Ha),Producci�n (Tn),Rendimiento (Kg/Ha),Calidad
0,6,BUENOS AIRES,854,25 DE MAYO,1,Ajo,1,1969/70,23,3.0,3.0,10,3.333,Alta
1,6,BUENOS AIRES,854,25 DE MAYO,1,Ajo,2,1970/71,21,1.0,1.0,3,3.0,Promedio
2,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,Ajo,1,1969/70,30,15.0,15.0,82,5.467,Promedio
3,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,Ajo,2,1970/71,31,10.0,10.0,55,5.5,Alta
4,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,Ajo,3,1971/72,26,8.0,8.0,44,5.5,Alta


In [11]:
# English column names, listed in the same left-to-right order as the columns
# of the raw CSV.
english_columns = [
    'province_id', 'province',
    'department_id', 'department',
    'cultivation_id', 'cultivation',
    'year_id', 'year',
    'average_temperature',
    'area_sowed', 'area_harvested',
    'production', 'performance',
    'quality',
]

# Assigning to .columns relabels by position, not by name, so the order of the
# list above must match the order of the columns in the file.
df.columns = english_columns
df.columns

Index(['province_id', 'province', 'department_id', 'department',
       'cultivation_id', 'cultivation', 'year_id', 'year',
       'average_temperature', 'area_sowed', 'area_harvested', 'production',
       'performance', 'quality'],
      dtype='object')

### Translate cultivation and quality columns

In [12]:
# List the distinct Spanish quality labels that need an English equivalent.
df["quality"].unique()

array(['Alta', 'Promedio', 'Baja'], dtype=object)

In [13]:
# Spanish -> English labels for the `quality` column.
quality_items_english = {
    'Alta': 'high',
    'Promedio': 'middle',
    'Baja': 'low'
}

# Spanish labels, derived from the mapping so the list and the dict cannot
# drift apart. Kept as a separate name in case another cell reads it.
quality_items = list(quality_items_english)

# Translate in one vectorized pass instead of masking and assigning once per
# label. Values that are not keys of the mapping are left unchanged, so
# re-running this cell after the translation is a no-op.
df["quality"] = df["quality"].replace(quality_items_english)

In [15]:
# Spot-check the first rows after the quality translation.
df.head()

Unnamed: 0,province_id,province,department_id,department,cultivation_id,cultivation,year_id,year,average_temperature,area_sowed,area_harvested,production,performance,quality
0,6,BUENOS AIRES,854,25 DE MAYO,1,Ajo,1,1969/70,23,3.0,3.0,10,3.333,high
1,6,BUENOS AIRES,854,25 DE MAYO,1,Ajo,2,1970/71,21,1.0,1.0,3,3.0,middle
2,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,Ajo,1,1969/70,30,15.0,15.0,82,5.467,middle
3,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,Ajo,2,1970/71,31,10.0,10.0,55,5.5,high
4,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,Ajo,3,1971/72,26,8.0,8.0,44,5.5,high


In [17]:
# List the distinct Spanish crop names that need an English equivalent.
df["cultivation"].unique()

array(['Ajo', 'Algodon', 'Alpiste', 'Arroz', 'Avena', 'Banana',
       'Cana de azucar', 'Cartamo', 'Cebada cervecera',
       'Cebada forrajera', 'Cebolla total', 'Centeno', 'Colza', 'Girasol',
       'Jojoba', 'Limon', 'Lino', 'Maiz', 'Mandarina', 'Mani', 'Mijo',
       'Naranja', 'Papa total', 'Pomelo', 'Poroto seco', 'Soja', 'Sorgo',
       'Te', 'Trigo', 'Trigo candeal', 'Tung', 'Yerba mate'], dtype=object)

In [20]:
# Spanish -> English crop names for the `cultivation` column.
# NOTE(review): a few labels look like literal rather than agronomic
# translations: 'Alpiste' is usually "canary seed", 'Lino' as a crop is
# "flax", and 'Trigo candeal' is usually "durum wheat". They are left as-is
# here because downstream code may filter on the current labels; confirm
# before changing them.
cultivation_items_english = {
    'Ajo': 'garlic',
    'Algodon': 'cotton',
    'Alpiste': 'birdseed',
    'Arroz': 'rice',
    'Avena': 'oat',
    'Banana': 'banana',
    'Cana de azucar': 'sugar cane',
    'Cartamo': 'safflower',
    'Cebada cervecera': 'beer barley',
    'Cebada forrajera': 'fodder barley',
    'Cebolla total': 'onion',
    'Centeno': 'rye',
    'Colza': 'rape seed',
    'Girasol': 'sunflower',
    'Jojoba': 'jojoba',
    'Limon': 'lemon',
    'Lino': 'linen',
    'Maiz': 'corn',
    'Mandarina': 'tangerine',
    'Mani': 'peanut',
    'Mijo': 'millet',
    'Naranja': 'orange',
    'Papa total': 'potato',
    'Pomelo': 'grapefruit',
    'Poroto seco': 'dry bean',
    'Soja': 'soy',
    'Sorgo': 'sorghum',
    'Te': 'tea',
    'Trigo': 'wheat',
    'Trigo candeal': 'candelabra wheat',
    'Tung': 'tung',
    'Yerba mate': 'yerba mate'
}

# Spanish crop names, derived from the mapping so the list and the dict cannot
# drift apart. Kept as a separate name in case another cell reads it.
cultivation_items = list(cultivation_items_english)

# Translate in one vectorized pass instead of masking and assigning once per
# crop. Matching is on the whole cell value, so 'Trigo' does not touch
# 'Trigo candeal'. Values that are not keys of the mapping are left unchanged,
# so re-running this cell after the translation is a no-op.
df["cultivation"] = df["cultivation"].replace(cultivation_items_english)

In [21]:
# Show the frame; the notebook display elides the middle rows of a large frame.
df

Unnamed: 0,province_id,province,department_id,department,cultivation_id,cultivation,year_id,year,average_temperature,area_sowed,area_harvested,production,performance,quality
0,6,BUENOS AIRES,854,25 DE MAYO,1,garlic,1,1969/70,23,3.00,3.00,10,3.333,high
1,6,BUENOS AIRES,854,25 DE MAYO,1,garlic,2,1970/71,21,1.00,1.00,3,3.000,middle
2,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,garlic,1,1969/70,30,15.00,15.00,82,5.467,middle
3,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,garlic,2,1970/71,31,10.00,10.00,55,5.500,high
4,6,BUENOS AIRES,14,ADOLFO GONZALES CHAVES,1,garlic,3,1971/72,26,8.00,8.00,44,5.500,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132764,54,MISIONES,112,SAN PEDRO,31,yerba mate,40,2008/09,31,5.50,4.80,23.760,4.950,middle
132765,54,MISIONES,112,SAN PEDRO,31,yerba mate,41,2009/10,31,5.50,4.80,24.480,5.100,middle
132766,54,MISIONES,112,SAN PEDRO,31,yerba mate,43,2011/12,31,5.85,5.15,23.175,4.500,low
132767,54,MISIONES,112,SAN PEDRO,31,yerba mate,45,2013/14,31,5.85,4.85,26.190,5.400,low


In [22]:
# Destination of the translated dataset, relative to this notebook.
CLEANED_CSV_PATH = '../data/cleaned/agricultura.csv'

# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False here if downstream readers should not receive
# that extra column.
df.to_csv(CLEANED_CSV_PATH)