## Previsão de doenças em grãos de soja

### Importação de bibliotecas, importação da base de dados e verificação/tratamento de dados

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

In [2]:
df = pd.read_csv('soybean.csv', sep = ',')
df.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [3]:
df.shape, df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             683 non-null    object
 1   plant-stand      683 non-null    object
 2   precip           683 non-null    object
 3   temp             683 non-null    object
 4   hail             683 non-null    object
 5   crop-hist        683 non-null    object
 6   area-damaged     683 non-null    object
 7   severity         683 non-null    object
 8   seed-tmt         683 non-null    object
 9   germination      683 non-null    object
 10  plant-growth     683 non-null    object
 11  leaves           683 non-null    object
 12  leafspots-halo   683 non-null    object
 13  leafspots-marg   683 non-null    object
 14  leafspot-size    683 non-null    object
 15  leaf-shread      683 non-null    object
 16  leaf-malf        683 non-null    object
 17  leaf-mild        683 non-null    ob

((683, 36), None)

In [4]:
df.columns

Index(['date', 'plant-stand', 'precip', 'temp', 'hail', 'crop-hist',
       'area-damaged', 'severity', 'seed-tmt', 'germination', 'plant-growth',
       'leaves', 'leafspots-halo', 'leafspots-marg', 'leafspot-size',
       'leaf-shread', 'leaf-malf', 'leaf-mild', 'stem', 'lodging',
       'stem-cankers', 'canker-lesion', 'fruiting-bodies', 'external-decay',
       'mycelium', 'int-discolor', 'sclerotia', 'fruit-pods', 'fruit-spots',
       'seed', 'mold-growth', 'seed-discolor', 'seed-size', 'shriveling',
       'roots', 'class'],
      dtype='object')

In [5]:
#Visualização da classificação de doenças 
df.groupby(df['class']).size()

class
2-4-d-injury                   16
alternarialeaf-spot            91
anthracnose                    44
bacterial-blight               20
bacterial-pustule              20
brown-spot                     92
brown-stem-rot                 44
charcoal-rot                   20
cyst-nematode                  14
diaporthe-pod-&-stem-blight    15
diaporthe-stem-canker          20
downy-mildew                   20
frog-eye-leaf-spot             91
herbicide-injury                8
phyllosticta-leaf-spot         20
phytophthora-rot               88
powdery-mildew                 20
purple-seed-stain              20
rhizoctonia-root-rot           20
dtype: int64

### Transformação de variáveis categóricas para numéricas

In [6]:
encoder = LabelEncoder()
df_transform = pd.DataFrame()
df_transform['date'] = encoder.fit_transform(df['date'])
df_transform['plant-stand'] = encoder.fit_transform(df['plant-stand'])
df_transform['precip'] = encoder.fit_transform(df['precip'])
df_transform['temp'] = encoder.fit_transform(df['temp'])
df_transform['hail'] = encoder.fit_transform(df['hail'])
df_transform['crop-hist'] = encoder.fit_transform(df['crop-hist'])
df_transform['area-damaged'] = encoder.fit_transform(df['area-damaged'])
df_transform['severity'] = encoder.fit_transform(df['severity'])
df_transform['seed-tmt'] = encoder.fit_transform(df['seed-tmt'])
df_transform['germination'] = encoder.fit_transform(df['germination'])
df_transform['plant-growth'] = encoder.fit_transform(df['plant-growth'])
df_transform['leaves'] = encoder.fit_transform(df['leaves'])
df_transform['leafspots-halo'] = encoder.fit_transform(df['leafspots-halo'])
df_transform['leafspots-marg'] = encoder.fit_transform(df['leafspots-marg'])
df_transform['leafspot-size'] = encoder.fit_transform(df['leafspot-size'])
df_transform['leaf-shread'] = encoder.fit_transform(df['leaf-shread'])
df_transform['leaf-malf'] = encoder.fit_transform(df['leaf-malf'])
df_transform['leaf-mild'] = encoder.fit_transform(df['leaf-mild'])
df_transform['stem'] = encoder.fit_transform(df['stem'])
df_transform['lodging'] = encoder.fit_transform(df['lodging'])
df_transform['stem-cankers'] = encoder.fit_transform(df['stem-cankers'])
df_transform['canker-lesion'] = encoder.fit_transform(df['canker-lesion'])
df_transform['fruiting-bodies'] = encoder.fit_transform(df['fruiting-bodies'])
df_transform['external-decay'] = encoder.fit_transform(df['external-decay'])
df_transform['mycelium'] = encoder.fit_transform(df['mycelium'])
df_transform['int-discolor'] = encoder.fit_transform(df['int-discolor'])
df_transform['sclerotia'] = encoder.fit_transform(df['sclerotia'])
df_transform['fruit-pods'] = encoder.fit_transform(df['fruit-pods'])
df_transform['fruit-spots'] = encoder.fit_transform(df['fruit-spots'])
df_transform['seed'] = encoder.fit_transform(df['seed'])
df_transform['mold-growth'] = encoder.fit_transform(df['mold-growth'])
df_transform['seed-discolor'] = encoder.fit_transform(df['seed-discolor'])
df_transform['seed-size'] = encoder.fit_transform(df['seed-size'])
df_transform['shriveling'] = encoder.fit_transform(df['shriveling'])
df_transform['roots'] = encoder.fit_transform(df['roots'])
df_transform['class'] = encoder.fit_transform(df['class'])

#df_transform

In [7]:
#Definição de previsores e classe
prev = df_transform.iloc[:,0:35].values
clas = df_transform['class'].values

### Preparação do modelo de previsão, ajuste Oversample e treinamento

In [8]:
x_treino,x_teste, y_treino, y_teste =  train_test_split(prev, clas, 
                                                        test_size = 0.3, 
                                                        random_state= 3)

In [9]:
mod = ExtraTreesClassifier()
ros = RandomOverSampler(random_state = 0)
x_res, y_res = ros.fit_resample(x_treino, y_treino)
mod.fit(x_treino,y_treino)
mod_balance = mod.predict(x_teste)

### Matriz de confusão e taxa de acerto do modelo

In [10]:
matriz_balanceado = confusion_matrix(y_teste,mod_balance)
matriz_balanceado

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0, 31,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,
         0,  0,  0],
       [ 0,  0, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  1,  0,  0,  0, 31,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0

In [11]:
taxa_acerto_balanceado = accuracy_score(y_teste, mod_balance)
taxa_acerto_balanceado

0.9512195121951219

## Criando um novo DataFrame com valores aleatórios como forma de predição de um caso real

### criando listas com os possiveis valores para cada coluna (baseados nos dados históricos)

In [12]:
date = ['october', 'august', 'july', 'september', 'may', 'april', 'june', '?']
plant_stand = ['normal', 'lt-normal', '?']
precip = ['gt-norm', 'lt-norm', 'norm', '?']
temp = ['norm', 'gt-norm', 'lt-norm', '?']
hail = ['yes', 'no', '?']
crop_hist = ['same-lst-yr', 'same-lst-two-yrs', 'same-lst-sev-yrs', 'diff-lst-year', '?']
area_damaged = ['low-areas', 'scattered', 'whole-field', 'upper-areas', '?']
severity = ['pot-severe', 'severe', '?', 'minor']
seed_tmt = ['none', 'fungicide', '?', 'other']
germination = ['90-100', '80-89', 'lt-80', '?']
plant_growth = ['abnorm', 'norm', '?']
leaves = ['abnorm', 'norm']
leafspots_halo = ['absent', '?', 'no-yellow-halos', 'yellow-halos']
leafspots_marg = ['dna', '?', 'w-s-marg', 'no-w-s-marg']
leafspot_size = ['dna', '?', 'gt-1/8', 'lt-1/8']
leaf_shread = ['absent', '?', 'present']
leaf_malf = ['absent', '?', 'present']
leaf_mild = ['absent', '?', 'upper-surf', 'lower-surf']
stem = ['abnorm', 'norm', '?']
lodging = ['no', 'yes', '?']
stem_cankers = ['above-sec-nde', 'absent', 'below-soil', 'above-soil', '?']
canker_lesion = ['brown', 'dna', 'tan', 'dk-brown-blk', '?']
fruiting_bodies = ['present', 'absent', '?']
external_decay = ['firm-and-dry', 'absent', '?', 'watery']
mycelium = ['absent', 'present', '?']
int_discolor = ['none', 'black', 'brown', '?']
sclerotia = ['absent', 'present', '?']
fruit_pods = ['norm', 'dna', '?', 'diseased', 'few-present']
fruit_spots = ['?', 'absent', 'brown-w/blk-specks', 'colored', 'dna']
seed = ['norm', '?', 'abnorm']
mold_growth = ['absent', '?', 'present']
seed_discolor = ['absent', '?', 'present'] 
seed_size = ['norm', '?', 'lt-norm']
shriveling = ['absent', '?', 'present']
roots = ['norm', 'rotted', 'galls-cysts', '?']

n_linhas = 15263
n_colunas = 1


### Fazendo um random choice para gerar valores aleatórios para cada linha da coluna

In [13]:
d1 = np.random.choice(date, size=(n_linhas, n_colunas))
d2 = np.random.choice(plant_stand, size=(n_linhas, n_colunas))
d3 = np.random.choice(precip, size=(n_linhas, n_colunas))
d4 = np.random.choice(temp, size=(n_linhas, n_colunas))
d5 = np.random.choice(hail, size=(n_linhas, n_colunas))
d6 = np.random.choice(crop_hist, size=(n_linhas, n_colunas))
d7 = np.random.choice(area_damaged, size=(n_linhas, n_colunas))
d8 = np.random.choice(severity, size=(n_linhas, n_colunas))
d9 = np.random.choice(seed_tmt, size=(n_linhas, n_colunas))
d10 = np.random.choice(germination, size=(n_linhas, n_colunas))
d11 = np.random.choice(plant_growth, size=(n_linhas, n_colunas))
d12 = np.random.choice(leaves, size=(n_linhas, n_colunas))
d13 = np.random.choice(leafspots_halo, size=(n_linhas, n_colunas))
d14 = np.random.choice(leafspots_marg, size=(n_linhas, n_colunas))
d15 = np.random.choice(leafspot_size, size=(n_linhas, n_colunas))
d16 = np.random.choice(leaf_shread, size=(n_linhas, n_colunas))
d17 = np.random.choice(leaf_malf, size=(n_linhas, n_colunas))
d18 = np.random.choice(leaf_mild, size=(n_linhas, n_colunas))
d19 = np.random.choice(stem, size=(n_linhas, n_colunas))
d20 = np.random.choice(lodging, size=(n_linhas, n_colunas))
d21 = np.random.choice(stem_cankers, size=(n_linhas, n_colunas))
d22 = np.random.choice(canker_lesion, size=(n_linhas, n_colunas))
d23 = np.random.choice(fruiting_bodies, size=(n_linhas, n_colunas))
d24 = np.random.choice(external_decay, size=(n_linhas, n_colunas))
d25 = np.random.choice(mycelium, size=(n_linhas, n_colunas))
d26 = np.random.choice(int_discolor, size=(n_linhas, n_colunas))
d27 = np.random.choice(sclerotia, size=(n_linhas, n_colunas))
d28 = np.random.choice(fruit_pods, size=(n_linhas, n_colunas))
d29 = np.random.choice(fruit_spots, size=(n_linhas, n_colunas))
d30 = np.random.choice(seed, size=(n_linhas, n_colunas))
d31 = np.random.choice(mold_growth, size=(n_linhas, n_colunas))
d32 = np.random.choice(seed_discolor, size=(n_linhas, n_colunas))
d33 = np.random.choice(seed_size, size=(n_linhas, n_colunas))
d34 = np.random.choice(shriveling, size=(n_linhas, n_colunas))
d35 = np.random.choice(roots, size=(n_linhas, n_colunas))


### Fazendo os dataframes com valores aleatórios

In [14]:
df1 = pd.DataFrame(d1, columns=['date'])
df2 = pd.DataFrame(d2, columns=['plant_stand'])
df3 = pd.DataFrame(d3, columns=['precip'])
df4 = pd.DataFrame(d4, columns=['temp'])
df5 = pd.DataFrame(d5, columns=['hail'])
df6 = pd.DataFrame(d6, columns=['crop_hist'])
df7 = pd.DataFrame(d7, columns=['area_damaged'])
df8 = pd.DataFrame(d8, columns=['severity'])
df9 = pd.DataFrame(d9, columns=['seed_tmt'])
df10 = pd.DataFrame(d10, columns=['germination'])
df11 = pd.DataFrame(d11, columns=['plant_growth'])
df12 = pd.DataFrame(d12, columns=['leaves'])
df13 = pd.DataFrame(d13, columns=['leafspots_halo'])
df14 = pd.DataFrame(d14, columns=['leafspots_marg'])
df15 = pd.DataFrame(d15, columns=['leafspot_size'])
df16 = pd.DataFrame(d16, columns=['leaf_shread'])
df17 = pd.DataFrame(d17, columns=['leaf_malf'])
df18 = pd.DataFrame(d18, columns=['leaf_mild'])
df19 = pd.DataFrame(d19, columns=['stem'])
df20 = pd.DataFrame(d20, columns=['lodging'])
df21 = pd.DataFrame(d21, columns=['stem_cankers'])
df22 = pd.DataFrame(d22, columns=['canker_lesion'])
df23 = pd.DataFrame(d23, columns=['fruiting_bodies'])
df24 = pd.DataFrame(d24, columns=['external_decay'])
df25 = pd.DataFrame(d25, columns=['mycelium'])
df26 = pd.DataFrame(d26, columns=['int_discolor']) 
df27 = pd.DataFrame(d27, columns=['sclerotia'])
df28 = pd.DataFrame(d28, columns=['fruit_pods'])
df29 = pd.DataFrame(d29, columns=['fruit_spots'])
df30 = pd.DataFrame(d30, columns=['seed'])
df31 = pd.DataFrame(d31, columns=['mold_growth'])
df32 = pd.DataFrame(d32, columns=['seed_discolor'])
df33 = pd.DataFrame(d33, columns=['seed_size'])
df34 = pd.DataFrame(d34, columns=['shriveling'])
df35 = pd.DataFrame(d35, columns=['roots'])


### Concatenando os DataFrames em um único

In [15]:
df_novo = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18,df19,df20,df21,df22,df23,df24,df25,
df26,df27,df28,df29,df30,df31,df32,df33,df34,df35], axis = 1)

df_novo

Unnamed: 0,date,plant_stand,precip,temp,hail,crop_hist,area_damaged,severity,seed_tmt,germination,...,int_discolor,sclerotia,fruit_pods,fruit_spots,seed,mold_growth,seed_discolor,seed_size,shriveling,roots
0,june,normal,gt-norm,norm,yes,diff-lst-year,whole-field,pot-severe,none,80-89,...,brown,present,dna,?,abnorm,present,?,lt-norm,present,?
1,july,normal,lt-norm,lt-norm,no,same-lst-sev-yrs,?,severe,other,lt-80,...,?,absent,norm,dna,abnorm,present,?,lt-norm,absent,rotted
2,july,lt-normal,?,lt-norm,no,?,low-areas,minor,fungicide,?,...,brown,absent,diseased,?,?,present,?,norm,?,rotted
3,may,?,norm,norm,no,diff-lst-year,low-areas,minor,fungicide,lt-80,...,?,?,few-present,?,norm,absent,absent,?,?,galls-cysts
4,?,?,norm,lt-norm,no,same-lst-yr,?,pot-severe,none,80-89,...,none,present,dna,?,norm,absent,?,?,present,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15258,april,lt-normal,gt-norm,?,?,same-lst-yr,low-areas,minor,none,90-100,...,black,?,norm,dna,norm,present,absent,norm,?,?
15259,?,lt-normal,norm,lt-norm,?,same-lst-yr,scattered,minor,other,90-100,...,brown,present,diseased,?,abnorm,absent,?,norm,present,rotted
15260,may,?,norm,?,?,same-lst-yr,?,pot-severe,?,90-100,...,none,?,?,dna,norm,absent,present,lt-norm,present,norm
15261,?,?,lt-norm,norm,?,same-lst-yr,whole-field,severe,other,lt-80,...,black,present,?,absent,abnorm,absent,?,lt-norm,absent,norm


### Transformando variaveis categóricas em numéricas

In [16]:
df_novo_transform = pd.DataFrame()
df_novo_transform['date'] = encoder.fit_transform(df1['date'])
df_novo_transform['plant_stand'] = encoder.fit_transform(df2['plant_stand'])
df_novo_transform['precip'] = encoder.fit_transform(df3['precip'])
df_novo_transform['temp'] = encoder.fit_transform(df4['temp'])
df_novo_transform['hail'] = encoder.fit_transform(df5['hail'])
df_novo_transform['crop_hist'] = encoder.fit_transform(df6['crop_hist'])
df_novo_transform['area_damaged'] = encoder.fit_transform(df7['area_damaged'])
df_novo_transform['severity'] = encoder.fit_transform(df8['severity'])
df_novo_transform['seed_tmt'] = encoder.fit_transform(df9['seed_tmt'])
df_novo_transform['germination'] = encoder.fit_transform(df10['germination'])
df_novo_transform['plant_growth'] = encoder.fit_transform(df11['plant_growth'])
df_novo_transform['leaves'] = encoder.fit_transform(df12['leaves'])
df_novo_transform['leafspots_halo'] = encoder.fit_transform(df13['leafspots_halo'])
df_novo_transform['leafspots_marg'] = encoder.fit_transform(df14['leafspots_marg'])
df_novo_transform['leafspot_size'] = encoder.fit_transform(df15['leafspot_size'])
df_novo_transform['leaf_shread'] = encoder.fit_transform(df16['leaf_shread'])
df_novo_transform['leaf_malf'] = encoder.fit_transform(df17['leaf_malf'])
df_novo_transform['leaf_mild'] = encoder.fit_transform(df18['leaf_mild'])
df_novo_transform['stem'] = encoder.fit_transform(df19['stem'])
df_novo_transform['lodging'] = encoder.fit_transform(df20['lodging'])
df_novo_transform['stem_cankers'] = encoder.fit_transform(df21['stem_cankers'])
df_novo_transform['canker_lesion'] = encoder.fit_transform(df22['canker_lesion'])
df_novo_transform['fruiting_bodies'] = encoder.fit_transform(df23['fruiting_bodies'])
df_novo_transform['external_decay'] = encoder.fit_transform(df24['external_decay'])
df_novo_transform['mycelium'] = encoder.fit_transform(df25['mycelium'])
df_novo_transform['int_discolor'] = encoder.fit_transform(df26['int_discolor'])
df_novo_transform['sclerotia'] = encoder.fit_transform(df27['sclerotia'])
df_novo_transform['fruit_pods'] = encoder.fit_transform(df28['fruit_pods'])
df_novo_transform['fruit_spots'] = encoder.fit_transform(df29['fruit_spots'])
df_novo_transform['seed'] = encoder.fit_transform(df30['seed'])
df_novo_transform['mold_growth'] = encoder.fit_transform(df31['mold_growth'])
df_novo_transform['seed_discolor'] = encoder.fit_transform(df32['seed_discolor'])
df_novo_transform['seed_size'] = encoder.fit_transform(df33['seed_size'])
df_novo_transform['shriveling'] = encoder.fit_transform(df34['shriveling'])
df_novo_transform['roots'] = encoder.fit_transform(df35['roots'])

df_novo_transform

Unnamed: 0,date,plant_stand,precip,temp,hail,crop_hist,area_damaged,severity,seed_tmt,germination,...,int_discolor,sclerotia,fruit_pods,fruit_spots,seed,mold_growth,seed_discolor,seed_size,shriveling,roots
0,4,2,1,3,2,1,4,2,2,0,...,2,2,2,0,1,2,0,1,2,0
1,3,2,2,2,1,2,0,3,3,3,...,0,1,4,4,1,2,0,1,1,3
2,3,1,0,2,1,0,1,1,1,2,...,2,1,1,0,0,2,0,2,0,3
3,5,0,3,3,1,1,1,1,1,3,...,0,0,3,0,2,1,1,0,0,1
4,0,0,3,2,1,4,0,2,2,0,...,3,2,2,0,2,1,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15258,1,1,1,0,0,4,1,1,2,1,...,1,0,4,4,2,2,1,2,0,0
15259,0,1,3,2,0,4,2,1,3,1,...,2,2,1,0,1,1,0,2,2,3
15260,5,0,3,0,0,4,0,2,0,1,...,3,0,0,4,2,1,2,1,2,2
15261,0,0,2,3,0,4,4,3,3,3,...,1,2,0,1,1,1,0,1,1,2


In [17]:
prev_novos_dados = mod.predict(df_novo_transform)
prev_novos_dados = pd.DataFrame(prev_novos_dados, columns = ['class'])
prev_novos_dados
resultados = pd.concat([df_novo, prev_novos_dados], axis = 1)
resultados



Unnamed: 0,date,plant_stand,precip,temp,hail,crop_hist,area_damaged,severity,seed_tmt,germination,...,sclerotia,fruit_pods,fruit_spots,seed,mold_growth,seed_discolor,seed_size,shriveling,roots,class
0,june,normal,gt-norm,norm,yes,diff-lst-year,whole-field,pot-severe,none,80-89,...,present,dna,?,abnorm,present,?,lt-norm,present,?,2
1,july,normal,lt-norm,lt-norm,no,same-lst-sev-yrs,?,severe,other,lt-80,...,absent,norm,dna,abnorm,present,?,lt-norm,absent,rotted,6
2,july,lt-normal,?,lt-norm,no,?,low-areas,minor,fungicide,?,...,absent,diseased,?,?,present,?,norm,?,rotted,2
3,may,?,norm,norm,no,diff-lst-year,low-areas,minor,fungicide,lt-80,...,?,few-present,?,norm,absent,absent,?,?,galls-cysts,8
4,?,?,norm,lt-norm,no,same-lst-yr,?,pot-severe,none,80-89,...,present,dna,?,norm,absent,?,?,present,?,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15258,april,lt-normal,gt-norm,?,?,same-lst-yr,low-areas,minor,none,90-100,...,?,norm,dna,norm,present,absent,norm,?,?,8
15259,?,lt-normal,norm,lt-norm,?,same-lst-yr,scattered,minor,other,90-100,...,present,diseased,?,abnorm,absent,?,norm,present,rotted,2
15260,may,?,norm,?,?,same-lst-yr,?,pot-severe,?,90-100,...,?,?,dna,norm,absent,present,lt-norm,present,norm,15
15261,?,?,lt-norm,norm,?,same-lst-yr,whole-field,severe,other,lt-80,...,present,?,absent,abnorm,absent,?,lt-norm,absent,norm,2


In [18]:
prev_novos_dados.groupby(['class']).size()

class
0      159
1       14
2     3992
3       11
4      767
5      330
6      972
7      111
8      353
9      704
10      39
11      78
12    2643
13     408
14       6
15    3846
16     119
17     503
18     208
dtype: int64

# converter o código da classe pelo nome das doenças

In [19]:
prev_novos_dados['class'].replace({0:'2-4-d-injury', 1:'alternarialeaf-spot', 2:'anthracnose', 3:'bacterial-blight', 4:'bacterial-pustule', 5:'brown-spot',
                                   6:'brown-stem-rot', 7:'charcoal-rot', 8:'cyst-nematode', 9:'diaporthe-pod-&-stem-blight', 10:'diaporthe-stem-canker',
                                   11:'downy-mildew', 12:'frog-eye-leaf-spot', 13:'herbicide-injury', 14:'herbicide-injury', 15:'phytophthora-rot',
                                   16:'powdery-mildew', 17:'purple-seed-stain', 18:'rhizoctonia-root-rot'}, inplace=True)

In [20]:
prev_novos_dados.groupby(['class']).size()

class
2-4-d-injury                    159
alternarialeaf-spot              14
anthracnose                    3992
bacterial-blight                 11
bacterial-pustule               767
brown-spot                      330
brown-stem-rot                  972
charcoal-rot                    111
cyst-nematode                   353
diaporthe-pod-&-stem-blight     704
diaporthe-stem-canker            39
downy-mildew                     78
frog-eye-leaf-spot             2643
herbicide-injury                414
phytophthora-rot               3846
powdery-mildew                  119
purple-seed-stain               503
rhizoctonia-root-rot            208
dtype: int64