In [1]:
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_Capsicum_cultivars'
data = pd.read_html(url, flavor='bs4', header=0, encoding='UTF8')

In [2]:
# Let's remove the last table 
del data[-1]

In [3]:
species = ['Capsicum annum', 'Capsicum baccatum', 'Capsicum chinense',
          'Capsicum frutescens', 'Capsicum pubescens']


for i in range(len(species)):
    data[i]['Species'] = species[i]

In [5]:
df = pd.concat(data, sort=False) 
df.head()

Unnamed: 0,Image,Name,Type,Origin,Heat,Pod size,Description,Species
0,,Aleppo,,Syria and Turkey,"15,000 SR",,"Grown in Syria and Turkey and used, in coarsel...",Capsicum annum
1,,Anaheim[14],Anaheim,United States,"500–2,500 SR",15 cm (5.9 in),A mild variety of New Mexico chile. It was lat...,Capsicum annum
2,,Banana,Waxy,,0–500 SR,15 cm (5.9 in),Often it is pickled and used as an ingredient ...,Capsicum annum
3,,Bird's Eye,Small hot,Southeast Asia,"50,000–100,000[15] SR",4 cm (1.6 in),A Southeast Asian cultivar known by many local...,Capsicum annum
4,,Black Hungarian[16],Ornamental/ Culinary,Hungary,"5,000–10,000 SR",5–7 cm (≈ 2–3 in),Grows in a conical shape with a slight curve n...,Capsicum annum


In [8]:
import re
import numpy as np

# Remove brackets and whats between them (e.g. [14])
df['Name'] = df['Name'].map(lambda x: re.sub("[\(\[].*?[\)\]]", "", x)
                                         if isinstance(x, str) else np.NaN)

# Pod Size get cm
df['Pod size'] = df['Pod size'].map(lambda x: x.split(' ', 1)[0].rstrip('cm') 
                                              if isinstance(x, str) else np.NaN)

# Taking the largest number in a range and convert all values to float
df['Pod size']  = df['Pod size'].map(lambda x: x.split('–', 1)[-1]
                                              if isinstance(x, str) else np.NaN)
# Convert to float
df['Pod size'] = df['Pod size'].map(lambda x: float(x))

# Taking the largest SHU
df['Heat'] = df['Heat'].map(lambda x: re.sub("[\(\[].*?[\)\]]", "", x) 
                            if isinstance(x, str) else np.NaN)
df['Heat'] = df['Heat'].str.replace(',', '')
df['Heat'] = df['Heat'].map(lambda x: float(re.findall(r'\d+(?:,\d+)?', x)[-1])
                            if isinstance(x, str) else np.NaN)

In [9]:
df.isna().sum()

Image          65
Name            0
Type           39
Origin         28
Heat            9
Pod size       33
Description     8
Species         0
dtype: int64

In [10]:
df['Species'].value_counts()

Capsicum annum         39
Capsicum chinense      17
Capsicum frutescens     4
Capsicum baccatum       3
Capsicum pubescens      2
Name: Species, dtype: int64