In [128]:
import pandas as pd
import numpy as np
import csv
import re
from bs4 import BeautifulSoup

from tabula import read_pdf
from tabulate import tabulate
from itertools import chain
import matplotlib as plt

from ctypes.util import find_library
import camelot 


%matplotlib inline
%config InlineBackend.figure_format='retina'
# If you encounter an SSL certificate verification error, go to your Python 3.x.x folder and install the certificates.

In [129]:
# Load the County / Year / Links table and keep only the Sonoma rows.
# The full frame gets its own name so that the standard-library `csv`
# module imported in the first cell is not shadowed by a DataFrame.
farm_smoke_links = pd.read_csv('farm_smoke.csv')
sonoma_data = (
    farm_smoke_links
    .query('County == "Sonoma"')
    .copy()
    .reset_index(drop=True)  # renumber rows from 0 so positional lookups work
)
sonoma_data

Unnamed: 0,County,Year,Links
0,Sonoma,2021,https://sonomacounty.ca.gov//Main%20County%20S...
1,Sonoma,2020,https://sonomacounty.ca.gov//Main%20County%20S...
2,Sonoma,2019,https://sonomacounty.ca.gov//Main%20County%20S...
3,Sonoma,2018,https://sonomacounty.ca.gov//Main%20County%20S...
4,Sonoma,2017,https://sonomacounty.ca.gov//Main%20County%20S...
...,...,...,...
89,Sonoma,1932,https://sonomacounty.ca.gov//Main%20County%20S...
90,Sonoma,1931,https://sonomacounty.ca.gov//Main%20County%20S...
91,Sonoma,1930,https://sonomacounty.ca.gov//Main%20County%20S...
92,Sonoma,1929,https://sonomacounty.ca.gov//Main%20County%20S...


In [106]:
# Take the report link from the row labelled 0 (the first row after the
# index reset performed when sonoma_data was built).
url = sonoma_data.loc[0, 'Links']
url

'https://sonomacounty.ca.gov//Main%20County%20Site/Natural%20Resources/Agricultural%2C%20Weights%20%26%20Measures/Documents/Crop%20Reports/2021-Sonoma-County-Crop-Report.pdf'

In [107]:
# Use camelot to read page 12 of the local PDF. The "stream" flavor is used
# for the extraction, and row_tol=10 closes up the row gaps so that text
# lines that are vertically close end up in the same table row.
tables = camelot.read_pdf(
    '1111.pdf',
    pages='12',
    flavor='stream',
    row_tol=10,
)
first_table = tables[0]
data = first_table.df              # extracted cells as a DataFrame
dat = first_table.parsing_report   # accuracy / whitespace / order / page summary
dat

{'accuracy': 99.66, 'whitespace': 9.68, 'order': 1, 'page': 12}

In [108]:
# Inspect the frame as extracted by camelot (header rows are still included).
data

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,TOP 13 BY VALUE - LISTED ALPHABETICALLY,,,,,
1,,,,Acreage,,,Production,
2,Varietal,Year,Bearing,Non-Bearing,Total,Tons,$/Ton,Total Value
3,Cabernet Franc,2021,552.0,33.0,585.0,1150.4,"$3,866.85","$4,448,400"
4,,2020,557.0,29.0,586.0,942.4,"$3,534.95","$3,331,300"
5,Cabernet,2021,12293.0,429.0,12722.0,41114.6,"$2,728.69","$112,189,000"
6,Sauvignon,2020,12089.0,631.0,12720.0,31137.9,"$2,460.15","$76,603,900"
7,Carignane,2021,156.0,2.0,158.0,280.7,"$2,633.52","$739,200"
8,,2020,158.0,6.0,164.0,257.9,"$2,447.30","$631,200"
9,Grenache,2021,211.0,55.0,266.0,898.4,"$3,101.70","$2,786,600"


In [109]:
# Drop the three header rows (table title, column-group labels, column names).
# Start from the raw camelot frame rather than from `data` itself so that
# re-running this cell always gives the same result instead of raising a
# KeyError (or dropping real rows once the index has been reset).
data = tables[0].df.drop(index=[0, 1, 2])

In [110]:
# Check the frame after the header rows have been dropped.
data

Unnamed: 0,0,1,2,3,4,5,6,7
3,Cabernet Franc,2021,552.0,33.0,585.0,1150.4,"$3,866.85","$4,448,400"
4,,2020,557.0,29.0,586.0,942.4,"$3,534.95","$3,331,300"
5,Cabernet,2021,12293.0,429.0,12722.0,41114.6,"$2,728.69","$112,189,000"
6,Sauvignon,2020,12089.0,631.0,12720.0,31137.9,"$2,460.15","$76,603,900"
7,Carignane,2021,156.0,2.0,158.0,280.7,"$2,633.52","$739,200"
8,,2020,158.0,6.0,164.0,257.9,"$2,447.30","$631,200"
9,Grenache,2021,211.0,55.0,266.0,898.4,"$3,101.70","$2,786,600"
10,,2020,203.0,58.0,261.0,751.7,"$2,509.56","$1,886,400"
11,Malbec,2021,422.0,26.0,448.0,1577.2,"$3,081.97","$4,860,900"
12,,2020,414.0,16.0,430.0,1004.8,"$2,643.18","$2,655,900"


In [111]:
# Replace the positional column labels with descriptive names, pairing
# each existing column with the name at the same position.
new_columns = [
    'Varietal', 'Year', 'Bearing', 'Non-Bearing',
    'Total', 'Tons', '$/Ton', 'Total Value',
]
column_mapping = {old: new for old, new in zip(data.columns, new_columns)}
data = data.rename(columns=column_mapping)
data

Unnamed: 0,Varietal,Year,Bearing,Non-Bearing,Total,Tons,$/Ton,Total Value
3,Cabernet Franc,2021,552.0,33.0,585.0,1150.4,"$3,866.85","$4,448,400"
4,,2020,557.0,29.0,586.0,942.4,"$3,534.95","$3,331,300"
5,Cabernet,2021,12293.0,429.0,12722.0,41114.6,"$2,728.69","$112,189,000"
6,Sauvignon,2020,12089.0,631.0,12720.0,31137.9,"$2,460.15","$76,603,900"
7,Carignane,2021,156.0,2.0,158.0,280.7,"$2,633.52","$739,200"
8,,2020,158.0,6.0,164.0,257.9,"$2,447.30","$631,200"
9,Grenache,2021,211.0,55.0,266.0,898.4,"$3,101.70","$2,786,600"
10,,2020,203.0,58.0,261.0,751.7,"$2,509.56","$1,886,400"
11,Malbec,2021,422.0,26.0,448.0,1577.2,"$3,081.97","$4,860,900"
12,,2020,414.0,16.0,430.0,1004.8,"$2,643.18","$2,655,900"


In [112]:
# List the distinct varietal labels to spot blanks and names split across rows.
data['Varietal'].unique()

array(['Cabernet Franc', '', 'Cabernet', 'Sauvignon', 'Carignane',
       'Grenache', 'Malbec', 'Mataro/', 'Mouvedere', 'Merlot',
       'Petite Sirah', 'Petit Verdot', 'Pinot Noir', 'Sangiovese',
       'Syrah-Shiraz', 'Zinfandel', 'TOTAL ALL REDS',
       'including other reds'], dtype=object)

In [113]:
# Repair varietal names that wrapped onto the following row: the first half
# is expanded to the full name, and the leftover second half (plus blanks and
# the 'including other reds' note) is turned into NaN.
data['Varietal'] = (
    data['Varietal']
    .replace('Mataro/', 'Mataro/Mouvedere')
    .replace('Cabernet', 'Cabernet Sauvignon')  # whole-value match only
    .replace(['', 'including other reds', 'Mouvedere', 'Sauvignon'], np.nan)
)
# Renumber the rows from 0 now that the header rows are gone.
data = data.reset_index(drop=True)

data['Varietal'].unique()

array(['Cabernet Franc', nan, 'Cabernet Sauvignon', 'Carignane',
       'Grenache', 'Malbec', 'Mataro/Mouvedere', 'Merlot', 'Petite Sirah',
       'Petit Verdot', 'Pinot Noir', 'Sangiovese', 'Syrah-Shiraz',
       'Zinfandel', 'TOTAL ALL REDS'], dtype=object)

In [114]:
# Each varietal name sits only on its first row; the row(s) that follow for
# the same varietal hold NaN. Forward-fill carries the name down. Unlike
# dropna().repeat(2), this does not assume exactly two rows per varietal and
# gives the same result when the cell is re-run.
data['Varietal'] = data['Varietal'].ffill()
# Tag every row with the county this report belongs to.
data['County'] = 'Sonoma'

In [125]:
# Move 'County' to the front: pop() removes the column and hands it back,
# insert() then places it at position 0.
first_column = data.pop('County')
data.insert(loc=0, column='County', value=first_column)
data

Unnamed: 0,County,Varietal,Year,Bearing,Non-Bearing,Total,Tons,$/Ton,Total Value
0,Sonoma,Cabernet Franc,2021,552.0,33.0,585.0,1150.4,"$3,866.85","$4,448,400"
1,Sonoma,Cabernet Franc,2020,557.0,29.0,586.0,942.4,"$3,534.95","$3,331,300"
2,Sonoma,Cabernet Sauvignon,2021,12293.0,429.0,12722.0,41114.6,"$2,728.69","$112,189,000"
3,Sonoma,Cabernet Sauvignon,2020,12089.0,631.0,12720.0,31137.9,"$2,460.15","$76,603,900"
4,Sonoma,Carignane,2021,156.0,2.0,158.0,280.7,"$2,633.52","$739,200"
5,Sonoma,Carignane,2020,158.0,6.0,164.0,257.9,"$2,447.30","$631,200"
6,Sonoma,Grenache,2021,211.0,55.0,266.0,898.4,"$3,101.70","$2,786,600"
7,Sonoma,Grenache,2020,203.0,58.0,261.0,751.7,"$2,509.56","$1,886,400"
8,Sonoma,Malbec,2021,422.0,26.0,448.0,1577.2,"$3,081.97","$4,860,900"
9,Sonoma,Malbec,2020,414.0,16.0,430.0,1004.8,"$2,643.18","$2,655,900"


In [127]:
# Write the cleaned table to disk as UTF-8, without the row index.
data.to_csv(
    'example.csv',
    encoding='utf-8',
    index=False,
)

In [156]:
# Alternative approach (left disabled): forward-fill blank varietal names.
# data['Varietal'] = data['Varietal'].replace('', method='ffill')
# data