This program creates a class `County_AgWr` that imports and models weather and agricultural data on the county level. This notebook focuses on Colorado, but with minimal user input it could examine weather and agricultural data throughout the U.S.

Each instance of the class `County_AgWr` holds several `pandas.DataFrame`s as attributes:
*  `.wr` for weather.
*  `.crops` for crops.
*  `.models` for models and their statistics.

There are specific methods for creating and analyzing these dataframes:
*  `.add_weather()` scrapes, cleans, and concatenates weather data from the [Western Regional Climate Center](https://wrcc.dri.edu).
*  `.add_crops()` imports, wrangles, and merges crop data already downloaded as CSVs from the [National Agricultural Statistics Service](https://quickstats.nass.usda.gov/) of the USDA.
*  `.crop_weather()` merges the crop yield data with the weather data. Only crops with at least 40 years of yield data are saved as a .csv, ready for X, y modeling.
*  `.fit_models(thru_month_num)` fits regression models to the CSVs written by `.crop_weather()`, with X as the year's weather data through month number `thru_month_num` and y as crop yields. Only those models with r2 scores > 0.02 are saved to `.models`.
*  `.fit_all_models()` cycles through the possible months to choose (April to October) and saves all models with r2 scores > 0.02 to `.models`.



In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import os
import pickle
from google.colab import drive
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import (LinearRegression, Ridge, RidgeCV, Lasso, 
                                  ElasticNet, ElasticNetCV, BayesianRidge)
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
class County_AgWr:
    """Join county-level crop yields with weather data and fit yield regressions.

    Every method reads or writes files below the module-level ``file_path``
    directory, which must be defined before any method is called.

    Attributes:
        name: County name exactly as it appears in the 'County' column of the
            input CSVs (the notebook passes upper-case names such as 'BOULDER').
        wr: Weather table indexed by 'Year'; filled by add_weather().
        crops: Crop yields indexed by 'Year'; filled by add_crops().
        models: One row per retained fit, with the columns 'crop',
            'thru_month', 'model', 'r2_score' and 'rmse'.
        crop_doc_count: Number of crop CSVs read by the latest add_crops() call.
    """

    # Crops whose '<crop>_colo.csv' survey files are read by add_crops().
    CROPS = ('corn', 'barley', 'beans', 'oats', 'potatoes', 'sorghum',
             'sunflowers', 'wheat')
    # Column suffix -> the two URL fields that differ between the six weather
    # requests made by add_weather().
    WEATHER_SERIES = {
        '-p': ('pcpn', 'msum'),
        '-txa': ('maxt', 'mave'),
        '-taa': ('avgt', 'mave'),
        '-tna': ('mint', 'mave'),
        '-txm': ('maxt', 'mmax'),
        '-tnm': ('mint', 'mmin'),
    }
    # Month columns that fit_models() may trim, in calendar order; month
    # number 4 drops all of them and month number 10 drops none.
    SEASON_MONTHS = ('MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT')
    # A fit is kept only when its test-set r2 score exceeds this value.
    MIN_R2 = 0.02

    def __init__(self, name):
        """Create an empty county container; no files are touched here."""
        self.name = name
        # Instances, not the DataFrame class itself, so that .head(), .empty
        # and friends work even before the add_* methods have been run.
        self.wr = pd.DataFrame()
        self.crops = pd.DataFrame()
        self.models = pd.DataFrame(columns=['crop', 'thru_month', 'model',
                                            'r2_score', 'rmse'])
        self.crop_doc_count = 0

    def add_weather(self):
        """Load this county's weather table into ``self.wr``.

        Uses the cached '/Data/raw/{name}_wr.csv' when it exists. Otherwise
        the six series in WEATHER_SERIES are scraped from wrcc.dri.edu for the
        county's station, cleaned, joined column-wise and written to that
        cache file.
        """
        cache_csv = file_path + f'/Data/raw/{self.name}_wr.csv'
        if os.path.exists(cache_csv):
            # Parse the index back into dates so it matches the datetime
            # 'Year' index built by add_crops() when the two are merged.
            wr_cat = pd.read_csv(cache_csv, index_col='Year', parse_dates=True)
        else:
            from io import StringIO

            counties = pd.read_csv(file_path + '/Data/raw/counties_ag.csv', index_col=[0])
            counties['Station ID'] = counties['Station ID'].astype(int)
            county = counties[counties['County'] == self.name]
            # Presumably the 'Station ID' column; selected by position exactly
            # as before -- TODO confirm against counties_ag.csv.
            station_id = county.iloc[0, 2]
            frames = []
            for suffix, (element, statistic) in self.WEATHER_SERIES.items():
                url = ('https://wrcc.dri.edu/WRCCWrappers.py?sodxtrmts'
                       f'+0{station_id}+por+por+{element}+none+{statistic}+5+01+F')
                result = requests.get(url)
                soup = BeautifulSoup(result.text, 'html.parser')
                table = soup.find('table')
                # read_html expects a file-like object; passing literal HTML
                # text is deprecated in recent pandas releases.
                df = pd.read_html(StringIO(str(table)))[0]
                # The first table row holds the column labels.
                df.columns = df.iloc[0]
                df = df.drop([0])
                # Keep the 60 rows that end 8 rows before the bottom of the
                # table; presumably the trailing rows are summary statistics
                # rather than years -- TODO confirm.
                df = df.iloc[-68:-8, :].copy()
                df = df.rename(columns={'YEAR(S)':'Year'})
                df['Year'] = pd.to_datetime(df['Year'], format='%Y')
                df = df.set_index('Year')
                df = df.dropna(axis=1)
                # '-----' marks a missing reading; fill it with the column mean.
                df = df.replace(to_replace='-----', value=np.nan)
                df = df.astype('float64')
                df = df.fillna(df.mean().round(2))
                df = df.drop(columns=['NOV', 'DEC', 'ANN'])
                frames.append(df.add_suffix(suffix))
                print(f'Weather data from {self.name.title()} County saved.')
                # Pause between requests to the remote server.
                time.sleep(1.7)
            # One concat over frames that all share the 'Year' index keeps the
            # index name and dtype intact for the cache file.
            wr_cat = pd.concat(frames, axis=1)
            wr_cat.to_csv(cache_csv)
        self.wr = wr_cat
        print(f'Weather added to `{self.name.lower()}.wr`')

    def add_crops(self):
        """Import and clean the ag-survey CSVs and store yields in ``self.crops``.

        Reads '/Data/raw/{crop}_colo.csv' for every crop in CROPS, keeps this
        county's rows and left-merges each crop's 'Yield_{crop}' column onto
        the first crop's frame by 'Year' and 'County'. Years that the first
        crop lacks are therefore not present in the result. Calling the method
        again rebuilds ``self.crops`` from scratch.
        """
        col_to_drop = ['Program', 'Period', 'Week Ending', 'Geo Level', 'State',
                       'State ANSI', 'Zip Code', 'watershed_code',
                       'Watershed', 'Data Item', 'Domain', 'Domain Category',
                       'Region', 'CV (%)']
        # Reset so a repeated call does not merge onto the previous result.
        self.crop_doc_count = 0
        merged = None
        for crop in self.CROPS:
            df = pd.read_csv(file_path + f'/Data/raw/{crop}_colo.csv')
            df = df.drop(columns=col_to_drop)
            # ' (D)' and ' (Z)' are non-numeric placeholders in 'Value'.
            df = df[~df['Value'].isin([' (D)', ' (Z)'])]
            # Strip thousands separators so 'Value' can be cast to int.
            df = df.replace(to_replace=r',', value='', regex=True)
            df['Value'] = df['Value'].astype('int')
            df = df.rename(columns={'Value': f'Yield_{crop}'})
            df['Year'] = pd.to_datetime(df['Year'], format='%Y')
            df = df[df['County'] == self.name]
            crop_yield = df[['Year', 'County', f'Yield_{crop}']]
            if merged is None:
                merged = crop_yield
            else:
                merged = pd.merge(left=merged, right=crop_yield,
                                  how='left', on=['Year', 'County'])
            self.crop_doc_count += 1
            print(f'{self.name.title()} County grows {crop}.')
        self.crops = merged.set_index('Year')
        print(f'Crops added to `{self.name.lower()}.crops`')

    def crop_weather(self):
        """Write one yield-plus-weather CSV per crop with at least 40 yield rows.

        Each crop's non-null yields are left-merged with ``self.wr`` on the
        index and saved to '/Data/inter/{name}_{crop}.csv'; crops with fewer
        than 40 rows are skipped.
        """
        for crop in self.CROPS:
            yields = self.crops[f'Yield_{crop}'].dropna()
            joined = pd.merge(left=yields, right=self.wr,
                              how='left', left_index=True, right_index=True)
            if len(joined) < 40:
                continue
            joined.to_csv(file_path + f'/Data/inter/{self.name}_{crop}.csv')
            print(f'{self.name}_{crop}  ' + str(joined.shape))

    def fit_models(self, thru_month_num=None):
        """Fit regressors on weather through a given month and keep the useful ones.

        For every crop that has a '/Data/inter/{name}_{crop}.csv' file, the
        weather columns for months after ``thru_month_num`` are dropped, the
        data is split 80/20 and ten regressors are fitted. Fits whose test-set
        r2 score exceeds MIN_R2 (0.02) are appended to ``self.models``.

        Args:
            thru_month_num: Last month (4 = April ... 10 = October) whose
                weather is used as a predictor. Defaults to the month before
                the current one, evaluated when the method is called.

        Raises:
            ValueError: If ``thru_month_num`` is not an integer from 4 to 10.
        """
        if thru_month_num is None:
            # Resolved per call; a default in the signature would be frozen at
            # the moment the class was defined.
            thru_month_num = datetime.now().month - 1
        if thru_month_num not in range(4, 11):
            raise ValueError(
                'thru_month_num must be an integer from 4 (April) to 10 '
                f'(October), got {thru_month_num!r}')
        thru_month_num = int(thru_month_num)
        later_months = self.SEASON_MONTHS[thru_month_num - 4:]
        cols_to_drop = [month + suffix
                        for month in later_months
                        for suffix in self.WEATHER_SERIES]
        regressors = [LinearRegression(), Ridge(), RidgeCV(), Lasso(),
                      ElasticNet(), BayesianRidge(), RandomForestRegressor(),
                      GradientBoostingRegressor(), DecisionTreeRegressor(), SVR()]
        kept_rows = []
        for crop in self.CROPS:
            crop_csv = file_path + f'/Data/inter/{self.name}_{crop}.csv'
            if not os.path.exists(crop_csv):
                continue
            df = pd.read_csv(crop_csv, index_col='Year')
            df = df.drop(columns=cols_to_drop)
            # The first column is the yield written by crop_weather(); the
            # remaining columns are the weather predictors.
            X = df.iloc[:, 1:]
            y = df.iloc[:, 0]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=314)
            print(f'{crop} modeled...')
            for reg in regressors:
                reg.fit(X_train, y_train)
                y_pred = reg.predict(X_test)
                # np.round works whether the metric comes back as a NumPy
                # scalar or as a plain Python float.
                rmse = np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
                r2 = np.round(r2_score(y_test, y_pred), 3)
                if r2 > self.MIN_R2:
                    kept_rows.append({'crop': crop,
                                      'thru_month': thru_month_num,
                                      'model': type(reg).__name__,
                                      'r2_score': r2,
                                      'rmse': rmse})
        if kept_rows:
            # DataFrame.append no longer exists in pandas 2.x; add all new
            # rows with a single concat instead.
            new_models = pd.DataFrame(kept_rows, columns=self.models.columns)
            if self.models.empty:
                self.models = new_models
            else:
                self.models = pd.concat([self.models, new_models],
                                        ignore_index=True)
        print('Fit_models finished.')

    def fit_all_models(self):
        """Run fit_models() for April through October and save the model list.

        The accumulated ``self.models`` table is written to
        '/Data/proc/{name}_model_list.csv'.
        """
        for month_num in range(4, 11):
            self.fit_models(thru_month_num=month_num)
        self.models.to_csv(file_path + f'/Data/proc/{self.name}_model_list.csv')
        print('April through October models finished.')


In [None]:
# Mount Google Drive in the Colab runtime so the project folder is reachable.
drive.mount('/content/drive')
# Project root on the mounted drive; the County_AgWr methods read and write
# under file_path + '/Data/...'.
file_path = '/content/drive/My Drive/Data Science Certificate Program - May 2020/Students/Capstone/Chris'

To start, we need to choose a county and pass its name as an upper-case string to `County_AgWr('NAME')`:

In [None]:
# List the county names available in counties_ag.csv; one of these values is
# what gets passed to County_AgWr(...).
counties = pd.read_csv(file_path + '/Data/raw/counties_ag.csv')
choose_county = list(counties['County'])
choose_county

In [None]:
# Show what is already in the folder where fit_all_models() writes its
# '{name}_model_list.csv' output.
os.listdir(file_path + '/Data/proc')

In [None]:
# Create the county object; the name has to match the 'County' column values
# in the input CSVs.
boulder = County_AgWr('BOULDER')
boulder.name

In [None]:
# Load the weather table from the cached CSV, or scrape it if no cache exists.
boulder.add_weather()

In [None]:
# Preview the first rows of the weather table.
boulder.wr.head()

In [None]:
# Read the per-crop survey CSVs and merge this county's yields into one table.
boulder.add_crops()

In [None]:
# Preview the first rows of the merged crop-yield table.
boulder.crops.head()

In [None]:
# Join yields with weather and write one CSV per crop that has enough rows.
boulder.crop_weather()

In [None]:
# Fit every regressor for thru_month_num 4 through 10 and save the model list.
boulder.fit_all_models()

In [None]:
# Display the fits that passed the r2 threshold.
boulder.models

In [None]:
# List the processed-data folder again; fit_all_models() writes
# '{name}_model_list.csv' here.
os.listdir(file_path + '/Data/proc')

In [None]:
# List the intermediate folder, where crop_weather() writes its
# '{name}_{crop}.csv' files.
os.listdir(file_path + '/Data/inter')