In [55]:
import pickle
import os
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [56]:
# Set up the different data paths.
# ROOT defaults to the original author's local checkout. Set the
# AG_CENSUS_ROOT environment variable to point the notebook at a checkout
# in a different location without having to edit this cell.
ROOT = os.environ.get(
    "AG_CENSUS_ROOT",
    "c:/Users/summe/Git Repos/ml-agricultural-census-utah",
)
DATA_DIR = os.path.join(ROOT, "initial_analysis")
MODEL_EVAL_DIR = os.path.join(ROOT, "ml_model_finetuning")
MODEL_DIR = os.path.join(MODEL_EVAL_DIR, "models")
DATASET_PREDICT = os.path.join(DATA_DIR, "wheat_prod_unlabeled.csv")
DATASET_LABELED = os.path.join(DATA_DIR, "wheat_prod_labeled.csv")
XGB_MODEL = os.path.join(MODEL_DIR, "agricultural_census_model_xgb.pkl")

# Show the current paths so a wrong ROOT is obvious before any file is read
print("ROOT: ", ROOT)
print("DATA_DIR: ", DATA_DIR)
print("DATASET_PREDICT: ", DATASET_PREDICT)
print("DATASET_LABELED: ", DATASET_LABELED)
print("MODEL_DIR: ", MODEL_DIR)
print("XGB_MODEL: ", XGB_MODEL)

ROOT:  c:/Users/summe/Git Repos/ml-agricultural-census-utah
DATA_DIR:  c:/Users/summe/Git Repos/ml-agricultural-census-utah\initial_analysis
DATASET_PREDICT:  c:/Users/summe/Git Repos/ml-agricultural-census-utah\initial_analysis\wheat_prod_unlabeled.csv
DATASET_LABELED:  c:/Users/summe/Git Repos/ml-agricultural-census-utah\initial_analysis\wheat_prod_labeled.csv
MODEL_DIR:  c:/Users/summe/Git Repos/ml-agricultural-census-utah\ml_model_finetuning\models
XGB_MODEL:  c:/Users/summe/Git Repos/ml-agricultural-census-utah\ml_model_finetuning\models\agricultural_census_model_xgb.pkl


In [57]:
# preview the prediction dataset
# Loads the unlabeled wheat-production CSV into `df`; the cells below keep
# working on (and modifying) this same frame.
df = pd.read_csv(DATASET_PREDICT)
df.head()

Unnamed: 0,year,county,countyansi,wheatprod,pop,wheatprice,wheatprice_l1,wheatprice_l2,precipitation,maximum_temperature,...,county23,county24,county25,county26,county27,year1,year2,year3,year4,year5
0,2002,beaver,1,,6058,4.65,3.3,3.25,7.8,64.400002,...,0,0,0,0,0,1,0,0,0,0
1,2007,beaver,1,,6266,8.3,4.85,3.8,9.84,64.599998,...,0,0,0,0,0,0,1,0,0,0
2,2017,beaver,1,,6402,5.2,4.3,5.18,11.37,65.400002,...,0,0,0,0,0,0,0,0,1,0
3,2022,beaver,1,,7327,9.15,7.1,5.43,10.76,64.5,...,0,0,0,0,0,0,0,0,0,1
4,2002,carbon,7,,20114,4.65,3.3,3.25,9.84,58.0,...,0,0,0,0,0,1,0,0,0,0


In [58]:
# show the column names
# (bare expression on the last line, so the notebook displays the Index)
df.columns

Index(['year', 'county', 'countyansi', 'wheatprod', 'pop', 'wheatprice',
       'wheatprice_l1', 'wheatprice_l2', 'precipitation',
       'maximum_temperature', 'palmer_modified_drought_index_pm',
       'average_temperature', 'palmer_zindex', 'minimum_temperature',
       'palmer_hydrological_drought_inde', 'palmer_drought_severity_index_pd',
       'heating_degree_days', 'cooling_degree_days', 'precipitation_l1',
       'precipitation_l2', 'allindustrytotal', 'privateindustries',
       'agricultureforestryfishingandhun', 'miningquarryingandoilandgasextra',
       'utilities', 'construction', 'manufacturing',
       'durablegoodsmanufacturing', 'nondurablegoodsmanufacturing',
       'wholesaletrade', 'retailtrade', 'transportationandwarehousing',
       'information', 'financeinsurancerealestaterental',
       'financeandinsurance', 'realestateandrentalandleasing',
       'professionalandbusinessservices', 'professionalscientificandtechnic',
       'managementofcompaniesandenterpri',

In [59]:
# print the distinct values of the wheat production column and of its
# log-transformed counterpart, one array per line
for target_col in ('wheatprod', 'logwheatprod'):
    print(df[target_col].unique())


[nan]
[nan]


In [60]:
# count the nulls in every column, then report only the columns that
# contain at least one missing value
null_counts = df.isna().sum()
missing = null_counts[null_counts > 0]
print(missing)

wheatprod       36
logwheatprod    36
dtype: int64


In [61]:
# keep the county names aside: the column is dropped from df in the next
# cell and re-attached once the predictions have been added
county = df['county']

In [62]:
# remove the two all-NaN production columns and the county name column,
# then preview what is left
columns_to_drop = ['logwheatprod', 'wheatprod', 'county']
df = df.drop(labels=columns_to_drop, axis='columns')
df.head()

Unnamed: 0,year,countyansi,pop,wheatprice,wheatprice_l1,wheatprice_l2,precipitation,maximum_temperature,palmer_modified_drought_index_pm,average_temperature,...,county23,county24,county25,county26,county27,year1,year2,year3,year4,year5
0,2002,1,6058,4.65,3.3,3.25,7.8,64.400002,-3.04,49.700001,...,0,0,0,0,0,1,0,0,0,0
1,2007,1,6266,8.3,4.85,3.8,9.84,64.599998,-1.04,50.0,...,0,0,0,0,0,0,1,0,0,0
2,2017,1,6402,5.2,4.3,5.18,11.37,65.400002,-2.19,51.299999,...,0,0,0,0,0,0,0,0,1,0
3,2022,1,7327,9.15,7.1,5.43,10.76,64.5,0.35,50.0,...,0,0,0,0,0,0,0,0,0,1
4,2002,7,20114,4.65,3.3,3.25,9.84,58.0,-0.96,44.799999,...,0,0,0,0,0,1,0,0,0,0


In [63]:
# create feature matrix
# Build X as its own frame instead of an alias of df. With `X = df`, the
# `wheatprod` column written at the end of this cell (and `county`, which is
# re-attached in the following cell) also appeared in X, so re-running this
# cell handed an extra column and a string column to the scaler and model.
# errors='ignore' keeps the first run (where neither column exists) unchanged.
X = df.drop(columns=['wheatprod', 'county'], errors='ignore')

# scaled features
# NOTE(review): this fits a new StandardScaler on the prediction data itself.
# If the model was trained on features scaled with training-set statistics,
# that fitted scaler should be persisted and reused here (transform only);
# confirm against the training notebook.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# load the model
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open(XGB_MODEL, 'rb') as file:
    model = pickle.load(file)

# make predictions
y_pred = model.predict(X_scaled)

# add the predictions to the dataframe
# NOTE(review): stored under 'wheatprod'; confirm the model was trained to
# predict wheatprod and not logwheatprod.
df['wheatprod'] = y_pred

In [64]:
# re-attach the county names that were saved before the column was dropped
# (assigning a Series aligns its values on the row index)
df['county'] = county