In [1]:
#Imports 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
#Load the crop-yield + SOI table from the CSV file (path is relative to the notebook's working directory)
soiCropYield = pd.read_csv(filepath_or_buffer='DATA/crop_yield_plus_soi.csv')

In [3]:
#Preview the first rows of the freshly loaded table
soiCropYield.head()

Unnamed: 0,Year,Corn Yield,Cotton Yield,Rice Yield,Soybean Yield,Wheat Yield,Yield Units,DJFM Mean SOI
0,2023,147.0,41.0,167.0,45.0,70.0,bu / acre,1.28
1,2022,161.0,39.0,176.0,45.5,60.0,bu / acre,1.23
2,2021,159.0,39.0,179.0,49.0,65.0,bu / acre,1.4
3,2020,171.0,36.0,161.0,51.0,62.0,bu / acre,-0.15
4,2019,155.0,37.0,164.0,46.0,63.0,bu / acre,-0.17


In [4]:
#Add name of SOI phase to dataframe: SOI phase
#Negative DJFM mean SOI -> 'El Nino', positive -> 'La Nina', anything else -> 'Neutral'.
#Vectorized with np.select instead of a Python loop over chained-indexed scalars
#(soiCropYield['DJFM Mean SOI'][i]), which is slow and depends on the index labels.
#A missing (NaN) SOI fails both comparisons and therefore falls through to 'Neutral',
#exactly as it did in the original if/elif/else chain.
djfm_soi = soiCropYield['DJFM Mean SOI'].to_numpy()

#.tolist() keeps `phase` a plain Python list of str, as before
phase = np.select(
    [djfm_soi < 0, djfm_soi > 0],
    ['El Nino', 'La Nina'],
    default='Neutral',
).tolist()

soiCropYield['SOI phase'] = phase

In [5]:
#Preview the table again to check the 'SOI phase' column
soiCropYield.head()

Unnamed: 0,Year,Corn Yield,Cotton Yield,Rice Yield,Soybean Yield,Wheat Yield,Yield Units,DJFM Mean SOI,SOI phase
0,2023,147.0,41.0,167.0,45.0,70.0,bu / acre,1.28,La Nina
1,2022,161.0,39.0,176.0,45.5,60.0,bu / acre,1.23,La Nina
2,2021,159.0,39.0,179.0,49.0,65.0,bu / acre,1.4,La Nina
3,2020,171.0,36.0,161.0,51.0,62.0,bu / acre,-0.15,El Nino
4,2019,155.0,37.0,164.0,46.0,63.0,bu / acre,-0.17,El Nino


In [6]:
#Count the rows in each SOI phase
soiCropYield['SOI phase'].value_counts()

La Nina    40
El Nino    30
Neutral     2
Name: SOI phase, dtype: int64

In [7]:
#Create dummy features for categorical variables
#drop_first=True drops the first category level so the remaining dummy columns are not
#perfectly collinear.
#dtype is pinned to 'uint8' so the dummies are 0/1 integers on every pandas version
#(the default changed from uint8 to bool in pandas 2.0).
soiCropYield_dummies = pd.get_dummies(
    soiCropYield,
    columns=['SOI phase'],
    prefix='Phase',
    drop_first=True,
    dtype='uint8',
)

In [8]:
#Preview the dummy-encoded table
soiCropYield_dummies.head()

Unnamed: 0,Year,Corn Yield,Cotton Yield,Rice Yield,Soybean Yield,Wheat Yield,Yield Units,DJFM Mean SOI,Phase_La Nina,Phase_Neutral
0,2023,147.0,41.0,167.0,45.0,70.0,bu / acre,1.28,1,0
1,2022,161.0,39.0,176.0,45.5,60.0,bu / acre,1.23,1,0
2,2021,159.0,39.0,179.0,49.0,65.0,bu / acre,1.4,1,0
3,2020,171.0,36.0,161.0,51.0,62.0,bu / acre,-0.15,0,0
4,2019,155.0,37.0,164.0,46.0,63.0,bu / acre,-0.17,0,0


In [9]:
#Standardize the magnitude of numeric features using a scaler
#First step: inspect the column dtypes (the float columns are the ones selected for scaling further down)
soiCropYield_dummies.dtypes

Year               int64
Corn Yield       float64
Cotton Yield     float64
Rice Yield       float64
Soybean Yield    float64
Wheat Yield      float64
Yield Units       object
DJFM Mean SOI    float64
Phase_La Nina      uint8
Phase_Neutral      uint8
dtype: object

In [10]:
#Count the rows where the 'Phase_La Nina' dummy is 1 vs. 0
print(soiCropYield_dummies["Phase_La Nina"].value_counts())

1    40
0    32
Name: Phase_La Nina, dtype: int64


In [11]:
#Count the rows where the 'Phase_Neutral' dummy is 1 vs. 0
print(soiCropYield_dummies["Phase_Neutral"].value_counts())

0    70
1     2
Name: Phase_Neutral, dtype: int64


In [12]:
#Dimensions (rows, columns) of the dummy-encoded table
print(soiCropYield_dummies.shape)

(72, 10)


In [13]:
#Keep only the floating-point columns; integer columns and the uint8 dummies are left out
numeric_columns = soiCropYield_dummies.select_dtypes(include="float")

In [14]:
#Preview the first five rows of the float-only subset
numeric_columns.head(5)

Unnamed: 0,Corn Yield,Cotton Yield,Rice Yield,Soybean Yield,Wheat Yield,DJFM Mean SOI
0,147.0,41.0,167.0,45.0,70.0,1.28
1,161.0,39.0,176.0,45.5,60.0,1.23
2,159.0,39.0,179.0,49.0,65.0,1.4
3,171.0,36.0,161.0,51.0,62.0,-0.15
4,155.0,37.0,164.0,46.0,63.0,-0.17


In [15]:
#Dimensions (rows, columns) of the float-only subset
print(numeric_columns.shape)

(72, 6)


In [16]:
#Keep the column labels of the float-only subset; they are re-attached to the scaled values later
feature_names = numeric_columns.columns
print(feature_names)

Index(['Corn Yield', 'Cotton Yield', 'Rice Yield', 'Soybean Yield',
       'Wheat Yield', 'DJFM Mean SOI'],
      dtype='object')


In [17]:
#Standardize the float columns: fit_transform fits the scaler on the data and
#returns the transformed values in a single call
scaler = StandardScaler()
numeric_columns_scaled = scaler.fit_transform(numeric_columns)

In [18]:
#Wrap the scaled values back into a DataFrame with the original column labels.
#The source index is carried over explicitly: without it the new frame gets a fresh 0..n-1 index,
#and the later index-aligned pd.concat(..., axis=1) with the dummy columns would misalign rows
#(and introduce NaNs) whenever the source index is not already 0..n-1.
df_numeric_columns_scaled = pd.DataFrame(
    numeric_columns_scaled,
    columns=feature_names,
    index=numeric_columns.index,
)

In [19]:
#Preview the scaled values
df_numeric_columns_scaled.head()

Unnamed: 0,Corn Yield,Cotton Yield,Rice Yield,Soybean Yield,Wheat Yield,DJFM Mean SOI
0,1.208568,2.158421,1.66079,1.507605,2.25347,1.125658
1,1.562553,1.931053,1.944596,1.562845,1.432576,1.073604
2,1.511983,1.931053,2.039198,1.949528,1.843023,1.250586
3,1.815398,1.59,1.471586,2.17049,1.596754,-0.363073
4,1.410845,1.703684,1.566188,1.618086,1.678844,-0.383895


In [20]:
#Pull out the two SOI-phase dummy columns (all rows) as their own frame
df_dummies = soiCropYield_dummies.loc[:, ["Phase_La Nina", "Phase_Neutral"]]

In [21]:
#Preview the dummy-only frame
df_dummies.head()

Unnamed: 0,Phase_La Nina,Phase_Neutral
0,1,0
1,1,0
2,1,0
3,0,0
4,0,0


In [22]:
#Place the scaled float columns and the phase dummies side by side (rows are matched on the index)
df_concat = pd.concat(
    [df_numeric_columns_scaled, df_dummies],
    axis="columns",
)

In [23]:
#Preview the combined frame of scaled features and phase dummies
df_concat.head()

Unnamed: 0,Corn Yield,Cotton Yield,Rice Yield,Soybean Yield,Wheat Yield,DJFM Mean SOI,Phase_La Nina,Phase_Neutral
0,1.208568,2.158421,1.66079,1.507605,2.25347,1.125658,1,0
1,1.562553,1.931053,1.944596,1.562845,1.432576,1.073604,1,0
2,1.511983,1.931053,2.039198,1.949528,1.843023,1.250586,1,0
3,1.815398,1.59,1.471586,2.17049,1.596754,-0.363073,0,0
4,1.410845,1.703684,1.566188,1.618086,1.678844,-0.383895,0,0


In [None]:
#Split data into training and testing datasets
#In my Capstone Two project, I do not have a target variable.  Consequently, splitting the data into training
#and testing datasets does not make any sense.  See my "Capstone Two - EDA" Jupyter notebook for further insight
#into the scope of my project.  Do I need to revamp my initial project?
#
#NOTE: I do know how to split data into training and testing datasets
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     where, X is a DataFrame/matrix of features 
#            y is a Series/vector of target variable 
