In [1]:
import pandas as pd

CORN = '/kaggle/input/corn-farming-data/corn_data.csv'

# Columns removed from the frame immediately after loading.
dropped_columns = ['County', 'Crop', 'Power source', 'Water source', 'Crop insurance']

# Read the CSV, discard the columns listed above, and preview the first rows.
df = pd.read_csv(CORN).drop(columns=dropped_columns)
df.head()

Unnamed: 0,Farmer,Education,Gender,Age bracket,Household size,Acreage,Fertilizer amount,Laborers,Yield,Main credit source,Farm records,Main advisory source,Extension provider,Advisory format,Advisory language,Latitude,Longitude
0,fmr_65,Certificate,Male,36-45,7,2.0,50,2,300,Credit groups,Yes,Radio,Private Provider,Phone Calls,Vernacular,-3.46,38.35
1,fmr_77,Certificate,Male,36-45,7,0.25,50,2,270,Credit groups,Yes,Radio,County Government,SMS text,Kiswahili,-3.31,38.4
2,fmr_89,Certificate,Male,36-45,7,3.0,251,2,270,Credit groups,Yes,Radio,Private Provider,Phone Calls,Vernacular,-3.41,38.37
3,fmr_102,Certificate,Male,36-45,7,1.5,300,3,200,Credit groups,Yes,Radio,County Government,SMS text,Kiswahili,-3.39,38.37
4,fmr_25,Certificate,Male,46-55,3,,50,2,180,Credit groups,Yes,Radio,Private Provider,Phone Calls,Vernacular,-3.39,38.33


In [2]:
# Summarize each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Farmer                422 non-null    object 
 1   Education             396 non-null    object 
 2   Gender                422 non-null    object 
 3   Age bracket           422 non-null    object 
 4   Household size        422 non-null    int64  
 5   Acreage               351 non-null    float64
 6   Fertilizer amount     422 non-null    int64  
 7   Laborers              422 non-null    int64  
 8   Yield                 422 non-null    int64  
 9   Main credit source    422 non-null    object 
 10  Farm records          422 non-null    object 
 11  Main advisory source  422 non-null    object 
 12  Extension provider    422 non-null    object 
 13  Advisory format       422 non-null    object 
 14  Advisory language     422 non-null    object 
 15  Latitude              4

In [3]:
from plotly.express import histogram
# Numeric columns whose distributions are plotted, one histogram each.
histogram_columns = ['Household size', 'Acreage', 'Fertilizer amount', 'Yield']
for column in histogram_columns:
    figure = histogram(data_frame=df, x=column)
    figure.show()

In [4]:
# List the column names available in the loaded frame.
df.columns

Index(['Farmer', 'Education', 'Gender', 'Age bracket', 'Household size',
       'Acreage', 'Fertilizer amount', 'Laborers', 'Yield',
       'Main credit source', 'Farm records', 'Main advisory source',
       'Extension provider', 'Advisory format', 'Advisory language',
       'Latitude', 'Longitude'],
      dtype='object')

In [5]:
from plotly.express import scatter_geo
# Categorical columns to visualize geographically, one map per column.
geo_columns = [
    'Education',
    'Gender',
    'Age bracket',
    'Main credit source',
    'Farm records',
    'Main advisory source',
    'Extension provider',
    'Advisory format',
    'Advisory language',
]
for column in geo_columns:
    # Keep only the fields needed for the map and drop exact duplicate rows.
    locations = df[['Farmer', column, 'Latitude', 'Longitude']].drop_duplicates(ignore_index=True)
    scatter_geo(
        data_frame=locations,
        lat='Latitude',
        lon='Longitude',
        fitbounds='locations',
        hover_name='Farmer',
        color=column,
    ).show()

In [6]:
# Count the distinct values in each column.
df.nunique()

Farmer                  422
Education                 5
Gender                    2
Age bracket               5
Household size            9
Acreage                  10
Fertilizer amount        33
Laborers                  6
Yield                    34
Main credit source        3
Farm records              2
Main advisory source      5
Extension provider        4
Advisory format           2
Advisory language         3
Latitude                 36
Longitude                25
dtype: int64

In [7]:
from sklearn.manifold import TSNE
# Columns fed to t-SNE; the order matches the feature matrix used previously.
tsne_features = [
    'Household size',
    'Fertilizer amount',
    'Laborers',
    'Latitude',
    'Longitude',
    'Acreage',
]
tsne = TSNE(n_components=2, init='pca', verbose=1, random_state=2024)
mean_acreage = df['Acreage'].dropna().mean()

# De-duplicate on the feature columns only, but carry Yield through the same
# operation so each surviving row keeps its own label. De-duplicating the
# features alone and then assigning df['Yield'] by index pairs every row after
# a dropped duplicate with the wrong yield (the embedding has fewer rows than df).
tsne_input = df[tsne_features + ['Yield']].drop_duplicates(subset=tsne_features, ignore_index=True)

# Missing feature values are replaced by the mean acreage, applied to every
# feature column as before.
# NOTE(review): Acreage is the only feature with missing values in the visible
# df.info() output -- confirm Latitude/Longitude have none.
tsne_matrix = tsne_input[tsne_features].fillna(value=mean_acreage)

tsne_df = pd.DataFrame(data=tsne.fit_transform(X=tsne_matrix), columns=['t0', 't1'])
# Both frames share the same 0..n-1 index, so these assignments are row-aligned.
tsne_df['yield'] = tsne_input['Yield']
tsne_df['acreage'] = tsne_matrix['Acreage']


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 421 samples in 0.001s...
[t-SNE] Computed neighbors for 421 samples in 0.009s...
[t-SNE] Computed conditional probabilities for sample 421 / 421
[t-SNE] Mean sigma: 1.392876
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.316269
[t-SNE] KL divergence after 1000 iterations: 0.127584


In [8]:
from plotly.express import scatter
# Plot the 2-D embedding: color encodes yield, marker size encodes acreage.
embedding_figure = scatter(
    data_frame=tsne_df,
    x='t0',
    y='t1',
    color='yield',
    size='acreage',
)
embedding_figure

Judging from this plot, it seems unlikely that we have enough data to predict yield, or that adding the non-numerical features would make a model predictive.

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Predictors are Longitude and Acreage, with gaps filled by the mean acreage.
regression_features = df[['Longitude', 'Acreage']].fillna(value=mean_acreage)
regression_target = df['Yield'].values

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    regression_features,
    regression_target,
    test_size=0.2,
    random_state=2024,
)

model = LinearRegression()
model.fit(X=X_train, y=y_train)

# Predict once and reuse the result for both the score and the plot.
test_predictions = model.predict(X=X_test)
print(r2_score(y_true=y_test, y_pred=test_predictions))
scatter(x=y_test, y=test_predictions).show()

0.27138188769100513


This choice of variables maximizes our $R^2$ score on the held-out test set (about 0.27), which is poor but consistent with our exploration using t-SNE above.