# Importing

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Adding dataset

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
df_raw = pd.read_csv('yield_df.csv')

# Visualizing and describing data

In [None]:
# Preview the first rows of the raw data.
df_raw.head()

In [None]:
# Preview the last rows of the raw data.
df_raw.tail()

In [None]:
# Summary statistics for the 'Area' column.
df_raw['Area'].describe()

In [None]:
# Summary statistics for the 'Item' column.
df_raw['Item'].describe()

In [None]:
# Summary statistics for the 'Year' column.
df_raw['Year'].describe()

In [None]:
# Summary statistics for the target column 'hg/ha_yield'.
df_raw['hg/ha_yield'].describe()

In [None]:
# Summary statistics for the 'average_rain_fall_mm_per_year' column.
df_raw['average_rain_fall_mm_per_year'].describe()

In [None]:
# Summary statistics for the 'pesticides_tonnes' column.
df_raw['pesticides_tonnes'].describe()

In [None]:
# Summary statistics for the 'avg_temp' column.
df_raw['avg_temp'].describe()

# Data Cleaning

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved row index -- confirm
# against the CSV). errors='ignore' keeps this cell re-runnable: without it,
# a second run raises KeyError because the column is already gone.
df_raw.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [None]:
# (rows, columns) of the raw data after removing the extra column.
df_raw.shape

In [None]:
# Discard every row that contains a missing value, keeping the row counts
# from before and after so the number of removed rows can be reported.
rows_before = len(df_raw)
df = df_raw.dropna()
rows_after = len(df)

In [None]:
# Number of rows removed from the dataset by dropna().
rows_before-rows_after

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Missing values remaining per column.
df.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicated entries from the dataset.
# Rebind instead of using inplace=True: df was derived from df_raw via
# dropna(), and in-place edits on a derived frame can emit
# SettingWithCopyWarning in pandas versions that track copies.
df = df.drop_duplicates()

In [None]:
# Re-count duplicated rows to check the result of the de-duplication step.
df.duplicated().sum()

# Transforming Average yearly rainfall
The code below finds the index labels of the rows in `df` whose 'average_rain_fall_mm_per_year' value cannot be converted to a number (for example, a stray non-numeric string). Those rows are then dropped, and the column is cast to float.

In [None]:
def isStr(obj):
    """Return True when ``obj`` cannot be converted to a float.

    Despite the name, this does not test for ``str``: a numeric string such
    as ``"12.5"`` yields False, while ``None`` or ``"abc"`` yield True.

    Args:
        obj: Any value; it is passed to ``float()``.

    Returns:
        False if ``float(obj)`` succeeds, True if the conversion fails.
    """
    try:
        float(obj)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not numeric". A bare ``except`` would
        # also swallow KeyboardInterrupt and unrelated bugs.
        return True
    return False
# Index labels of the rows whose rainfall value fails the float conversion.
to_drop = df.loc[
    df['average_rain_fall_mm_per_year'].apply(isStr)
].index

In [None]:
# Remove the rows flagged in to_drop (labels refer to the row index).
df = df.drop(index=to_drop)

In [None]:
# Display the frame after removing the flagged rows.
df

In [None]:
# Cast the rainfall column to float64 now that non-convertible rows are gone.
df['average_rain_fall_mm_per_year'] = df['average_rain_fall_mm_per_year'].astype(np.float64)

# Graph Frequency vs Area

In [None]:
# Number of distinct values in the 'Area' column.
len(df['Area'].unique())

In [None]:
# Horizontal count plot: one bar per 'Area' value showing how many rows it has.
plt.figure(figsize=(15, 20))
sns.countplot(data=df, y='Area')
plt.show()

In [None]:
# Number of 'Area' values that occur in fewer than 500 rows.
(df['Area'].value_counts() < 500).sum()

# Yield per Country

In [None]:
# Total 'hg/ha_yield' per 'Area', listed in the same order as `country`
# (order of first appearance in df).
# One groupby pass replaces re-filtering the whole frame once per country.
# df contains no missing 'Area' values here (rows with NaN were dropped
# earlier), so every entry of `country` is a key of the grouped totals.
country = df['Area'].unique()
yield_totals = df.groupby('Area', sort=False)['hg/ha_yield'].sum()
yield_per_country = [yield_totals.loc[area] for area in country]


In [None]:
# Sum of 'hg/ha_yield' over the whole frame.
df['hg/ha_yield'].sum()

In [None]:
# Display the per-country totals computed above.
yield_per_country

# Yield Per Country Graph

In [None]:
# Horizontal bar chart: one bar per country, bar length = summed yield.
plt.figure(figsize=(15, 20))
sns.barplot(y=country, x=yield_per_country)

# Graph Frequency vs Item

In [None]:
# Horizontal count plot: number of rows per 'Item' value.
sns.countplot(y=df['Item'])

# Yield Vs Item

In [None]:
# Total 'hg/ha_yield' per 'Item', listed in the same order as `crops`
# (order of first appearance in df).
# One groupby pass replaces re-filtering the whole frame once per crop.
# df contains no missing 'Item' values here (rows with NaN were dropped
# earlier), so every entry of `crops` is a key of the grouped totals.
crops = df['Item'].unique()
crop_totals = df.groupby('Item', sort=False)['hg/ha_yield'].sum()
yield_per_crop = [crop_totals.loc[crop_name] for crop_name in crops]

In [None]:
# Horizontal bar chart: one bar per crop, bar length = summed yield.
sns.barplot(y=crops,x=yield_per_crop)

# Rearranging Columns and Train/Test Split

In [None]:
# Reorder the frame so the four numeric features come first, then the two
# categorical features, and the target ('hg/ha_yield') comes last.
col = [
    'Year',
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
    'Area',
    'Item',
    'hg/ha_yield',
]
df = df[col]
X = df[col[:-1]]  # features: every column except the last
y = df[col[-1]]   # target: the last column

In [None]:
# Preview the frame with its reordered columns.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Shuffled 80/20 train/test split; random_state=0 makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0, shuffle=True)

# Converting Categorical to Numerical and Scaling the values

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# drop='first' omits the first category of each encoded feature.
ohe = OneHotEncoder(drop='first')
scale = StandardScaler()

# Columns are addressed by position, so this depends on the feature order
# established when X was built: Year, rainfall, pesticides, temperature
# (0-3), then Area and Item (4-5).
# NOTE(review): handle_unknown is left at its default, so an Area/Item value
# not seen during fit is expected to raise at transform time -- confirm that
# is the intended behavior for user-supplied inputs.
preprocesser = ColumnTransformer(
        transformers = [
            ('StandardScale', scale, [0, 1, 2, 3]),  # standardize the numeric features
            ('OHE', ohe, [4, 5]),  # one-hot encode the categorical features
        ],
        remainder='passthrough'
)

In [None]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to the test features.
X_train_dummy = preprocesser.fit_transform(X_train)
X_test_dummy = preprocesser.transform(X_test)

In [None]:
# Names of the columns produced by the fitted preprocessor.
preprocesser.get_feature_names_out(col[:-1])

# Let's train our model

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,r2_score


# Candidate regressors, keyed by the short label used in the printed report.
models = {
    'lr': LinearRegression(),
    'lss': Lasso(),
    'Rid': Ridge(),
    # Seeded so the tree's reported score is repeatable across runs, in line
    # with the seeded train/test split.
    'Dtr': DecisionTreeRegressor(random_state=0),
}

# Fit each candidate on the training set and report its mean absolute error
# and R^2 score on the held-out test set.
for name, md in models.items():
    md.fit(X_train_dummy, y_train)
    y_pred = md.predict(X_test_dummy)

    print(f"{name} : mae : {mean_absolute_error(y_test,y_pred)} score : {r2_score(y_test,y_pred)}")

# Select model

In [None]:
# Train the selected model (decision tree) on the full training set.
# random_state is fixed so that re-running the notebook rebuilds the same
# tree, and therefore the same pickled model.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(X_train_dummy, y_train)
# Predictions on the held-out test set (displayed as the cell output).
dtr.predict(X_test_dummy)

# Predictive System

In [None]:
def prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item):
    """Predict the yield for a single observation.

    Relies on the module-level ``preprocesser`` (fitted ColumnTransformer)
    and ``dtr`` (fitted regressor).

    Args:
        Year: Value for the 'Year' feature.
        average_rain_fall_mm_per_year: Value for the rainfall feature.
        pesticides_tonnes: Value for the pesticide feature.
        avg_temp: Value for the temperature feature.
        Area: Value for the 'Area' categorical feature.
        Item: Value for the 'Item' categorical feature.

    Returns:
        A one-element NumPy array holding the predicted value.
    """
    # Build a one-row DataFrame with the training column names and order.
    # The preprocessor was fitted on a DataFrame, so a nameless object-dtype
    # ndarray triggers scikit-learn's "X does not have valid feature names"
    # warning and turns the numeric inputs into generic objects.
    features = pd.DataFrame(
        {
            'Year': [Year],
            'average_rain_fall_mm_per_year': [average_rain_fall_mm_per_year],
            'pesticides_tonnes': [pesticides_tonnes],
            'avg_temp': [avg_temp],
            'Area': [Area],
            'Item': [Item],
        }
    )

    # Apply the same scaling / one-hot encoding used during training.
    transformed_features = preprocesser.transform(features)

    # Reshape to a single row so indexing [0] yields that row's prediction(s).
    predicted_yield = dtr.predict(transformed_features).reshape(1, -1)

    return predicted_yield[0]


In [None]:
# Example input: one observation, passed to prediction() in feature order.
Year = 1990
average_rain_fall_mm_per_year =1485.0
pesticides_tonnes = 121.00
avg_temp = 16.37                   
Area = 'Albania'
Item = 'Wheat'
result = prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item)

In [None]:
# Display the value returned by prediction() for the example input.
result

# Pickle Files

In [None]:
import pickle

# Persist the fitted model and the fitted preprocessor to disk.
# `with` closes each file as soon as the dump finishes, so the data is
# flushed and no handle is left open (the bare open() calls never closed).
with open('dtr.pkl', 'wb') as model_file:
    pickle.dump(dtr, model_file)
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocesser, preprocessor_file)

In [None]:
import sklearn
# Print the installed scikit-learn version.
# NOTE(review): presumably recorded so the pickles can be loaded with a
# matching version later -- confirm.
print(sklearn.__version__)