### Crop yield data preprocessing

#### Importing the libraries 

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

#### Importing the dataset 

In [2]:
# Load the raw crop records; 'apy.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
crop_df = pd.read_csv('apy.csv')

#### Understanding the data 

In [3]:
# (rows, columns) of the raw frame
crop_df.shape

(246091, 7)

In [4]:
# Summary statistics for the numeric columns (Crop_Year, Area, Production)
crop_df.describe()

Unnamed: 0,Crop_Year,Area,Production
count,246091.0,246091.0,242361.0
mean,2005.643018,12002.82,582503.4
std,4.952164,50523.4,17065810.0
min,1997.0,0.04,0.0
25%,2002.0,80.0,88.0
50%,2006.0,582.0,729.0
75%,2010.0,4392.0,7023.0
max,2015.0,8580100.0,1250800000.0


In [5]:
# Preview the first five rows
crop_df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [6]:
# Collect the distinct crop names into a NumPy array
crop_names = crop_df['Crop'].unique()

In [7]:
# Display the distinct crop names collected above
crop_names

array(['Arecanut', 'Other Kharif pulses', 'Rice', 'Banana', 'Cashewnut',
       'Coconut ', 'Dry ginger', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Black pepper', 'Dry chillies', 'other oilseeds', 'Turmeric',
       'Maize', 'Moong(Green Gram)', 'Urad', 'Arhar/Tur', 'Groundnut',
       'Sunflower', 'Bajra', 'Castor seed', 'Cotton(lint)', 'Horse-gram',
       'Jowar', 'Korra', 'Ragi', 'Tobacco', 'Gram', 'Wheat', 'Masoor',
       'Sesamum', 'Linseed', 'Safflower', 'Onion', 'other misc. pulses',
       'Samai', 'Small millets', 'Coriander', 'Potato',
       'Other  Rabi pulses', 'Soyabean', 'Beans & Mutter(Vegetable)',
       'Bhindi', 'Brinjal', 'Citrus Fruit', 'Cucumber', 'Grapes', 'Mango',
       'Orange', 'other fibres', 'Other Fresh Fruits', 'Other Vegetables',
       'Papaya', 'Pome Fruit', 'Tomato', 'Rapeseed &Mustard', 'Mesta',
       'Cowpea(Lobia)', 'Lemon', 'Pome Granet', 'Sapota', 'Cabbage',
       'Peas  (vegetable)', 'Niger seed', 'Bottle Gourd', 'Sannhamp',
       'Va

In [8]:
# Inspect the raw season labels before any cleaning
crop_df['Season'].unique()

array(['Kharif     ', 'Whole Year ', 'Autumn     ', 'Rabi       ',
       'Summer     ', 'Winter     '], dtype=object)

#### Normalizing the season labels for consistency in the dataset 

In [9]:
# Normalize the season labels (these are cell values, not column names).
# The raw labels are right-padded with spaces (e.g. 'Kharif     ', 'Whole Year '),
# so strip the padding first instead of matching exact padded strings; a label
# with a different amount of padding would otherwise slip through unchanged.
# Winter and Autumn are folded into Rabi, and 'Whole Year' becomes 'whole_year';
# Kharif, Rabi and Summer keep their (stripped) names.
season_map = {
    'Winter': 'Rabi',
    'Autumn': 'Rabi',
    'Whole Year': 'whole_year',
}
crop_df['Season'] = crop_df['Season'].str.strip().replace(season_map)

In [10]:
# Confirm the season labels after normalization
crop_df['Season'].unique()

array(['Kharif', 'whole_year', 'Rabi', 'Summer'], dtype=object)

In [11]:
# Column dtypes: text columns are object, Crop_Year is int64, Area/Production are float64
crop_df.dtypes

State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production       float64
dtype: object

In [12]:
# Encode each season label as an integer in a new Season_ID column.
# LabelEncoder numbers the classes in sorted order, so the four normalized
# labels map to: Kharif -> 0, Rabi -> 1, Summer -> 2, whole_year -> 3.
# The fitted encoder is kept in `label_encoder` so ids can be mapped back
# with `label_encoder.inverse_transform(...)` if needed.
label_encoder = LabelEncoder()
crop_df['Season_ID'] = label_encoder.fit_transform(crop_df['Season'])
crop_df['Season_ID']

0         0
1         0
2         0
3         3
4         3
         ..
246086    2
246087    2
246088    3
246089    1
246090    1
Name: Season_ID, Length: 246091, dtype: int32


#### Converting the casing of strings to match the other datasets while merging 

In [13]:
# Lower-case the state and district names so they line up with the other
# datasets when merging.
for name_col in ("State_Name", "District_Name"):
    crop_df[name_col] = crop_df[name_col].str.lower()

In [14]:
# Preview the frame after lower-casing the names and adding Season_ID
crop_df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Season_ID
0,andaman and nicobar islands,nicobars,2000,Kharif,Arecanut,1254.0,2000.0,0
1,andaman and nicobar islands,nicobars,2000,Kharif,Other Kharif pulses,2.0,1.0,0
2,andaman and nicobar islands,nicobars,2000,Kharif,Rice,102.0,321.0,0
3,andaman and nicobar islands,nicobars,2000,whole_year,Banana,176.0,641.0,3
4,andaman and nicobar islands,nicobars,2000,whole_year,Cashewnut,720.0,165.0,3


#### Removing Null values 

In [15]:
# Count the null values in each column
crop_df.isnull().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
Season_ID           0
dtype: int64

In [16]:
# Discard every row that has a null in any column. The surviving rows keep
# their original index labels (the index is not reset here).
crop_df = crop_df.dropna(axis=0, how='any')

In [17]:
# Re-count nulls per column after the drop; every column should report zero
crop_df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
Season_ID        0
dtype: int64

#### Converting the final dataframe to CSV 

In [18]:
# Write the cleaned frame to 'crop_yield.csv' in the working directory.
# The row index is written too, as a leading column with an empty header.
# NOTE(review): if downstream readers do not need that column, consider
# index=False — confirm against the notebooks that consume this file.
crop_df.to_csv('crop_yield.csv', index=True)