In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go

# Location of the crop dataset inside the Kaggle input directory.
crop_csv_path = "/kaggle/input/crop-data-of-the-indian-states/Crop_Data.csv"

# Read the raw table and preview its first rows.
df = pd.read_csv(crop_csv_path)
df.head()

Unnamed: 0,Crop,Season,State,Area,Soil Type,Pesticide Usage,pH,Temperature,Fertilizer Usage,Price,Rainfall
0,Cotton(lint),Kharif,Assam,1739.0,Black Soil,low,Average,Medium,High,2490.092421,3373.2
1,Onion,Whole Year,Assam,7832.0,Alluvial Soil,high,Low,Low,Medium,2348.904922,3520.7
2,Potato,Whole Year,Assam,75259.0,Laterile Soil,low,Average,High,High,2448.287513,2957.4
3,Rice,Autumn,Assam,607358.0,Alluvial Soil,low,Low,High,High,2714.419518,3079.6
4,Rice,Summer,Assam,174974.0,Black Soil,high,Low,Low,Low,2883.356997,2566.7


In [4]:
# Clean the raw table in one pass: discard every row that has a missing
# value, then discard rows that exactly repeat an earlier row.
df = df.dropna().drop_duplicates()

In [5]:
# Restrict the analysis to a single crop: Potato.
# Price is summed per unique combination of the growing conditions below;
# 'Crop' is the first grouping key so one crop can be sliced out afterwards.
# NOTE(review): rows sharing every key have their prices added together --
# confirm a sum (rather than a mean) is what is wanted here.
condition_keys = [
    'Crop', 'Area', 'Soil Type', 'Pesticide Usage',
    'pH', 'Temperature', 'Fertilizer Usage', 'Rainfall',
]
price_by_conditions = df.groupby(condition_keys)['Price'].sum()

# Selecting 'Potato' on the outer index level drops that level; reset_index
# then turns the remaining keys back into ordinary columns.
p = price_by_conditions.loc['Potato'].reset_index()
p

Unnamed: 0,Area,Soil Type,Pesticide Usage,pH,Temperature,Fertilizer Usage,Rainfall,Price
0,3.0,Alluvial Soil,high,High,Medium,Medium,1576.10,2194.615166
1,6.0,Red/Yellow Soil,low,Low,Medium,Low,1334.30,2809.892270
2,8.0,Red/Yellow Soil,high,High,High,Medium,2674.20,2646.770204
3,9.0,Laterile Soil,low,Average,Low,Medium,729.10,2291.295777
4,16.0,Laterile Soil,low,Average,High,High,1033.20,2132.876546
...,...,...,...,...,...,...,...,...
623,571399.0,Alluvial Soil,high,Average,High,Medium,1062.93,2962.032347
624,572826.0,Red/Yellow Soil,high,Low,Medium,High,1565.24,2676.308471
625,575641.0,Red/Yellow Soil,high,Low,High,Low,1321.30,2575.448101
626,575668.0,Alluvial Soil,low,Low,Medium,Medium,795.40,2761.919820


In [6]:
# Looks like PCA won't be good enough
# Planning notes for the visualisation, kept as a bare triple-quoted string.
# Because the string is the cell's last expression, the notebook echoes its
# repr as the cell output.
# NOTE(review): the dataset path refers to Indian states, so "countries" in
# the note presumably means states -- confirm with the author.
# NOTE(review): the note lists Rainfall as a factor and then says to exclude
# it; the later cells do keep Rainfall (binned) -- confirm which is intended.
'''
    We could look at Soil Type, Pesticide Usage, pH, Temperature, 
    Fertilizer Usage, and Rainfall to understand what countries believe
    to be the most efficient for growing potatoes.
    
    Exclude Rainfall and Area.
    
    Treemap idiom, we could use area instead or alongside rainfall too.
'''

'\n    We could look at Soil Type, Pesticide Usage, pH, Temperature, \n    Fertilizer Usage, and Rainfall to understand what countries believe\n    to be the most efficient for growing potatoes.\n    \n    Exclude Rainfall and Area.\n    \n    Treemap idiom, we could use area instead or alongside rainfall too.\n'

In [7]:
# Rebuild the Potato-only table from the cleaned data so this cell starts
# from numeric Rainfall values, then summarise its numeric columns.
grouping_columns = ['Crop', 'Area', 'Soil Type', 'Pesticide Usage',
                    'pH', 'Temperature', 'Fertilizer Usage', 'Rainfall']
summed_prices = df.groupby(grouping_columns)['Price'].sum()

# Take the Potato cross-section (the 'Crop' level is dropped) and flatten
# the remaining index levels into columns.
p = summed_prices.xs('Potato', level='Crop').reset_index()

p.describe()

Unnamed: 0,Area,Rainfall,Price
count,628.0,628.0,628.0
mean,50009.913599,1482.237914,2546.680695
std,116220.678503,856.141664,486.760086
min,3.0,92.7,1063.0
25%,2437.5,854.24,2294.948823
50%,6783.0,1205.9,2527.237254
75%,23598.25,2031.1625,2732.530437
max,612600.0,5553.9,4903.0


In [8]:
# Replace the numeric Rainfall column with Low / Medium / High labels.
#
# The cut-offs sit roughly one standard deviation either side of the mean
# shown by p.describe() (1482.24 +/- 856.14, i.e. about 626 and 2338).
RAINFALL_LOW_MAX = 626     # values at or below this are "Low"
RAINFALL_HIGH_MIN = 2338   # values at or above this are "High"
RAINFALL_LABELS = ["Low", "Medium", "High"]

# Guard against re-running the cell: once the column already holds the
# labels, pd.to_numeric would raise on strings such as "Medium".
if not p['Rainfall'].isin(RAINFALL_LABELS).all():
    k = pd.to_numeric(p['Rainfall'])

    # np.select checks the conditions in order, so a value that is not
    # >= RAINFALL_HIGH_MIN but is > RAINFALL_LOW_MAX becomes "Medium";
    # everything else (including NaN) falls through to "Low", matching the
    # original if / elif / else chain.
    # The result is a plain array, so it is assigned by position and does not
    # depend on p having a 0..n-1 index.
    p['Rainfall'] = np.select(
        [k >= RAINFALL_HIGH_MIN, k > RAINFALL_LOW_MAX],
        ["High", "Medium"],
        default="Low",
    )

p

Unnamed: 0,Area,Soil Type,Pesticide Usage,pH,Temperature,Fertilizer Usage,Rainfall,Price
0,3.0,Alluvial Soil,high,High,Medium,Medium,Medium,2194.615166
1,6.0,Red/Yellow Soil,low,Low,Medium,Low,Medium,2809.892270
2,8.0,Red/Yellow Soil,high,High,High,Medium,High,2646.770204
3,9.0,Laterile Soil,low,Average,Low,Medium,Medium,2291.295777
4,16.0,Laterile Soil,low,Average,High,High,Medium,2132.876546
...,...,...,...,...,...,...,...,...
623,571399.0,Alluvial Soil,high,Average,High,Medium,Medium,2962.032347
624,572826.0,Red/Yellow Soil,high,Low,Medium,High,Medium,2676.308471
625,575641.0,Red/Yellow Soil,high,Low,High,Low,Medium,2575.448101
626,575668.0,Alluvial Soil,low,Low,Medium,Medium,Medium,2761.919820


In [9]:
# The Potato table is still too large to plot directly, so build a smaller one.
#
# Rows are combined by dropping Area and averaging Price over every distinct
# combination of the categorical growing conditions.
# Original author's note: a fixed number of rows to show (5070 was mentioned)
# could be selected or randomised instead, OR a treemap could be drawn for
# all 3640 rows to show the whole dataset.
# NOTE(review): neither 5070 nor 3640 is derived in the visible cells.
category_columns = ['Soil Type', 'Pesticide Usage', 'pH',
                    'Temperature', 'Fertilizer Usage', 'Rainfall']

# as_index=False keeps the grouping keys as regular columns in the result.
fd = p.groupby(category_columns, as_index=False)['Price'].mean()
fd

Unnamed: 0,Soil Type,Pesticide Usage,pH,Temperature,Fertilizer Usage,Rainfall,Price
0,Alluvial Soil,high,Average,High,High,Medium,2318.595677
1,Alluvial Soil,high,Average,High,Low,Medium,2334.431421
2,Alluvial Soil,high,Average,High,Medium,Medium,2874.595512
3,Alluvial Soil,high,Average,Low,High,Low,2233.046569
4,Alluvial Soil,high,Average,Low,Low,Low,3070.194713
...,...,...,...,...,...,...,...
322,Red/Yellow Soil,low,Low,Medium,High,High,2344.814690
323,Red/Yellow Soil,low,Low,Medium,High,Medium,3041.260893
324,Red/Yellow Soil,low,Low,Medium,Low,Low,2707.832731
325,Red/Yellow Soil,low,Low,Medium,Low,Medium,2583.495032


In [10]:
# Print the column names, non-null counts and dtypes of the reduced table.
fd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327 entries, 0 to 326
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Soil Type         327 non-null    object 
 1   Pesticide Usage   327 non-null    object 
 2   pH                327 non-null    object 
 3   Temperature       327 non-null    object 
 4   Fertilizer Usage  327 non-null    object 
 5   Rainfall          327 non-null    object 
 6   Price             327 non-null    float64
dtypes: float64(1), object(6)
memory usage: 18.0+ KB


In [11]:
# Treemap of potato prices with a dropdown that switches between hierarchies.
#
# px.treemap produces exactly one trace per call, so a dropdown can only
# toggle between views if each view is its own trace and every button sends
# a visibility mask with one entry per trace. The views below are inferred
# from the button labels: each single-factor button shows Soil Type split by
# that factor, and "All" keeps the original full hierarchy.
# Honestly, I wish that I could use the entire dataset, however I do know
# that the treemap may not even render.
treemap_views = {
    "Pesticide Usage": ['Soil Type', 'Pesticide Usage'],
    "pH": ['Soil Type', 'pH'],
    "Temperature": ['Soil Type', 'Temperature'],
    "Fertilizer Usage": ['Soil Type', 'Fertilizer Usage'],
    "All": ['Soil Type', 'Pesticide Usage', 'pH', 'Temperature'],
}
view_labels = list(treemap_views)
default_view = "All"  # the hierarchy shown before any button is pressed

fig = go.Figure()
for label, levels in treemap_views.items():
    trace = px.treemap(
        fd,
        path=[px.Constant("all"), *levels],
        values='Price',
    ).data[0]
    # Only the default view starts visible; otherwise every treemap would be
    # drawn on top of the others on first render.
    trace.visible = (label == default_view)
    fig.add_trace(trace)

fig.update_traces(root_color="lightgrey")

fig.update_layout(
    updatemenus=[
        dict(
            # Pre-select the button that matches the initially visible trace.
            active=view_labels.index(default_view),
            buttons=[
                dict(label=label,
                     method="update",
                     # One-hot mask: show this view's trace, hide the rest.
                     args=[{"visible": [other == label for other in view_labels]}])
                for label in view_labels
            ],
        )
    ]
)
fig.show()

In [12]:
# Source: https://github.com/lforeman2/plotlyDropDownMenu/blob/main/plotlyDemo_groupedHistogram.ipynb
#
# Dropdown-driven treemap: one "Soil Type -> <factor>" treemap per factor in
# plotColumns, with a button per factor that shows only that treemap and
# updates the figure title. The figure is also exported as a standalone HTML
# file.

title = "Soil vs "

plotColumns = ['Pesticide Usage', 'pH', 'Temperature', 'Fertilizer Usage', 'Rainfall']

# Index of the factor shown on first render; it must agree with the
# pre-selected dropdown entry and the initial title (Rainfall).
active_index = plotColumns.index('Rainfall')

traces = []
buttons = []
for i, col in enumerate(plotColumns):
    trace = px.treemap(
        fd,
        path=[px.Constant("all"), 'Soil Type', col],
        values='Price',
    ).data[0]
    # Start with only the active trace visible. Without this every treemap
    # is drawn at once, stacked in the same area, until a button is pressed.
    trace.visible = (i == active_index)
    traces.append(trace)

    # One-hot visibility mask: show trace i, hide all the others.
    visible = [j == i for j in range(len(plotColumns))]
    buttons.append(dict(label=col,
                        method="update",
                        args=[{"visible": visible},
                              {"title": title + col + ' affect on Potato Prices'}]))

updatemenus = [{'active': active_index, "buttons": buttons}]

fig = go.Figure(data=traces,
                layout=dict(updatemenus=updatemenus))
# barmode was a leftover from the histogram example this cell is based on;
# it only applies to bar-type traces, so it is not set here.
fig.update_layout(title=title + plotColumns[active_index] + " affect on Potato Prices")
fig.show()
fig.write_html("demowidget.html")