In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

## Import the Dataset

In [None]:
# Read the rainfall CSV from the Kaggle input directory and preview the first rows.
RAINFALL_CSV = "/kaggle/input/rainfall-data-from-1901-to-2017-for-india/Rainfall_Data_LL.csv"
df = pd.read_csv(RAINFALL_CSV)
df.head()

## Exploratory Data Analysis

In [None]:
# Basic information about the dataset: column names, dtypes and non-null counts.
# Useful here to spot columns with missing readings before aggregating.
df.info()

In [None]:
# Count the missing values in every column
df.isna().sum()

### Average Rainfall per Subdivision

In [None]:
# Mean of the ANNUAL column for every subdivision
df.groupby('SUBDIVISION')[['ANNUAL']].mean()

## Subdivision with Lowest Rainfall

In [None]:
# Rank subdivisions by their mean ANNUAL value (ascending) and take the first row, i.e. the lowest mean.
annual_mean_by_subdivision = df.groupby('SUBDIVISION')[['ANNUAL']].mean()
annual_mean_by_subdivision.sort_values('ANNUAL').iloc[0]

## Subdivision with Highest Rainfall

In [None]:
# Rank subdivisions by their mean ANNUAL value (ascending) and take the last row of the ranking.
annual_mean_by_subdivision = df.groupby('SUBDIVISION')[['ANNUAL']].mean()
annual_mean_by_subdivision.sort_values('ANNUAL').iloc[-1]

## Subdivision that receives the highest rainfall in winter

In [None]:
# Same ranking as for the annual totals, but on the mean of the 'Jan-Feb' column; the last row is returned.
winter_mean_by_subdivision = df.groupby('SUBDIVISION')[['Jan-Feb']].mean()
winter_mean_by_subdivision.sort_values('Jan-Feb').iloc[-1]

## Andaman & Nicobar Islands
For the rest of my analysis, I have chosen Andaman & Nicobar Islands

In [None]:
# Keep only the Andaman & Nicobar Islands rows and the columns at positions 2-15.
# Later cells use this slice as YEAR (first column), the monthly columns, and ANNUAL (last column).
is_andaman = df['SUBDIVISION'] == 'Andaman & Nicobar Islands'
andaman = df[is_andaman].iloc[:, 2:16]
andaman.head()

In [None]:
# Column names, dtypes and non-null counts of the Andaman & Nicobar subset
andaman.info()

## Distribution of Annual rainfall


In [None]:
# Histogram of the ANNUAL values with a kernel density estimate drawn on top
sns.displot(data=andaman, x='ANNUAL', kde=True)

## Statistical Information about the region

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the ANNUAL column
andaman['ANNUAL'].describe()

## Which year received the maximum and minimum rainfall?

In [None]:
# Row(s) whose ANNUAL value equals the largest ANNUAL value in the subset
largest_annual = andaman['ANNUAL'].max()
andaman.loc[andaman['ANNUAL'] == largest_annual]

In [None]:
# Row(s) whose ANNUAL value equals the smallest ANNUAL value in the subset
smallest_annual = andaman['ANNUAL'].min()
andaman.loc[andaman['ANNUAL'] == smallest_annual]

Now we'll create a column that holds the date (month and year) of each reading, so that the rainfall values can be placed on a timeline.

In [None]:
# Reshape from wide to long: one row per (YEAR, month column) pair.
# Every column except the first (YEAR) and the last is melted into 'variable' / 'value'.
month_columns = list(andaman.columns[1:-1])
df1 = andaman.melt(id_vars='YEAR', value_vars=month_columns)
df1.head()

In [None]:
# Check the end of the melted frame as well, to confirm the last month column and latest years are present
df1.tail()

In [None]:
# Build a "<month> <year>" label per row and parse it into a datetime column.
# The label is parsed with the '%b %Y' format (abbreviated month name followed by a 4-digit year).
month_year_label = df1['variable'] + ' ' + df1['YEAR'].astype(str)

# Assign the parsed values directly to the column. Writing them through `df1.loc[:, 'Date'] = ...`
# into an existing string column sets the values in place on pandas >= 2.0, which leaves the column
# with object dtype instead of datetime64. pd.to_datetime is also vectorised, unlike a row-wise apply.
df1['Date'] = pd.to_datetime(month_year_label, format='%b %Y')
df1.head()

## Rainfall throughout 1901 - 2017

In [None]:
# Give the melted columns descriptive names. A name-based rename is used instead of assigning a
# fixed-length list to `df1.columns`, so this cell can be re-run safely even after later cells have
# added more columns to df1 (keys that are already renamed are simply ignored).
df1 = df1.rename(columns={'YEAR': 'Year', 'variable': 'Month', 'value': 'Rainfall'})
df1 = df1.sort_values(by='Date')  ## To get the time series right.

# Line chart of every monthly reading in chronological order.
# The y-axis starts at 0 and ends just above the largest reading.
fig = go.Figure(layout=go.Layout(yaxis=dict(range=[0, df1['Rainfall'].max() + 1])))
fig.add_trace(go.Scatter(x=df1['Date'], y=df1['Rainfall']))

# The x-axis title is declared in the same spec as the range selector / slider so that all x-axis
# settings are applied together in a single layout update.
fig.update_layout(
    title='Rainfall Throught Timeline:',
    yaxis_title='Rainfall in mm',
    xaxis=dict(
        title=dict(text='Time'),
        rangeselector=dict(
            buttons=[
                dict(label="Whole View", step="all"),
                dict(count=1, label="One Year View", step="year", stepmode="todate"),
            ]
        ),
        rangeslider=dict(visible=True),
        type="date",
    ),
)
fig.show()

In [None]:
# One box per month showing the spread of that month's rainfall readings across all years
fig = px.box(df1, x='Month', y='Rainfall')
fig.update_layout(
    title='Minimum, Maximum and Median Monthly Rainfall.',
)
fig.show()

### Insights:
- February has the lowest rainfall
- June has the maximum rainfall
- Rainfall in March varies the least, as it has the lowest standard deviation

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means on the rainfall readings for k = 1..9 and record the inertia
# (sum of squared distances of samples to their closest centroid) for each k.
sse = []
target = df1['Rainfall'].to_numpy().reshape(-1, 1)  # KMeans expects a 2-D array: one feature column
num_clusters = list(range(1, 10))

for k in num_clusters:
    # random_state pins the centroid initialisation so the curve is identical on every run;
    # n_init is set explicitly because its library default differs between scikit-learn releases.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(target)
    sse.append(km.inertia_)

# The same series is drawn twice (line + markers) so each evaluated k is clearly visible.
fig = go.Figure(data=[
    go.Scatter(x=num_clusters, y=sse, mode='lines'),
    go.Scatter(x=num_clusters, y=sse, mode='markers'),
])

fig.update_layout(title="Evaluation on number of clusters:",
                  xaxis_title="Number of Clusters:",
                  yaxis_title="Sum of Squared Distance",
                  showlegend=False)
fig.show()

A cluster count of 3 is the ideal choice of k.

In [None]:
# Fit the final 3-cluster model on the rainfall readings and store each reading's cluster id.
# random_state makes the cluster numbering reproducible between runs; n_init is explicit because
# its library default differs between scikit-learn releases.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
km.fit(df1['Rainfall'].to_numpy().reshape(-1, 1))
df1['Rainfall Labels'] = km.labels_

# Cluster ids are categories, not magnitudes: plot them as strings so Plotly assigns discrete
# colours with a legend instead of a continuous colour bar. df1 itself keeps the integer labels.
cluster_plot_data = df1.assign(**{'Rainfall Labels': df1['Rainfall Labels'].astype(str)})
fig = px.scatter(cluster_plot_data, x='Date', y='Rainfall', color='Rainfall Labels')
fig.update_layout(title="Rainfall clusters.",
                  xaxis_title="Date", yaxis_title="Rainfall")
fig.show()

In [None]:
# Histogram of all monthly rainfall readings.
# histnorm='density' divides each bin's count by the bin width, so the y-axis shows a density,
# not a raw count; the axis label reflects that.
fig = px.histogram(x=df1['Rainfall'], nbins=200, histnorm='density')
fig.update_layout(title='Frequency chart of rainfall readings:',
                  xaxis_title='Rainfall', yaxis_title='Density (count / bin width)')
fig.show()

In [None]:
# One small line chart per month (4 panels per row), each plotting Rainfall against Year
fig = px.line(
    df1,
    x='Year',
    y='Rainfall',
    facet_col='Month',
    facet_col_wrap=4,
)
fig.update_layout(title='Monthly rainfall throught history:')
fig.show()

In [None]:
## Baseline model: ordinary least-squares linear regression of Rainfall on Year and one-hot encoded Month.
## Note: the model fitted below is LinearRegression, not a decision tree. A tree-based regressor is a
## possible follow-up if the linear fit turns out to be inadequate.
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

# Work on a copy so the encoding below does not touch df1.
df2 = df1[['Year', 'Month', 'Rainfall']].copy()
df2 = pd.get_dummies(df2)  # expands the non-numeric Month column into indicator columns
y = df2[['Rainfall']]
x = df2.drop(columns='Rainfall')

lr = LinearRegression()
# Fixed random_state so the 70/30 split, and therefore the reported R^2 score, is reproducible.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)
lr.fit(train_x, train_y)
pred = lr.predict(test_x)

# R^2 on the held-out 30% of the rows
r2_score(test_y, pred)

## That brings us to the end of this project. You can connect with me on Twitter [@PiyalBanik](https://twitter.com/PiyalBanik)

In [None]:
# (intentionally left empty)