<a href="https://colab.research.google.com/github/manishmawatwal/DataScience/blob/main/Indian_Weather_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

In [None]:
# Colab-only step: `files.upload()` prompts for a local file and stores it in
# the runtime's working directory so the next cell can read it by name.
# NOTE(review): this cell only works inside Google Colab.
from google.colab import files
uploaded = files.upload()

Saving Weather Data in India from 1901 to 2017.csv to Weather Data in India from 1901 to 2017.csv


In [None]:
# First load with default options, to inspect how the CSV columns are parsed.
df = pd.read_csv('Weather Data in India from 1901 to 2017.csv')

In [None]:
# Preview the first rows to check the parsed columns.
df.head()

Unnamed: 0.1,Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,0,1901,17.99,19.43,23.49,26.41,28.28,28.6,27.49,26.98,26.26,25.08,21.73,18.95
1,1,1902,19.0,20.39,24.1,26.54,28.68,28.44,27.29,27.05,25.95,24.37,21.33,18.78
2,2,1903,18.32,19.79,22.46,26.03,27.93,28.41,28.04,26.63,26.34,24.57,20.96,18.29
3,3,1904,17.77,19.39,22.95,26.73,27.83,27.85,26.84,26.73,25.84,24.36,21.07,18.84
4,4,1905,17.4,17.79,21.78,24.84,28.32,28.69,27.67,27.47,26.29,26.16,22.07,18.71


We have got an unexpected column named 'Unnamed: 0'. This happens when the CSV file contains an index column that has no name. We will get rid of it by reading that column as the index.

In [None]:
# Reload, using the CSV's first (unnamed) column as the row index so it is no
# longer kept as a data column.
df = pd.read_csv('Weather Data in India from 1901 to 2017.csv', index_col = 0)

In [None]:
# Preview again to confirm the extra unnamed column is gone.
df.head()

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1901,17.99,19.43,23.49,26.41,28.28,28.6,27.49,26.98,26.26,25.08,21.73,18.95
1,1902,19.0,20.39,24.1,26.54,28.68,28.44,27.29,27.05,25.95,24.37,21.33,18.78
2,1903,18.32,19.79,22.46,26.03,27.93,28.41,28.04,26.63,26.34,24.57,20.96,18.29
3,1904,17.77,19.39,22.95,26.73,27.83,27.85,26.84,26.73,25.84,24.36,21.07,18.84
4,1905,17.4,17.79,21.78,24.84,28.32,28.69,27.67,27.47,26.29,26.16,22.07,18.71


Now we will create a date column (month and year) so that the temperature values can be placed on a timeline.

In [None]:
# Reshape from wide (one column per month) to long form: one row per
# (YEAR, month) pair, with the month name in 'variable' and the reading in
# 'value'. Every column after YEAR is treated as a month column.
df1 = df.melt(id_vars=['YEAR'], value_vars=list(df.columns[1:]))
df1.head()

Unnamed: 0,YEAR,variable,value
0,1901,JAN,17.99
1,1902,JAN,19.0
2,1903,JAN,18.32
3,1904,JAN,17.77
4,1905,JAN,17.4


In [None]:
#converting string to datetime object
df1['Date'] = df1['variable'] + ' ' + df1['YEAR'].astype(str)
df1.loc[:,'Date'] = df1['Date'].apply(lambda x : datetime.strptime(x, '%b %Y'))
df1.head()

Unnamed: 0,YEAR,variable,value,Date
0,1901,JAN,17.99,1901-01-01
1,1902,JAN,19.0,1902-01-01
2,1903,JAN,18.32,1903-01-01
3,1904,JAN,17.77,1904-01-01
4,1905,JAN,17.4,1905-01-01


Temperature

In [None]:
# Give the melted columns readable names. Renaming by label (instead of
# assigning a positional list to df1.columns) keeps this cell safe to re-run
# after later cells have added more columns to df1.
df1.rename(
    columns={'YEAR': 'Year', 'variable': 'Month', 'value': 'Temperature'},
    inplace=True,
)
# Chronological order so the line trace is drawn left to right in time.
df1.sort_values(by='Date', inplace=True)

# Start the y axis at 0 and leave one degree of headroom above the maximum.
fig = go.Figure(
    layout=go.Layout(yaxis=dict(range=[0, df1['Temperature'].max() + 1]))
)
fig.add_trace(go.Scatter(x=df1['Date'], y=df1['Temperature']))
fig.update_layout(
    title='Temperature Throught Timeline:',
    xaxis_title='Time',
    yaxis_title='Temperature in Degrees',
)
# Date x axis with a range slider and two quick-zoom buttons.
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=[
                dict(label='Whole View', step='all'),
                dict(count=1, label='One year view', step='year', stepmode='todate'),
            ]
        ),
        rangeslider=dict(visible=True),
        type='date',
    )
)
fig.show()

Temperature varies every year with months.

Warmest/Coldest/Average

In [None]:
# One box per month showing the spread of that month's temperature
# across all years in df1.
fig = px.box(data_frame=df1, x='Month', y='Temperature')
fig.update_layout(dict(title='Warmest, Coldest and Median monthly temperature'))
fig.show()

July is the month with the smallest standard deviation, which means temperatures in July vary the least. We can expect any day in July to be a warm day.

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 1..9 on the temperature values alone and
# record the within-cluster sum of squared distances (inertia) for each k.
target = df1['Temperature'].to_numpy().reshape(-1, 1)  # KMeans expects 2-D input
num_clusters = list(range(1, 10))
sse = []

for k in num_clusters:
    # random_state makes the curve reproducible between runs; n_init is pinned
    # explicitly because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(target)
    sse.append(km.inertia_)

# Same data drawn twice (line + markers) so each k is visibly marked.
fig = go.Figure(data=[
    go.Scatter(x=num_clusters, y=sse, mode='lines'),
    go.Scatter(x=num_clusters, y=sse, mode='markers'),
])
fig.update_layout(
    title='Evaluation on number of clusters:',
    xaxis_title='Number of Clusters:',
    yaxis_title='Sum of Squared Dsitance',
    showlegend=False,
)
fig.show()

Using 3 clusters seems a good choice.

In [None]:
# Cluster the temperature readings into the 3 groups suggested by the elbow
# plot. A fixed random_state (and explicit n_init) keeps the cluster numbering
# and colours stable from run to run.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
km.fit(df1['Temperature'].to_numpy().reshape(-1, 1))
# Store the cluster id of every row next to its reading.
df1['Temp Labels'] = km.labels_

fig = px.scatter(df1, x='Date', y='Temperature', color='Temp Labels')
fig.update_layout(title='Temperature clusters', xaxis_title='Date', yaxis_title='Temperature')
fig.show()

Despite having 4 seasons we can see only 3 main clusters based on temperature

In [None]:
# Distribution of all monthly temperature readings. histnorm='density'
# normalises the bars, so the y axis shows density rather than raw counts and
# is labelled accordingly.
fig = px.histogram(x=df1['Temperature'], nbins=200, histnorm='density')
fig.update_layout(
    title='Frequency chart of temperature readings:',
    xaxis_title='Temperature',
    yaxis_title='Density',
)
# Explicit show(), consistent with the other plotting cells.
fig.show()

Yearly average temperature

In [None]:
# Mean of the twelve monthly columns for each year. The columns are selected
# by label (JAN..DEC) rather than "everything after YEAR", so derived columns
# added to df (this one, or the seasonal means) are never averaged in when the
# cell is re-run.
df['Yearly Mean'] = df.loc[:, 'JAN':'DEC'].mean(axis=1)

# Yearly mean drawn as a line plus markers.
fig = go.Figure(data=[
    go.Scatter(name='Yearly Temperature', x=df['YEAR'], y=df['Yearly Mean'], mode='lines'),
    go.Scatter(name='Yearly Temperature', x=df['YEAR'], y=df['Yearly Mean'], mode='markers'),
])
fig.update_layout(title='Yearly Mean Temperature:', xaxis_title='Time', yaxis_title='Temperature in Degrees')
fig.show()

# Same data with a LOWESS-smoothed trendline to expose the long-term trend.
fig = px.scatter(df, x='YEAR', y='Yearly Mean', trendline='lowess')
fig.update_layout(title='Trendline over the years:', xaxis_title='Time', yaxis_title='Temperature in Degrees')
fig.show()

We can see that the issue of global warming is real.
The yearly mean temperature was not increasing until 1980.

Monthly temperatures throughout history

In [None]:
# One small line chart per month (wrapped four per row), each showing
# that month's temperature across the years.
fig = px.line(
    data_frame=df1,
    x='Year',
    y='Temperature',
    facet_col='Month',
    facet_col_wrap=4,
)
fig.update_layout(dict(title='Monthly temperature throughout history:'))
fig.show()

We can see positive trendlines

Seasonal Analysis

In [None]:
# Add one column per season holding the mean of that season's months.
# Each mean is taken within a single row, so 'Winter' combines DEC with the
# JAN and FEB of the same calendar year.
for season, months in (
    ('Winter', ['DEC', 'JAN', 'FEB']),
    ('Summer', ['MAR', 'APR', 'MAY']),
    ('Monsoon', ['JUN', 'JUL', 'AUG', 'SEP']),
    ('Autumn', ['OCT', 'NOV']),
):
    df[season] = df[months].mean(axis=1)

# Long form for plotting: one row per (Year, Season) with its mean temperature.
seasonal_df = df.melt(
    id_vars='YEAR',
    value_vars=['Winter', 'Summer', 'Monsoon', 'Autumn'],
    var_name='Season',
    value_name='Temperature',
).rename(columns={'YEAR': 'Year'})

In [None]:
# Scatter of seasonal means per year, one panel per season (two per row),
# each with an ordinary-least-squares trendline.
fig = px.scatter(
    data_frame=seasonal_df,
    x='Year',
    y='Temperature',
    facet_col='Season',
    facet_col_wrap=2,
    trendline='ols',
)
fig.update_layout(dict(title='Seasonal mean temperatures throught years:'))
fig.show()

We can again see a positive trendline between temperature and time. The correlation between temperature and year is not very strong, but it is not negligible either.

In [None]:
# Animated scatter: one frame per year, one marker per month, with the marker
# size driven by the temperature itself.
px.scatter(
    data_frame=df1,
    x='Month',
    y='Temperature',
    size='Temperature',
    animation_frame='Year',
)

Forecasting

Let's try to forecast monthly mean temperature for year 2018

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Features: the numeric year plus one indicator column per month
# (get_dummies one-hot encodes the string 'Month' column).
df2 = pd.get_dummies(df1[['Year', 'Month', 'Temperature']])
y = df2[['Temperature']]
x = df2.drop(columns = 'Temperature')

# Fixed seeds for both the split and the tree so the reported score is
# reproducible from run to run.
# NOTE(review): the split is random rather than chronological, so the score
# reflects how well held-out months inside the observed period are fitted,
# not how well truly future years are predicted.
dtr = DecisionTreeRegressor(random_state = 0)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 0)
dtr.fit(train_x, train_y)
pred = dtr.predict(test_x)
r2_score(test_y, pred)

0.9655402889606705

A high R² value means that the predictive model fits the held-out data well.

Let's see the forecasted data for 2018

In [None]:
# Build the 2018 feature rows by taking the twelve month rows of 2017 and
# overwriting the year. Working on an explicit copy and assigning the column
# directly avoids the chained in-place replace on a slice, which pandas may
# apply to a temporary object and silently discard.
next_Year = df1.loc[df1['Year'] == 2017, ['Year', 'Month']].copy()
next_Year['Year'] = 2018

# Keep the month labels in exactly the row order that is fed to the model, so
# each prediction is paired with the right month.
forecast_months = next_Year['Month'].to_numpy()

# One-hot encode and align to the training feature columns (same names, same
# order) before predicting.
next_Year = pd.get_dummies(next_Year).reindex(columns = x.columns, fill_value = 0)

temp_2018 = pd.DataFrame({
    'Month': forecast_months,
    'Temperature': dtr.predict(next_Year),
})
temp_2018['Year'] = 2018
temp_2018

Unnamed: 0,Month,Temperature,Year
0,JAN,19.02,2018
1,FEB,23.58,2018
2,MAR,25.58,2018
3,APR,29.17,2018
4,MAY,30.47,2018
5,JUN,29.44,2018
6,JUL,28.18,2018
7,AUG,28.17,2018
8,SEP,28.11,2018
9,OCT,27.24,2018


In [None]:
# Append the 2018 forecast rows to the historical readings and average the
# temperature per year, giving one point per year including 2018.
forecasted_temp = (
    pd.concat([df1, temp_2018], sort=False)
    .groupby('Year', as_index=False)['Temperature']
    .mean()
)

# Yearly means drawn as a line plus markers.
fig = go.Figure(data=[
    go.Scatter(
        name='Yearly Mean Temperature',
        x=forecasted_temp['Year'],
        y=forecasted_temp['Temperature'],
        mode='lines',
    ),
    go.Scatter(
        name='Yearly MEan Temperature',
        x=forecasted_temp['Year'],
        y=forecasted_temp['Temperature'],
        mode='markers',
    ),
])
fig.update_layout(
    title='Forecasted Temprature:',
    xaxis_title='Time',
    yaxis_title='Temprature in Degrees',
)
fig.show()

The forecasted yearly mean temperature of India for 2018 is 25.9.