# Exploratory Data Analysis - Historial temperature of major cities

Tools used: Pandas, Numpy, Plotly
Dataset: https://www.kaggle.com/datasets/sudalairajkumar/daily-temperature-of-major-cities/code

In [1]:
#Import dependencies
import numpy as np
import pandas as pd
import plotly.express as px

#Import CSV
# Missing values were marked as "-99" in the dataset. Will replace with NaN
df = pd.read_csv("city_temperature.csv", na_values=[-99])
df.head()

  df = pd.read_csv("city_temperature.csv", na_values=[-99])


Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9


In [2]:
# Confirm range of years is between 1995 to 2020. Typos in year "200" and "201"
df.Year.value_counts().index.sort_values()

Int64Index([ 200,  201, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
            2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
            2015, 2016, 2017, 2018, 2019, 2020],
           dtype='int64')

In [3]:
# Count values per year. 
# Given that number of rows in year "200" and "201" is negligible to rest of data, will remove
df.Year.value_counts()

2000    119682
1999    119355
2001    119355
2002    119355
2003    119140
1998    119082
1996    118951
1997    118656
2004    118645
1995    118616
2005    117895
2006    117647
2010    115627
2007    115428
2008    114908
2009    114247
2011    113740
2012    111993
2013    111021
2014    109018
2015    107498
2016    107237
2017    106946
2018    106698
2019    106337
2020     38810
201        351
200         89
Name: Year, dtype: int64

In [4]:
# Remove "200" and "201"
df = df[df.Year > 1900]

In [5]:
# Look for missing values
# Possible to have missing collumns in state, since most countries do not have them
df.isna().sum()

Region                  0
Country                 0
State             1450550
City                    0
Month                   0
Day                     0
Year                    0
AvgTemperature      79232
dtype: int64

In [6]:
# Consecutive data are not filled in. This esplains the missing values for "AvgTemperature"
# Will fill in with previous day's temperature
df['AvgTemperature'].fillna(method='ffill', inplace=True)

In [7]:
# Convert Farenheit to Celcius
df['Celcius'] = (df['AvgTemperature'] - 32) * 5/9
df.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature,Celcius
0,Africa,Algeria,,Algiers,1,1,1995,64.2,17.888889
1,Africa,Algeria,,Algiers,1,2,1995,49.4,9.666667
2,Africa,Algeria,,Algiers,1,3,1995,48.8,9.333333
3,Africa,Algeria,,Algiers,1,4,1995,46.4,8.0
4,Africa,Algeria,,Algiers,1,5,1995,47.9,8.833333


# Average Temperature among Region (1995 to 2020)

In [11]:
# Check average temperature of Regions using Groupby Function

df_regionyear = df[['Region','Year','Celcius']].groupby(['Region','Year']).mean().reset_index()
df_regionyear.head()

Unnamed: 0,Region,Year,Celcius
0,Africa,1995,23.523183
1,Africa,1996,23.350323
2,Africa,1997,23.621858
3,Africa,1998,23.881934
4,Africa,1999,23.595156


In [12]:
# Plot
# Ignore 2020 because it is not completed yet. Otherwise, there is a slight increase in Avg Temp generally
fig = px.line(
df_regionyear, x="Year", y="Celcius", color='Region',
title='Avg Temperature in Regions from 1995 to 2020')
fig.show()

In [14]:
#Compare averages between 1995 and 2019. There has been an increase since then
df[df.Year.isin([1995,2019])][['Region','Year','Celcius']].groupby(['Region','Year']).mean().reset_index()

Unnamed: 0,Region,Year,Celcius
0,Africa,1995,23.523183
1,Africa,2019,23.938312
2,Asia,1995,19.488933
3,Asia,2019,20.240732
4,Australia/South Pacific,1995,16.220244
5,Australia/South Pacific,2019,17.400786
6,Europe,1995,11.06319
7,Europe,2019,11.576153
8,Middle East,1995,22.492921
9,Middle East,2019,23.888092


# Explore change in temperature over seasons

In [20]:
# Check average temperature of Regions over the month using Groupby Function
df_regionmonth = df[['Region','Month','Celcius']].groupby(['Region','Month']).mean().reset_index()
df_regionmonth

Unnamed: 0,Region,Month,Celcius
0,Africa,1,22.739525
1,Africa,2,23.392015
2,Africa,3,23.902338
3,Africa,4,23.940525
4,Africa,5,23.962620
...,...,...,...
79,South/Central America & Carribean,8,22.420321
80,South/Central America & Carribean,9,22.634837
81,South/Central America & Carribean,10,22.704383
82,South/Central America & Carribean,11,22.486871


In [21]:
# Effects of seasons on temperature on Northern and Southern hemispheres

fig = px.line(
df_regionmonth, x="Month", y="Celcius", color='Region',
title='Avg Monthly Temperatures in Regions',
width=900, height=400)
fig.show()