In [1]:
import pandas as pd
import scipy.stats as stats 
import numpy as np


import warnings
warnings.filterwarnings("ignore") 
from IPython import display

### **Study question: Is the land average temperature 8.37**

We have the hypothesess:
$$H_0: \bar{n} = 8.37$$
$$H_a: \bar{n} \neq 8.37$$

Set p-value to 0.05

In [2]:
global_temperatures_data = pd.read_csv("GlobalTemperatures.csv")
global_temperatures_data

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.490,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,
...,...,...,...,...,...,...,...,...,...
3187,2015-08-01,14.755,0.072,20.699,0.110,9.005,0.170,17.589,0.057
3188,2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
3189,2015-10-01,10.801,0.102,16.450,0.059,5.232,0.115,16.290,0.062
3190,2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063


In [3]:
population_mean = global_temperatures_data['LandAverageTemperature'].mean()
global_temperatures_data = global_temperatures_data.dropna(axis=0)
global_temperatures_data


Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
1200,1850-01-01,0.749,1.105,8.242,1.738,-3.206,2.822,12.833,0.367
1201,1850-02-01,3.071,1.275,9.970,3.007,-2.291,1.623,13.588,0.414
1202,1850-03-01,4.954,0.955,10.347,2.401,-1.905,1.410,14.043,0.341
1203,1850-04-01,7.217,0.665,12.934,1.004,1.018,1.329,14.667,0.267
1204,1850-05-01,10.004,0.617,15.655,2.406,3.811,1.347,15.507,0.249
...,...,...,...,...,...,...,...,...,...
3187,2015-08-01,14.755,0.072,20.699,0.110,9.005,0.170,17.589,0.057
3188,2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
3189,2015-10-01,10.801,0.102,16.450,0.059,5.232,0.115,16.290,0.062
3190,2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063


In [4]:
# Randomly select 100 rows from the population
sample_data = global_temperatures_data.sample(n=100, random_state=1)
sample_data

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
2437,1953-02-01,3.629,0.187,9.510,0.190,-2.182,0.189,14.020,0.094
2170,1930-11-01,6.574,0.202,12.032,0.186,0.892,0.213,14.505,0.128
2088,1924-01-01,2.339,0.290,7.912,0.298,-3.502,0.377,13.303,0.134
2719,1976-08-01,13.511,0.170,19.163,0.234,7.646,0.222,16.671,0.072
2459,1954-12-01,3.559,0.147,8.915,0.185,-1.773,0.176,13.620,0.088
...,...,...,...,...,...,...,...,...,...
2341,1945-02-01,2.646,0.194,8.373,0.192,-3.044,0.250,13.747,0.125
1525,1877-02-01,2.928,0.584,9.755,1.222,-2.357,0.872,13.732,0.232
1521,1876-10-01,8.946,0.327,14.400,0.677,2.766,1.037,15.047,0.175
2441,1953-06-01,13.612,0.216,19.461,0.204,7.749,0.202,16.890,0.105


In [5]:
def t_value(X, h_0):
    se = np.sqrt(np.var(X) / len(X))
    return (np.mean(X) - h_0) / se

def p_value(t):
    # Two-sided p-value, so we multiply by 2.
    return stats.norm.sf(abs(t))*2

In [6]:
t = t_value(sample_data['LandAverageTemperature'], population_mean)
t

0.15804377845564133

In [7]:
p_value(t)

0.874422301969351

Since 0.87 > 0.005, we fail to reject the null hypothesis. We can conclude that there is strong evidence that the average land temperature is almost 8.37 

### **Study question: Did the Latitude affect Average Temperature**

We have the hypothesess: \
$H_0: $There is no association between Latitude and Average Temperatures \
$H_a: $ There is association between Latitude and Average Temperature

Set p-value to 0.05

In [8]:
from scipy.stats import chi2_contingency

In [9]:
data = pd.read_csv("GlobalLandTemperaturesByCity.csv")
data

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E
...,...,...,...,...,...,...,...
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E


In [10]:
data = data.dropna(axis=0)
data

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
5,1744-04-01,5.788,3.624,Århus,Denmark,57.05N,10.33E
6,1744-05-01,10.644,1.283,Århus,Denmark,57.05N,10.33E
7,1744-06-01,14.051,1.347,Århus,Denmark,57.05N,10.33E
8,1744-07-01,16.082,1.396,Århus,Denmark,57.05N,10.33E
...,...,...,...,...,...,...,...
8599206,2013-04-01,7.710,0.182,Zwolle,Netherlands,52.24N,5.26E
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E


### *Convert numeric variables to categorical*
**AvarageTemperature can be classifed in 5 interval scale**

In [11]:
min_temperature = data['AverageTemperature'].min()
max_temperature = data['AverageTemperature'].max()
bins = np.linspace(min_temperature, max_temperature, 5)
temperature_labels = ["[-42.704-22.11525]", "[-22.11525--1.5265]", "[-1.5265-19.06225]", "[19.06225-39.651]"]
bins

array([-42.704  , -22.11525,  -1.5265 ,  19.06225,  39.651  ])

In [12]:
data['AvgTemp'] = pd.cut(data['AverageTemperature'], bins=bins, labels=temperature_labels)
data = data[:-1]

**For Latitude, we can classify into 4 major climate zones**
1. Tropical zone: 0°–23.5°(between the tropics)
2. Subtropics: 23.5°–40°
3. Temperate zone:  40°–60°
4. Cold zone(Polar zone): 60°–90°
![image info](https://content.meteoblue.com/en/meteoscool/general-climate-zones/climate_zones_lightbox.png)
Reference: https://content.meteoblue.com/en/meteoscool/general-climate-zones

In [13]:
climate_labels = ['Tropical', 'Subtropics', 'Temperate', 'Polar']
bins = [0, 23.5, 40, 60, 90]
data['Climate'] = pd.cut(data['Latitude'].astype(str).str[:-1].astype(np.float), bins=bins, labels=climate_labels)

In [14]:
data

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,AvgTemp,Climate
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E,[-1.5265-19.06225],Temperate
5,1744-04-01,5.788,3.624,Århus,Denmark,57.05N,10.33E,[-1.5265-19.06225],Temperate
6,1744-05-01,10.644,1.283,Århus,Denmark,57.05N,10.33E,[-1.5265-19.06225],Temperate
7,1744-06-01,14.051,1.347,Århus,Denmark,57.05N,10.33E,[-1.5265-19.06225],Temperate
8,1744-07-01,16.082,1.396,Århus,Denmark,57.05N,10.33E,[-1.5265-19.06225],Temperate
...,...,...,...,...,...,...,...,...,...
8599205,2013-03-01,2.253,0.267,Zwolle,Netherlands,52.24N,5.26E,[-1.5265-19.06225],Temperate
8599206,2013-04-01,7.710,0.182,Zwolle,Netherlands,52.24N,5.26E,[-1.5265-19.06225],Temperate
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E,[-1.5265-19.06225],Temperate
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E,[-1.5265-19.06225],Temperate


In [15]:
sample_data = data.sample(n=100, random_state=1).reset_index()
sample_data

Unnamed: 0,index,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,AvgTemp,Climate
0,2433126,1896-10-01,8.093,1.510,Fushun,China,40.99N,123.55E,[-1.5265-19.06225],Temperate
1,6608504,1825-03-01,27.199,2.235,Sambalpur,India,21.70N,83.94E,[19.06225-39.651],Tropical
2,406754,1756-09-01,17.585,2.818,Armavir,Russia,45.81N,40.38E,[-1.5265-19.06225],Temperate
3,6357370,1999-11-01,1.167,0.370,Riga,Latvia,57.05N,25.08E,[-1.5265-19.06225],Temperate
4,4873146,2006-05-01,23.401,0.274,Mobile,United States,31.35N,88.59W,[19.06225-39.651],Subtropics
...,...,...,...,...,...,...,...,...,...,...
95,284054,1888-11-01,19.233,0.784,Anbu,China,23.31N,116.21E,[19.06225-39.651],Tropical
96,3645762,1831-10-01,12.486,2.120,Karaj,Iran,36.17N,51.71E,[-1.5265-19.06225],Subtropics
97,1247953,1904-06-01,25.056,0.618,Córdoba,Mexico,18.48N,96.34W,[19.06225-39.651],Tropical
98,3553379,1882-02-01,0.127,0.625,Kahramanmaras,Turkey,37.78N,36.61E,[-1.5265-19.06225],Subtropics


In [16]:
sample_data['Climate'].value_counts()

Tropical      41
Temperate     34
Subtropics    24
Polar          1
Name: Climate, dtype: int64

In [17]:
contingency_tab = pd.crosstab(sample_data['AvgTemp'], sample_data['Climate'], margins = True)
contingency_tab

Climate,Tropical,Subtropics,Temperate,Polar,All
AvgTemp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
[-22.11525--1.5265],0,0,3,1,4
[-1.5265-19.06225],2,12,29,0,43
[19.06225-39.651],39,12,2,0,53
All,41,24,34,1,100


In [18]:
observed = contingency_tab.iloc[:3,:4]
observed.index.name = None
observed.columns.name = None

In [19]:
expected = np.outer(contingency_tab['All'][0:3],
                   contingency_tab.loc['All'][0:4]) / 100
expected = pd.DataFrame(expected, columns=observed.columns, index=observed.index)

In [20]:
expected

Unnamed: 0,Tropical,Subtropics,Temperate,Polar
[-22.11525--1.5265],1.64,0.96,1.36,0.04
[-1.5265-19.06225],17.63,10.32,14.62,0.43
[19.06225-39.651],21.73,12.72,18.02,0.53


In [21]:
chi_squared_stat = (((observed-expected)**2)/expected).sum().sum()
print(chi_squared_stat)

84.86008959604348


In [22]:
crit = stats.chi2.ppf(q=0.95, # Find the critical value for 95% confidence*
                      df=6)   # df = (n_row-1)(n_col-1)

In [23]:
print("Critical value:", crit)

Critical value: 12.591587243743977


In [24]:
p_value = 1 - stats.chi2.cdf(x=chi_squared_stat, df=6)
print("P value: ", p_value)

P value:  3.3306690738754696e-16


A very small p-value is strong evidence that the null hypothesis is not true, so we reject the null hypothesis. We can conclude that there is an association between Latitude and Average Temperature

In fact, latitude is one of the primary factors that affect temperature. Because of the difference in the amount of sunlight the Earth received, the lower the latitude, the warmer the region becomes and vice versa.
