# COUNTRIES

In [13]:
# Import the necessary libraries
import datetime as dt
import json
import zipfile
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [14]:
# Widen the column display limit to 200 characters, then load the per-country
# temperature table and show it
pd.set_option("display.max_colwidth", 200)
country_df = pd.read_csv(filepath_or_buffer="Data/GlobalLandTemperaturesByCountry.csv")
country_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland
...,...,...,...,...
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.000,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe


In [15]:
# Materialise the column labels as a plain Python list
columns = list(country_df.columns)
columns

['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'Country']

In [16]:
# Check the data type pandas inferred for each column
country_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
Country                           object
dtype: object

In [17]:
# Preview the first five rows of the raw country table
country_df.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


In [18]:
# Parse the 'dt' strings into pandas datetime values
country_df["dt"] = country_df["dt"].pipe(pd.to_datetime)

In [19]:
# Pull the calendar year out of every timestamp and preview the result
years = country_df.loc[:, "dt"].dt.year
years.head(n=5)

0    1743
1    1743
2    1744
3    1744
4    1744
Name: dt, dtype: int64

In [20]:
# Add a 'Year' column right after 'dt'. DataFrame.insert raises ValueError if
# the column already exists, so guard the call to keep this cell safe to
# re-run without restarting the kernel.
if "Year" not in country_df.columns:
    country_df.insert(1, "Year", country_df["dt"].dt.year)
country_df.tail()

Unnamed: 0,dt,Year,AverageTemperature,AverageTemperatureUncertainty,Country
577457,2013-05-01,2013,19.059,1.022,Zimbabwe
577458,2013-06-01,2013,17.613,0.473,Zimbabwe
577459,2013-07-01,2013,17.0,0.453,Zimbabwe
577460,2013-08-01,2013,19.759,0.717,Zimbabwe
577461,2013-09-01,2013,,,Zimbabwe


In [23]:
# Keep only the columns needed for the yearly aggregation
country_df2 = country_df.loc[:, ['Year', 'AverageTemperature', 'Country']]
country_df2.head(n=5)

Unnamed: 0,Year,AverageTemperature,Country
0,1743,4.384,Åland
1,1743,,Åland
2,1744,,Åland
3,1744,,Åland
4,1744,,Åland


In [25]:
# Check the data types of the trimmed frame
country_df2.dtypes

Year                    int64
AverageTemperature    float64
Country                object
dtype: object

In [26]:
# Count the distinct values in 'Country' (a missing value, if present, is
# counted once)
country_df["Country"].nunique(dropna=False)

243

In [28]:
# Average the temperature readings for every (Year, Country) pair
country_df3 = (
    country_df2
    .groupby(['Year', 'Country'], as_index=False)['AverageTemperature']
    .mean()
)
country_df3.head(n=5)

Unnamed: 0,Year,Country,AverageTemperature
0,1743,Albania,8.62
1,1743,Andorra,7.556
2,1743,Austria,2.482
3,1743,Belarus,0.767
4,1743,Belgium,7.106


In [29]:
# Label the aggregated temperature column as a yearly average. 'Year' already
# carries its final name from the grouping step and there is no 'dt' column in
# this frame, so only one column needs renaming.
country_df4 = country_df3.rename(columns={'AverageTemperature': 'Yearly Average Temperature'})

# Preview the frame with the average temperature for each year
country_df4.head()

Unnamed: 0,Year,Country,Yearly Average Temperature
0,1743,Albania,8.62
1,1743,Andorra,7.556
2,1743,Austria,2.482
3,1743,Belarus,0.767
4,1743,Belgium,7.106


In [30]:
# Count the distinct values in 'Country' of the raw frame (a missing value,
# if present, is counted once)
country_df["Country"].nunique(dropna=False)

243

In [34]:
# The latest year is the largest 'Year' present in the aggregated data; the
# earliest year is fixed at 1950 to align with the rest of the data
latest_year = country_df4["Year"].max()
earliest_year = 1950

In [35]:
# Show the largest year found in the aggregated country data
latest_year

2013

In [36]:
# Build a one-line summary of the year span and print it
date_range = f"Our date range is {earliest_year} to {latest_year}"
print(date_range)

Our date range is 1950 to 2013


In [38]:
# Keep only the rows from the earliest year of interest onward
cleaned_countries = country_df4.loc[lambda frame: frame['Year'] >= earliest_year]
cleaned_countries.head(n=5)

Unnamed: 0,Year,Country,Yearly Average Temperature
32691,1950,Afghanistan,13.0435
32692,1950,Africa,23.880833
32693,1950,Albania,13.812833
32694,1950,Algeria,22.781833
32695,1950,American Samoa,26.520083


In [42]:
# Put 'Country' first, followed by 'Year' and the yearly average
reorderd_df = cleaned_countries.loc[:, ["Country", "Year", "Yearly Average Temperature"]]
reorderd_df.head(n=5)

Unnamed: 0,Country,Year,Yearly Average Temperature
32691,Afghanistan,1950,13.0435
32692,Africa,1950,23.880833
32693,Albania,1950,13.812833
32694,Algeria,1950,22.781833
32695,American Samoa,1950,26.520083


In [43]:
# Discard every row that has a missing value in any column
final_df = reorderd_df.dropna(axis="index", how="any")
final_df.head(n=5)

Unnamed: 0,Country,Year,Yearly Average Temperature
32691,Afghanistan,1950,13.0435
32692,Africa,1950,23.880833
32693,Albania,1950,13.812833
32694,Algeria,1950,22.781833
32695,American Samoa,1950,26.520083


In [44]:
# Save the cleaned country table. The row index is only the leftover labels
# from the earlier filtering (32691, 32692, ...), so it is not written to the
# file, matching how the Cities table is exported.
final_df.to_csv("Cleaned Data/Countries.csv", index=False)

# STATES 

In [None]:
# Load the per-state temperature table and show it
state_df = pd.read_csv(filepath_or_buffer="Data/GlobalLandTemperaturesByState.csv")
state_df

In [None]:
# Widen the column display limit to 200 characters and preview the first rows
pd.set_option("display.max_colwidth", 200)
state_df.head(n=5)

In [None]:
# Materialise the state table's column labels as a plain Python list
columns = list(state_df.columns)
columns

In [None]:
# Check the data type pandas inferred for each column
state_df.dtypes

In [None]:
# Parse the 'dt' strings into pandas datetime values
state_df["dt"] = state_df["dt"].pipe(pd.to_datetime)

In [None]:
# Check the data types again now that 'dt' has been converted
state_df.dtypes

In [None]:
# Count the distinct values in 'State' (a missing value, if present, is
# counted once)
state_df["State"].nunique(dropna=False)

In [None]:
# Average the temperature for every (dt, State, Country) combination, then
# relabel 'dt' as 'Year' and the mean as 'Yearly Average Temperature'.
# NOTE(review): 'dt' is grouped as a full timestamp and is not reduced to a
# calendar year here, so the 'Year' column still holds timestamps and the means
# are per timestamp despite the "Yearly" label. Confirm that is intended.
yearly_avg_state_df = (
    state_df
    .groupby(['dt', 'State', 'Country'], as_index=False)['AverageTemperature']
    .mean()
    .rename(columns={'dt': 'Year', 'AverageTemperature': 'Yearly Average Temperature'})
)

# Preview the first rows of the aggregated frame
yearly_avg_state_df.head(n=5)

In [None]:
# 'Year' is the renamed 'dt' column and still holds full timestamps, so the
# incoming means are per timestamp rather than per calendar year. Reduce each
# timestamp to its calendar year, keep 1950 onward (the same cutoff the
# Countries section uses, so January 1950 is no longer dropped), and average
# per (Year, State, Country) so the column names match their contents.
cleaned_states = (
    yearly_avg_state_df
    .assign(Year=lambda frame: frame['Year'].dt.year)
    .loc[lambda frame: frame['Year'] >= 1950]
    .groupby(['Year', 'State', 'Country'], as_index=False)['Yearly Average Temperature']
    .mean()
)
cleaned_states

In [None]:
# Discard every row that has a missing value in any column
cleaned_states_grouped_dropna = cleaned_states.dropna(axis="index", how="any")
cleaned_states_grouped_dropna

In [None]:
# Save the cleaned state table. The row index carries no information of its
# own, so it is not written to the file, matching how the Cities table is
# exported.
cleaned_states_grouped_dropna.to_csv("Cleaned Data/States.csv", index=False)

# CITIES 

In [None]:
pd.options.display.max_colwidth = 200

# pd.read_csv did not work on the zipped file (compatibility issue), so the
# archive is unpacked by hand. To avoid re-extracting on every run, unpack only
# when the CSV is missing or its size differs from the archived member (for
# example after an interrupted extraction).
city_csv_path = Path("Data/GlobalLandTemperaturesByCity.csv")
with zipfile.ZipFile("Data/GlobalLandTemperaturesByCity.csv.zip", 'r') as zip_ref:
    # Assumes the archive stores the CSV at its root under this exact name,
    # which the read below also relies on.
    archived_size = zip_ref.getinfo(city_csv_path.name).file_size
    if not city_csv_path.exists() or city_csv_path.stat().st_size != archived_size:
        zip_ref.extractall("Data/")

# Read the extracted CSV file
city_df = pd.read_csv(city_csv_path)

city_df

In [None]:
# Materialise the city table's column labels as a plain Python list
city_columns = city_df.columns.to_list()
city_columns

In [None]:
# Check the data type pandas inferred for each column
city_df.dtypes

In [None]:
# Parse the 'dt' strings into pandas datetime values
city_df["dt"] = city_df["dt"].pipe(pd.to_datetime)

In [None]:
# Check the data types again now that 'dt' has been converted
city_df.dtypes

In [None]:
# Count the distinct values in 'Country' of the city table (a missing value,
# if present, is counted once)
city_df["Country"].nunique(dropna=False)

In [None]:
# Count the distinct values in 'City' (a missing value, if present, is
# counted once)
city_df["City"].nunique(dropna=False)

In [None]:
# Average the temperature for every (dt, City, Country) combination, then
# relabel 'dt' as 'Year' and the mean as 'Yearly Average Temperature'.
# NOTE(review): 'dt' is grouped as a full timestamp and is not reduced to a
# calendar year here, so the 'Year' column still holds timestamps and the means
# are per timestamp despite the "Yearly" label. Confirm that is intended.
yearly_avg_city_df = (
    city_df
    .groupby(['dt', 'City', 'Country'], as_index=False)['AverageTemperature']
    .mean()
    .rename(columns={'dt': 'Year', 'AverageTemperature': 'Yearly Average Temperature'})
)

# Show the aggregated frame
yearly_avg_city_df


In [None]:
# 'Year' is the renamed 'dt' column and still holds full timestamps, so the
# incoming means are per timestamp rather than per calendar year. Reduce each
# timestamp to its calendar year, keep 1950 onward (the same cutoff the
# Countries section uses, so January 1950 is no longer dropped), and average
# per (Year, City, Country) so the column names match their contents.
cleaned_cities = (
    yearly_avg_city_df
    .assign(Year=lambda frame: frame['Year'].dt.year)
    .loc[lambda frame: frame['Year'] >= 1950]
    .groupby(['Year', 'City', 'Country'], as_index=False)['Yearly Average Temperature']
    .mean()
)
cleaned_cities

In [None]:
# Discard every row that has a missing value in any column
final_city_df = cleaned_cities.dropna(axis="index", how="any")
final_city_df

In [None]:
# Save the cleaned city table to disk without writing the row index
final_city_df.to_csv("Cleaned Data/Cities.csv", index=False)