In [3]:
import numpy as np
import pandas as pd

## Read File

In [4]:
# Load the source table from disk, then preview its first five rows to confirm
# the layout (two identifier columns followed by one column per month).
df = pd.read_csv("temperature.csv", encoding="utf-8")
print(df.head(n=5))

  code             name  1901-01  1901-02  1901-03  1901-04  1901-05  1901-06  \
0  ABW    Aruba (Neth.)    27.20    27.10    27.60    28.60    29.20    28.70   
1  AFG      Afghanistan    -0.82     2.15     8.73    13.55    17.94    21.73   
2  AGO           Angola    22.50    22.65    22.69    21.48    20.19    18.43   
3  AIA  Anguilla (U.K.)    26.10    26.40    26.30    27.20    27.60    28.00   
4  ALA          Finland    -4.15    -7.16    -3.21     2.68     8.31    12.82   

   1901-07  1901-08  ...  2023-03  2023-04  2023-05  2023-06  2023-07  \
0    29.10    29.70  ...    27.70    28.40    30.00    30.70    30.40   
1    24.86    23.18  ...    11.43    15.13    19.09    24.73    26.31   
2    18.00    19.90  ...    22.73    22.38    21.06    18.77    18.01   
3    28.00    28.50  ...    26.50    27.10    28.30    29.80    29.90   
4    19.11    17.11  ...    -1.17     4.30     8.72    14.99    15.47   

   2023-08  2023-09  2023-10  2023-11  2023-12  
0    31.10    31.80    30

## Rename Headings

In [5]:
# Give the two identifier columns descriptive names. The monthly temperature
# columns are not part of the mapping, so they keep their original labels.
df = df.rename(columns=dict(code="Country_Code", name="Country_Name"))
print(df.head(n=5))

  Country_Code     Country_Name  1901-01  1901-02  1901-03  1901-04  1901-05  \
0          ABW    Aruba (Neth.)    27.20    27.10    27.60    28.60    29.20   
1          AFG      Afghanistan    -0.82     2.15     8.73    13.55    17.94   
2          AGO           Angola    22.50    22.65    22.69    21.48    20.19   
3          AIA  Anguilla (U.K.)    26.10    26.40    26.30    27.20    27.60   
4          ALA          Finland    -4.15    -7.16    -3.21     2.68     8.31   

   1901-06  1901-07  1901-08  ...  2023-03  2023-04  2023-05  2023-06  \
0    28.70    29.10    29.70  ...    27.70    28.40    30.00    30.70   
1    21.73    24.86    23.18  ...    11.43    15.13    19.09    24.73   
2    18.43    18.00    19.90  ...    22.73    22.38    21.06    18.77   
3    28.00    28.00    28.50  ...    26.50    27.10    28.30    29.80   
4    12.82    19.11    17.11  ...    -1.17     4.30     8.72    14.99   

   2023-07  2023-08  2023-09  2023-10  2023-11  2023-12  
0    30.40    31.10   

## Checking for Duplicate Rows

In [6]:
# duplicated() flags every row that exactly repeats an earlier row; summing
# the boolean flags yields the number of redundant rows.
duplicates = df.duplicated(keep="first").sum()
print(f"Total duplicate rows: {duplicates}")

Total duplicate rows: 0


In [7]:
# Distinct country codes present in the table.
unique_codes = df.loc[:, "Country_Code"].unique()
print(f"Unique country codes: {len(unique_codes)}")

# How often each country name occurs; a count above 1 means that several
# codes share the same name.
print(df.loc[:, "Country_Name"].value_counts())

Unique country codes: 246
Country_Name
France              6
Finland             2
China               2
Aruba (Neth.)       1
Oman                1
                   ..
Grenada             1
Greenland (Den.)    1
Guatemala           1
Guam (U.S.)         1
Zimbabwe            1
Name: count, Length: 239, dtype: int64


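Some Country_Name values are duplicated even though every Country_Code is unique. This is because the dataset lists certain territories under the name of the country they are associated with: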

France: FRA, GLP, GUF, MTQ, MYT, REU

Finland: ALA, FIN

China: TWN, CHN

## Converting Duplicate Country Names

To avoid confusion when comparing locations, each of these territories is given its own distinct name.

In [8]:
# Territories that the raw data files under another country's name, keyed by
# their own country code, with the distinct name each one should carry instead.
territory_names = {
    "TWN": "Taiwan",
    "ALA": "Åland Islands",
    "GLP": "Guadeloupe",
    "GUF": "French Guiana",
    "MTQ": "Martinique",
    "MYT": "Mayotte",
    "REU": "Réunion",
}

# Overwrite the name only on rows whose code matches; other rows are untouched.
for territory_code, territory_name in territory_names.items():
    df.loc[df["Country_Code"] == territory_code, "Country_Name"] = territory_name

In [9]:
# Print the rows for each territory code, one code at a time, so the
# Country_Name now stored for each of them can be checked by eye.
for territory_code in ("TWN", "ALA", "GLP", "GUF", "MTQ", "MYT", "REU"):
    print(df[df["Country_Code"] == territory_code])

    Country_Code Country_Name  1901-01  1901-02  1901-03  1901-04  1901-05  \
225          TWN       Taiwan    15.84    10.59    14.63    19.27    20.94   

     1901-06  1901-07  1901-08  ...  2023-03  2023-04  2023-05  2023-06  \
225    22.65    23.94    23.04  ...    17.64    20.41    22.45    24.51   

     2023-07  2023-08  2023-09  2023-10  2023-11  2023-12  
225    25.74     25.2    24.69     22.6     19.7     16.8  

[1 rows x 1478 columns]
  Country_Code   Country_Name  1901-01  1901-02  1901-03  1901-04  1901-05  \
4          ALA  Åland Islands    -4.15    -7.16    -3.21     2.68     8.31   

   1901-06  1901-07  1901-08  ...  2023-03  2023-04  2023-05  2023-06  \
4    12.82    19.11    17.11  ...    -1.17      4.3     8.72    14.99   

   2023-07  2023-08  2023-09  2023-10  2023-11  2023-12  
4    15.47    15.91    14.78      6.5      1.8    -1.28  

[1 rows x 1478 columns]
   Country_Code Country_Name  1901-01  1901-02  1901-03  1901-04  1901-05  \
82          GLP   Guadelo

In [10]:
# Repeat the uniqueness check: the number of distinct codes printed first can
# be compared with the length of the name-frequency table printed second.
unique_codes = df.loc[:, "Country_Code"].unique()
print(f"Unique country codes: {len(unique_codes)}")

# Every name should now show a count of 1 if no two codes share a name.
print(df.loc[:, "Country_Name"].value_counts())

Unique country codes: 246
Country_Name
Aruba (Neth.)          1
Malawi                 1
Mayotte                1
Namibia                1
New Caledonia (Fr.)    1
                      ..
Greece                 1
Grenada                1
Greenland (Den.)       1
Guatemala              1
Zimbabwe               1
Name: count, Length: 246, dtype: int64


## Convert to DateTime

In [11]:
# Reshape from wide (one column per month) to long (one row per country and
# month), then parse the former column labels ("YYYY-MM") into timestamps.
df = df.melt(
    id_vars=["Country_Code", "Country_Name"],
    var_name="Year_Month",
    value_name="Temperature",
).assign(
    Year_Month=lambda long_df: pd.to_datetime(long_df["Year_Month"], format="%Y-%m")
)

print(df.head(n=5))

  Country_Code     Country_Name Year_Month  Temperature
0          ABW    Aruba (Neth.) 1901-01-01        27.20
1          AFG      Afghanistan 1901-01-01        -0.82
2          AGO           Angola 1901-01-01        22.50
3          AIA  Anguilla (U.K.) 1901-01-01        26.10
4          ALA    Åland Islands 1901-01-01        -4.15


## Removing Rows with Missing Values

In [12]:
# Count the NaN cells in each column, then total those counts for the frame.
missing_values = df.isna().sum(axis=0).sum()
print(f"Total missing values: {missing_values}")

Total missing values: 2952


In [13]:
# Keep only the rows that contain at least one NaN so they can be inspected.
df_missing = df.loc[df.isna().any(axis="columns")]
print(df_missing)

       Country_Code            Country_Name Year_Month  Temperature
26              BLM  Saint-Barthélemy (Fr.) 1901-01-01          NaN
35              BVT    Bouvet Island (Nor.) 1901-01-01          NaN
272             BLM  Saint-Barthélemy (Fr.) 1901-02-01          NaN
281             BVT    Bouvet Island (Nor.) 1901-02-01          NaN
518             BLM  Saint-Barthélemy (Fr.) 1901-03-01          NaN
...             ...                     ...        ...          ...
362393          BVT    Bouvet Island (Nor.) 2023-10-01          NaN
362630          BLM  Saint-Barthélemy (Fr.) 2023-11-01          NaN
362639          BVT    Bouvet Island (Nor.) 2023-11-01          NaN
362876          BLM  Saint-Barthélemy (Fr.) 2023-12-01          NaN
362885          BVT    Bouvet Island (Nor.) 2023-12-01          NaN

[2952 rows x 4 columns]


In [14]:
# Remove every row belonging to the codes BLM and BVT: look up the index
# labels of the matching rows and drop those labels from the frame.
df = df.drop(index=df.index[df["Country_Code"].isin(["BLM", "BVT"])])

In [15]:
# Re-run the NaN tally on the current frame: per-column counts, summed.
missing_values = df.isna().sum(axis=0).sum()
print(f"Total missing values: {missing_values}")

Total missing values: 0


In [16]:
# Select any rows that still contain a NaN; an empty frame means none remain.
df_missing = df.loc[df.isna().any(axis="columns")]
print(df_missing)

Empty DataFrame
Columns: [Country_Code, Country_Name, Year_Month, Temperature]
Index: []


In [12]:
# Preview the first five rows of the current frame.
print(df.head(n=5))

  Country_Code     Country_Name Year_Month  Temperature
0          ABW    Aruba (Neth.) 1901-01-01        27.20
1          AFG      Afghanistan 1901-01-01        -0.82
2          AGO           Angola 1901-01-01        22.50
3          AIA  Anguilla (U.K.) 1901-01-01        26.10
4          ALA          Finland 1901-01-01        -4.15


## Saving Cleaned Dataset

In [17]:
# Persist the current frame to disk. index=False leaves the row index out of
# the file; "utf-8-sig" writes a UTF-8 byte-order mark at the start of it.
df.to_csv(path_or_buf="cleaned_temperature.csv", encoding="utf-8-sig", index=False)

In [14]:
# Reload the saved file to verify the round trip.
# - "utf-8-sig" matches the encoding the file was written with, so the leading
#   byte-order mark is consumed by the decoder instead of relying on the
#   parser to skip it.
# - parse_dates restores Year_Month as datetime64; without it the column comes
#   back from CSV as plain strings, undoing the earlier datetime conversion.
df = pd.read_csv(
    "cleaned_temperature.csv",
    encoding="utf-8-sig",
    parse_dates=["Year_Month"],
)
print(df.head())

  Country_Code     Country_Name  Year_Month  Temperature
0          ABW    Aruba (Neth.)  1901-01-01        27.20
1          AFG      Afghanistan  1901-01-01        -0.82
2          AGO           Angola  1901-01-01        22.50
3          AIA  Anguilla (U.K.)  1901-01-01        26.10
4          ALA          Finland  1901-01-01        -4.15
