In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

In [5]:
# Step 1: Load the raw weather records from the CSV file.
# A forward slash keeps this relative path valid on Windows, macOS and Linux;
# a backslash is only treated as a path separator on Windows.
# NOTE(review): read_csv's default NA handling turns the literal text 'NA' into NaN.
# If the 'country' column can contain 'NA' as a code, confirm whether
# keep_default_na=False is needed here.
file_path = 'Resources/weather_all_country_codes3.csv'
weather_country_data_df = pd.read_csv(file_path)

# Step 2: Display the first few rows of the DataFrame to verify loading
print(weather_country_data_df.head())

   Unnamed: 0                 date datatype            station attributes  \
0           0  1996-01-01T00:00:00     DX70  GHCND:AEM00041218          S   
1           1  1996-01-01T00:00:00     DX90  GHCND:AEM00041218          S   
2           2  1996-01-01T00:00:00     EMXT  GHCND:AEM00041218   S,0706,    
3           3  1996-01-01T00:00:00     TMAX  GHCND:AEM00041218          S   
4           0  2004-01-01T00:00:00     DP10  GHCND:AEM00041194          S   

   value country  
0  339.0      AE  
1  218.0      AE  
2  119.0      AE  
3   96.7      AE  
4    4.0      AE  


In [7]:
# Derive the calendar year of each record from its 'date' timestamp
weather_country_data_df['year'] = pd.to_datetime(weather_country_data_df['date']).dt.year

# Derive 'country_code' as the first two characters after the ':' in 'station'
# (e.g. 'GHCND:AEM00041218' -> 'AE'). The vectorized .str accessor is used so that a
# missing or colon-less station yields NaN rather than raising an exception.
weather_country_data_df['country_code'] = (
    weather_country_data_df['station'].str.split(':').str[1].str[:2]
)

# Display the first few rows of the DataFrame to verify the two derived columns
print(weather_country_data_df.head())


   Unnamed: 0                 date datatype            station attributes  \
0           0  1996-01-01T00:00:00     DX70  GHCND:AEM00041218          S   
1           1  1996-01-01T00:00:00     DX90  GHCND:AEM00041218          S   
2           2  1996-01-01T00:00:00     EMXT  GHCND:AEM00041218   S,0706,    
3           3  1996-01-01T00:00:00     TMAX  GHCND:AEM00041218          S   
4           0  2004-01-01T00:00:00     DP10  GHCND:AEM00041194          S   

   value country  year country_code  
0  339.0      AE  1996           AE  
1  218.0      AE  1996           AE  
2  119.0      AE  1996           AE  
3   96.7      AE  1996           AE  
4    4.0      AE  2004           AE  


In [11]:
# Datatypes whose records are collapsed to a single mean value per
# (year, country_code, datatype) combination
datatypes_of_interest = ['EMNT', 'EMXT', 'MNPN', 'MXPN', 'TAVG', 'TMAX']


def _group_key(frame):
    """Return a Series of 'year_countrycode_datatype' strings identifying each row's group."""
    return frame['year'].astype(str) + '_' + frame['country_code'] + '_' + frame['datatype']


# Average 'value' within each group. The first date/station/attributes/country of the
# group are kept as representative metadata; including 'country' stops the averaged
# rows from ending up with a missing country once they are concatenated below.
is_of_interest = weather_country_data_df['datatype'].isin(datatypes_of_interest)
averaged_df = (
    weather_country_data_df[is_of_interest]
    .groupby(['year', 'country_code', 'datatype'])
    .agg({
        'value': 'mean',
        'date': 'first',
        'station': 'first',
        'attributes': 'first',
        'country': 'first',
    })
    .reset_index()
)

# Build the group keys as standalone Series rather than as columns, so the source
# DataFrame is not left carrying a scratch 'unique_key' column after this cell runs.
raw_keys = _group_key(weather_country_data_df)
averaged_keys = _group_key(averaged_df)

# Keep only the original records whose group was not averaged
remaining_df = weather_country_data_df[~raw_keys.isin(averaged_keys)]

# Append the averaged records to the untouched ones
final_df = pd.concat([remaining_df, averaged_df], ignore_index=True)

# Display the first few rows of the combined DataFrame to verify the result
print(final_df.head())

   Unnamed: 0                 date datatype            station  attributes  \
0         0.0  1996-01-01T00:00:00     DX70  GHCND:AEM00041218           S   
1         1.0  1996-01-01T00:00:00     DX90  GHCND:AEM00041218           S   
2         0.0  2004-01-01T00:00:00     DP10  GHCND:AEM00041194           S   
3         1.0  2004-01-01T00:00:00     DP1X  GHCND:AEM00041194           S   
4         2.0  2004-01-01T00:00:00     EMXP  GHCND:AEM00041194   ,S,1116,    

   value country  year country_code  
0  339.0      AE  1996           AE  
1  218.0      AE  1996           AE  
2    4.0      AE  2004           AE  
3    0.0      AE  2004           AE  
4    0.9      AE  2004           AE  


In [13]:
# Reshape to wide form: one row per (country_code, year) and one column per datatype.
# Where a cell has several records, aggfunc='first' keeps just one of them.
wide_df = final_df.pivot_table(
    index=['country_code', 'year'],
    columns='datatype',
    values='value',
    aggfunc='first',
)

# Turn the index levels back into ordinary columns and clear the leftover
# 'datatype' label on the column axis, leaving a flat header of plain names.
pivot_df = wide_df.reset_index().rename_axis(columns=None)

# Display the first few rows of the pivoted DataFrame to verify the reshape
print(pivot_df.head())

  country_code  year  DP10  DP1X  DT32   DX70   DX90  EMNT  EMXP   EMXT  HTDD  \
0           AE  1996   NaN   NaN   NaN  339.0  218.0   NaN   NaN  119.0   NaN   
1           AE  2004   4.0   0.0   NaN    NaN    NaN   NaN  0.90    NaN   NaN   
2           AE  2006   7.0   1.0   NaN  335.0  231.0   NaN  1.77  118.0   NaN   
3           AE  2008   NaN   NaN   NaN  330.0  247.0   NaN   NaN  120.0   NaN   
4           AE  2009   NaN   NaN   NaN  329.0  234.0   NaN   NaN  121.0   NaN   

   MNPN  MXPN  PRCP  TAVG  TMAX  
0   NaN   NaN   NaN   NaN  96.7  
1   NaN   NaN  2.31   NaN   NaN  
2   NaN   NaN  4.08   NaN  97.2  
3   NaN   NaN   NaN   NaN  97.3  
4   NaN   NaN   NaN   NaN  97.9  


In [14]:
# Write the pivoted DataFrame to a CSV file in the Resources folder.
# index=False leaves the row index out of the file.
output_file_path = 'Resources/country_weather_adj.csv'
pivot_df.to_csv(path_or_buf=output_file_path, index=False)