In [3]:
import pandas as pd
import numpy as np

# Step 2: Import the dataset and replace the first 3 columns by a proper datetime index
data = pd.read_csv('wind.txt', delim_whitespace=True, parse_dates=[[0, 1, 2]])
data = data.set_index('Yr_Mo_Dy')

# Step 4: Create a function to fix the year and apply it
def fix_year(year):
    if year > 2000:
        year -= 100
    return year

data.index = data.index.map(lambda x: x.replace(year=fix_year(x.year)))

# Step 6: Compute how many values are missing for each location over the entire record
missing_values = data.isnull().sum()

# Step 7: Compute how many non-missing values there are in total
non_missing_count = data.notnull().sum().sum()

# Step 8: Calculate the mean windspeeds over all the locations and all the times
mean_windspeed = data.mean().mean()

# Step 9: Calculate min, max, mean, and standard deviation windspeeds at each location
loc_stats = data.agg(['min', 'max', 'mean', 'std'], axis=0)

# Step 10: Calculate min, max, mean, and standard deviation windspeeds for each day
day_stats = data.agg(['min', 'max', 'mean', 'std'], axis=1)

# Step 11: Find the average windspeed in January for each location
jan_avg = data[data.index.month == 1].groupby(data.index.year).mean()

# Step 12: Downsample to yearly frequency for each location
yearly_data = data.resample('Y').mean()

# Step 13: Downsample to monthly frequency for each location
monthly_data = data.resample('M').mean()

# Step 14: Downsample to weekly frequency for each location
weekly_data = data.resample('W-MON').mean()

# Step 15: Calculate min, max, mean, and standard deviation windspeeds for the first 52 weeks
weekly_stats = weekly_data.head(52).agg(['min', 'max', 'mean', 'std'], axis=0)

print("Missing values:\n", missing_values)
print("\nNon-missing values count:", non_missing_count)
print("\nMean windspeed over all locations and times:", mean_windspeed)
print("\nLocation statistics:\n", loc_stats)
print("\nDay statistics:\n", day_stats)
print("\nAverage windspeed in January:\n", jan_avg)
print("\nYearly data:\n", yearly_data)
print("\nMonthly data:\n", monthly_data)
print("\nWeekly data:\n", weekly_data)
print("\nWeekly statistics for the first 52 weeks:\n", weekly_stats)


Missing values:
 RPT    0
VAL    1
ROS    0
KIL    0
SHA    1
BIR    0
DUB    0
CLA    1
MUL    0
CLO    0
BEL    0
MAL    0
dtype: int64

Non-missing values count: 33

Mean windspeed over all locations and times: 12.053333333333333

Location statistics:
             RPT        VAL        ROS        KIL        SHA       BIR  \
min   14.710000  14.960000  10.830000   6.500000  11.170000  6.170000   
max   18.500000  16.880000  13.170000  10.130000  12.620000  9.870000   
mean  16.083333  15.920000  12.110000   8.640000  11.895000  7.903333   
std    2.099389   1.357645   1.185411   1.900289   1.025305  1.861003   

            DUB        CLA        MUL        CLO        BEL       MAL  
min   11.250000  10.040000   8.500000   7.670000  12.750000  12.71000  
max   13.670000  10.250000  10.830000  12.580000  18.500000  15.04000  
mean  12.140000  10.145000   9.706667   9.973333  16.263333  13.86000  
std    1.330902   0.148492   1.167233   2.469015   3.080265   1.16529  

Day statistics:
 

  data = pd.read_csv('wind.txt', delim_whitespace=True, parse_dates=[[0, 1, 2]])
