In [1]:
# %load wind_statistics.py
"""
Wind Statistics
----------------

Topics: Using array methods over different axes, fancy indexing.

1. The data in 'wind.data' has the following format::

        61  1  1 15.04 14.96 13.17  9.29 13.96  9.87 13.67 10.25 10.83 12.58 18.50 15.04
        61  1  2 14.71 16.88 10.83  6.50 12.62  7.67 11.50 10.04  9.79  9.67 17.54 13.83
        61  1  3 18.50 16.88 12.33 10.13 11.17  6.17 11.25  8.04  8.50  7.67 12.75 12.71

   The first three columns are year, month and day.  The
   remaining 12 columns are average windspeeds in knots at 12
   locations in Ireland on that day.

   Use the 'loadtxt' function from numpy to read the data into
   an array.

2. Calculate the min, max and mean windspeeds and standard deviation of the
   windspeeds over all the locations and all the times (a single set of numbers
   for the entire dataset).

3. Calculate the min, max and mean windspeeds and standard deviations of the
   windspeeds at each location over all the days (a different set of numbers
   for each location)

4. Calculate the min, max and mean windspeed and standard deviations of the
   windspeeds across all the locations at each day (a different set of numbers
   for each day)

5. Find the location which has the greatest windspeed on each day (an integer
   column number for each day).

6. Find the year, month and day on which the greatest windspeed was recorded.

7. Find the average windspeed in January for each location.

You should be able to perform all of these operations without using a for
loop or other looping construct.

Bonus
~~~~~

1. Calculate the mean windspeed for each month in the dataset.  Treat
   January 1961 and January 1962 as *different* months. (hint: first find a
   way to create an identifier unique for each month. The second step might
   require a for loop.)

2. Calculate the min, max and mean windspeeds and standard deviations of the
   windspeeds across all locations for each week (assume that the first week
   starts on January 1 1961) for the first 52 weeks. This can be done without
   any for loop.

Bonus Bonus
~~~~~~~~~~~

Calculate the mean windspeed for each month without using a for loop.
(Hint: look at `searchsorted` and `add.reduceat`.)

Notes
~~~~~

These data were analyzed in detail in the following article:

   Haslett, J. and Raftery, A. E. (1989). Space-time Modelling with
   Long-memory Dependence: Assessing Ireland's Wind Power Resource
   (with Discussion). Applied Statistics 38, 1-50.


See :ref:`wind-statistics-solution`.
"""

from numpy import loadtxt



In [2]:
import numpy as np

### 1. Use the 'loadtxt' function from numpy to read the data into an array.

In [3]:
np.loadtxt?

In [4]:
# Columns 0-2 are year, month and day; columns 3-14 are the windspeeds at the
# 12 locations (see the exercise description at the top of the notebook).
a = np.loadtxt('wind.data')

In [5]:
a

array([[61.  ,  1.  ,  1.  , ..., 12.58, 18.5 , 15.04],
       [61.  ,  1.  ,  2.  , ...,  9.67, 17.54, 13.83],
       [61.  ,  1.  ,  3.  , ...,  7.67, 12.75, 12.71],
       ...,
       [78.  , 12.  , 29.  , ..., 16.42, 18.88, 29.58],
       [78.  , 12.  , 30.  , ..., 12.12, 14.67, 28.79],
       [78.  , 12.  , 31.  , ..., 11.38, 12.08, 22.08]])

In [6]:
a.shape

(6574, 15)

In [7]:
a[:,3:].shape

(6574, 12)

### 2. Calculate the min, max and mean windspeeds and standard deviation of the windspeeds over all the locations and all the times (a single set of numbers for the entire dataset).

In [8]:
# smallest reading over every location and every day
a[:, 3:].min()

0.0

In [9]:
# largest reading over every location and every day
a[:, 3:].max()

42.54

In [10]:
# mean of all readings in the dataset
a[:, 3:].mean()

10.22837377040868

In [11]:
# standard deviation of all readings in the dataset
a[:, 3:].std()

5.603840181095793

### 3. Calculate the min, max and mean windspeeds and standard deviations of the windspeeds at each location over all the days (a different set of numbers for each location)

In [12]:
# reduce over the days (axis 0): one minimum per location
a[:, 3:].min(axis=0)

array([0.67, 0.21, 1.5 , 0.  , 0.13, 0.  , 0.  , 0.  , 0.  , 0.04, 0.13,
       0.67])

In [13]:
# reduce over the days (axis 0): one maximum per location
a[:, 3:].max(axis=0)

array([35.8 , 33.37, 33.84, 28.46, 37.54, 26.16, 30.37, 31.08, 25.88,
       28.21, 42.38, 42.54])

In [14]:
# reduce over the days (axis 0): one mean per location
a[:, 3:].mean(axis=0)

array([12.36371463, 10.64644813, 11.66010344,  6.30627472, 10.45688013,
        7.09225434,  9.7968345 ,  8.49442044,  8.49581838,  8.70726803,
       13.121007  , 15.59946152])

In [15]:
# reduce over the days (axis 0): one standard deviation per location
a[:, 3:].std(axis=0)

array([5.61918301, 5.26820081, 5.00738377, 3.60513309, 4.93536333,
       3.96838126, 4.97689374, 4.49865783, 4.16746101, 4.50327222,
       5.83459319, 6.69734719])

### 4. Calculate the min, max and mean windspeed and standard deviations of the windspeeds across all the locations at each day (a different set of numbers for each day)

In [16]:
# reduce over the locations (axis 1): one minimum per day
a[:, 3:].min(axis=1)

array([9.29, 6.5 , 6.17, ..., 8.71, 9.13, 9.59])

In [17]:
# reduce over the locations (axis 1): one maximum per day
a[:, 3:].max(axis=1)

array([18.5 , 17.54, 18.5 , ..., 29.58, 28.79, 27.29])

In [18]:
# reduce over the locations (axis 1): one mean per day
a[:, 3:].mean(axis=1)

array([13.09666667, 11.79833333, 11.34166667, ..., 14.89      ,
       15.3675    , 15.4025    ])

In [19]:
# one standard deviation per day; only the shape is displayed here
a[:, 3:].std(axis=1).shape

(6574,)

### 5. Find the location which has the greatest windspeed on each day (an integer column number for each day).

In [20]:
# column index (0-11, relative to the windspeed columns) of each day's maximum
a[:, 3:].argmax(axis=1)

array([10, 10,  0, ..., 11, 11,  2], dtype=int64)

In [21]:
# sanity check: one location index per day
a[:, 3:].argmax(axis=1).shape

(6574,)

### 6. Find the year, month and day on which the greatest windspeed was recorded.

In [22]:
# without an axis, argmax returns the position in the flattened (row-major) array
a[:, 3:].argmax()

25943

In [23]:
# row (day) index: flat position divided by the 12 windspeed columns
a[:, 3:].argmax() // 12

2161

In [24]:
# column (location) index: remainder of the flat position modulo 12 columns
a[:, 3:].argmax() % 12

11

In [25]:
# check that the flat position found above really holds the maximum
a[:, 3:].flat[25943]

42.54

In [26]:
# the full record (date columns + 12 readings) of the windiest day
a[2161]

array([66.  , 12.  ,  2.  , 28.21, 27.37, 22.54, 20.62, 22.08, 16.13,
       28.16, 17.29, 22.5 , 23.16, 25.88, 42.54])

In [27]:
#year
a[2161,0]

66.0

In [28]:
#month
a[2161,1]

12.0

In [29]:
#day
a[2161,2]

2.0

In [30]:
#Testing out Alexandre lagrande chappelles solution

In [40]:
# first three columns are the date, the remaining twelve are the windspeeds
dates, winds = a[:, :3], a[:, 3:]

In [48]:
# boolean array that is True wherever a reading equals the overall maximum
np.equal(winds, winds.max())

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [49]:
# returns a tuple of arrays: the row indices and the column indices
# of the True entries in the mask

np.nonzero(winds == winds.max())

(array([2161], dtype=int64), array([11], dtype=int64))

In [54]:
# locate the maximum reading, then look its row up in the date columns
peak_mask = winds == winds.max()
row, col = np.nonzero(peak_mask)
print(row)
print(col)
print(dates[row])
print("the maximum values is on the date {}".format(dates[row]))

[2161]
[11]
[[66. 12.  2.]]
the maximum values is on the date [[66. 12.  2.]]


### 7. Find the average windspeed in January for each location.

In [None]:
## Mean per location over the January rows only (column 1 is the month);
## the following cells confirm the row selection.

a[:, 3:][a[:, 1] == 1].mean(axis=0)

In [None]:
a[a[:,1]==1].shape

In [None]:
a[:,3:][a[:,1]==1].shape

In [None]:
a[a[:,1]==1]

In [None]:
a[:,3:][a[:,1]==1]

### Bonus 1) Calculate the mean windspeed for each month in the dataset. Treat January 1961 and January 1962 as *different* months. (hint: first find a way to create an identifier unique for each month. The second step might require a for loop.)

In [None]:
a

In [None]:
str(a[:,0]) + str(a[:,1])

In [None]:
np.concatenate((a[:,0] , a[:,1]), axis=0)

In [None]:
np.concatenate((a[:,0] , a[:,1]), axis=0).shape

In [None]:
#list(map("".join,zip(str(a[:,0]),str(a[:,1]))))

In [None]:
# unique identifier per (year, month): twelve per year plus the month number
z = 12 * a[:, 0] + a[:, 1]
z

In [None]:
# z is one-dimensional, so transposing it leaves the shape unchanged
print(np.shape(z))
print(np.shape(a))
print(np.shape(z.T))

# an explicit column vector is needed to line up with the rows of `a`
print(z.reshape(-1, 1).shape)

In [None]:
# attach the month identifier as an extra (16th) column
b = np.column_stack((a, z))
b.shape

In [None]:
# first five rows with and without the appended month identifier
print(b[:5])
print(" ")
print(a[:5])

In [None]:
# Mean windspeed per month using plain dictionaries keyed by the unique
# month identifier stored in column 15 of `b`.
monthsDic = {}       # month id -> running sum of windspeeds (later the mean)
monthsDicCount = {}  # month id -> number of days seen for that month

# Columns 3..14 hold the 12 location readings, so the slice must end at 15
# (exclusive); stopping at 14 would drop the last location while the mean
# below still divides by 12.
for month, readings in zip(b[:, 15], b[:, 3:15]):
    if month not in monthsDic:
        monthsDic[month] = 0
        monthsDicCount[month] = 0
    monthsDic[month] += np.sum(readings)
    monthsDicCount[month] += 1

# every day contributes 12 readings, hence the factor 12 in the divisor
for month, total in monthsDic.items():
    monthsDic[month] = total / (monthsDicCount[month] * 12)
    
#print(monthsDic)
    
    #use the dictionary method here from your python course!


In [None]:
# Bonus

# compute the month number for each day in the dataset
# (0 for January 1961, 1 for February 1961, ..., 12 for January 1962, ...)
months = (a[:, 0] - 61) * 12 + a[:, 1] - 1

# we're going to use the month values as indices, so we need
# them to be integers
months = months.astype(int)

print(months)
print(months.shape)

# get the sorted unique month numbers
month_values = np.unique(months)

# initialize an array to hold the result
monthly_means = np.zeros(len(month_values))

for month in month_values:
    # find the rows that correspond to the current month
    day_indices = (months == month)

    # extract the windspeeds for the current month; columns 0-2 hold the
    # year, month and day and must be kept out of the mean
    month_data = a[day_indices, 3:]

    # find the mean
    monthly_means[month] = month_data.mean()

    # Note: experts might do this all-in one
    # monthly_means[month] = a[months == month, 3:].mean()

# In fact the whole for loop could reduce to the following one-liner
# monthly_means = np.array([a[months == month, 3:].mean() for month in month_values])


    
    
    
    #use a numpy array because thats the point of these exercises!

In [None]:
#set?

In [None]:
#print(monthly_means)

### Bonus 2) Calculate the min, max and mean windspeeds and standard deviations of the windspeeds across all locations for each week (assume that the first week starts on January 1 1961) for the first 52 weeks. This can be done without any for loop.

In [None]:
# compute the week number for each day in the dataset
months = (a[:, 0] - 61) * 12 + a[:, 1] - 1

days = (a[:,0] - 61) * 52 + a[:,2] - 1

In [None]:
days

In [None]:
days = days.astype(int)

In [None]:
weeks = days // 7

# get set of unique months
week_values = set(weeks)



In [None]:
# initialize an array to hold the result
weekly_means = np.zeros(len(week_values))

# initialize an array to hold the result
weekly_means = np.zeros(131)

In [None]:
#find the weekly means
for week in week_values:
    
    # find the rows that correspond to the current week
    day_indicies = (weeks == week)
    
    #extact the data for the current week using fancy indexing
    week_data = a[day_indicies,3:]
    
    #find the mean
    weekly_means[week] = week_data.mean()
    

In [None]:
weekly_means

In [None]:
# Bonus 2.
# Extract the windspeeds for the first 52 weeks (the date columns 0-2 must
# be dropped first, otherwise they get mixed into the reshaped rows). Then
# reshape the array to put on the same line 7 days worth of data for all
# 12 locations. Let Numpy figure out the number of lines needed to do so
weekly_data = a[:52 * 7, 3:].reshape(-1, 7 * 12)

print('Bonus 2. Weekly statistics over all locations')
print("")
print('  min:', weekly_data.min(axis=1))
print("")
print('  max:', weekly_data.max(axis=1))
print("")
print('  mean:', weekly_data.mean(axis=1))
print("")
print('  standard deviation:', weekly_data.std(axis=1))
print("")
print()