In [17]:
# Import NumPy library
import numpy as np

# The import above provides numpy.random.choice, which every example below uses.
#
# The explanations are written as comments rather than bare string literals:
# a bare string at the end of a notebook cell is echoed as the cell's output.

# Example 1: Sampling without replacement
# Draw three elements from [1, 2, 3, 4, 5]; with replace=False no element
# can appear more than once in the sample.
data = np.array([1, 2, 3, 4, 5])
sample = np.random.choice(data, size=3, replace=False)

print("Example 1:")
print("Original array:", data)
print("Sampled without replacement:", sample)

# Example 2: Sampling with replacement
# Draw five elements from the same array; with replace=True the same element
# may be drawn several times, so duplicates can appear in the output.
sample_with_replacement = np.random.choice(data, size=5, replace=True)

print("\nExample 2:")
print("Sampled with replacement:", sample_with_replacement)

# Example 3: Weighted sampling
# The p parameter gives each element of the array its own selection
# probability (the entries of `weights` line up with `weighted_data`
# position by position and sum to 1).
weighted_data = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.2, 0.2])
weighted_sample = np.random.choice(weighted_data, size=3, replace=False, p=weights)

print("\nExample 3:")
print("Weighted data array:", weighted_data)
print("Weights:", weights)
print("Weighted sample:", weighted_sample)

Example 1:
Original array: [1 2 3 4 5]
Sampled without replacement: [3 1 2]

Example 2:
Sampled with replacement: [1 4 5 2 4]

Example 3:
Weighted data array: [1 2 3 4 5]
Weights: [0.1 0.2 0.3 0.2 0.2]
Weighted sample: [2 4 5]


'Explanation: In the fourth cell, we illustrate how to perform weighted sampling using the p parameter, where each element has specified probabilities. The example includes an array [1, 2, 3, 4, 5] with corresponding weights.'

In [28]:
import csv
from datetime import datetime
from itertools import groupby

# Update the CSV location
csv_location = './data/prob2_AB0208.csv'

# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for file objects handed to a reader.
with open(csv_location, newline='') as handle:
    raw_rows = list(csv.DictReader(handle))

# Display the column names to identify the correct ones.
# Guarded so that a file without any data rows does not raise IndexError.
if raw_rows:
    print("Column names in the CSV file:", raw_rows[0].keys())

# Update the column names in the code accordingly
# For example, if the actual column names are 'TimeStamp', 'RotorSpeed_rpm', 'WindSpeed_mps'
# Only the three columns of interest are kept; the timestamp is left as the raw
# text here and converted exactly once in step a). Empty numeric fields become 0.0.
mydata = [{'Timestamp': row['TimeStamp'],
           'Rotorspeed': float(row['RotorSpeed_rpm']) if row['RotorSpeed_rpm'] else 0.0,
           'Windspeed': float(row['WindSpeed_mps']) if row['WindSpeed_mps'] else 0.0}
          for row in raw_rows]

# a) Convert timestamps into the format MM/dd/yyyy HH:mm:ss using map
# The source text looks like 'YYYY-MM-DD HH:MM:SS.ffffff'; {**row, ...} copies
# the other fields and replaces 'Timestamp' in place, keeping the key order.
mydata = list(map(lambda row: {**row,
                               'Timestamp': datetime.strptime(row['Timestamp'], '%Y-%m-%d %H:%M:%S.%f').strftime('%m/%d/%Y %H:%M:%S')},
                 mydata))

# b) Sort the rows according to increasing rotorspeed using sorted and lambda
mydata = sorted(mydata, key=lambda row: row['Rotorspeed'])

# c) Add a column called WindSpeed_Group using itertools.groupby
def categorize_windspeed(windspeed):
    """Return the wind-speed group label for a single wind-speed value.

    'A' for values below 5, 'B' for values from 5 up to and including 10,
    and 'C' for everything else.
    """
    if windspeed < 5:
        return 'A'
    # Anything reaching this point is not below 5, so only the upper bound
    # still needs checking.
    if windspeed <= 10:
        return 'B'
    return 'C'

# Add WindSpeed_Group column
# groupby walks the rows in their current order and bundles consecutive rows
# whose wind speed falls into the same category; every row is labelled with
# its own category and the existing row order is left untouched.
for group_label, members in groupby(mydata, key=lambda entry: categorize_windspeed(entry['Windspeed'])):
    for entry in members:
        entry['WindSpeed_Group'] = group_label

# Display the modified data
for entry in mydata:
    print(entry)


Column names in the CSV file: dict_keys(['TimeStamp', 'ReactivePower_kVAr', 'Power_kW', 'WindSpeed_mps', 'ErrorCode', 'GenState', 'RunState', 'RotorSpeed_rpm', 'RotorSpeedAve', 'RotorOverSpeed', 'RotorUnderSpeed'])
{'Timestamp': '04/23/2018 08:56:03', 'Rotorspeed': -0.3300243, 'Windspeed': 15.1633, 'WindSpeed_Group': 'C'}
{'Timestamp': '04/24/2018 12:38:45', 'Rotorspeed': -0.2700199, 'Windspeed': 9.455032, 'WindSpeed_Group': 'B'}
{'Timestamp': '04/24/2018 07:14:16', 'Rotorspeed': -0.2550188, 'Windspeed': 13.22011, 'WindSpeed_Group': 'C'}
{'Timestamp': '04/24/2018 07:31:58', 'Rotorspeed': -0.2400177, 'Windspeed': 9.337159, 'WindSpeed_Group': 'B'}
{'Timestamp': '04/25/2018 12:07:51', 'Rotorspeed': -0.2400177, 'Windspeed': 7.309283, 'WindSpeed_Group': 'B'}
{'Timestamp': '04/30/2018 13:39:19', 'Rotorspeed': -0.2400177, 'Windspeed': 6.916753, 'WindSpeed_Group': 'B'}
{'Timestamp': '04/20/2018 14:44:13', 'Rotorspeed': -0.2250166, 'Windspeed': 6.644386, 'WindSpeed_Group': 'B'}
{'Timestamp': '0

In [29]:
import numpy as np
import time

def original_integration(f, a, b, n):
    """Approximate the integral of f over [a, b] with the composite trapezoidal rule.

    The interval is split into n equally wide subintervals. The two end
    points are weighted by one half, and f is then evaluated one interior
    node at a time in a plain Python loop.
    """
    width = (b - a) / n
    # End points carry half weight in the trapezoidal rule.
    acc = 0.5 * (f(a) + f(b))
    for k in range(1, n):
        node = a + k * width
        acc = acc + f(node)
    return acc * width

def vectorized_integration(f, a, b, n):
    """Approximate the integral of f over [a, b] with the composite trapezoidal rule using NumPy.

    Parameters
    ----------
    f : callable
        Integrand. It is called once with the array of interior nodes and
        once each with the scalars ``a`` and ``b``.
    a, b : number
        Integration limits.
    n : int
        Number of equally wide subintervals.

    Returns
    -------
    The trapezoidal-rule estimate of the integral.
    """
    x = np.linspace(a, b, n + 1)
    h = (b - a) / n
    interior = x[1:-1]
    # f may return a scalar for array input (e.g. a constant integrand such as
    # ``lambda x: 3.0``). Broadcasting the values to the shape of the interior
    # nodes makes every node contribute once, so the result agrees with the
    # loop-based original_integration instead of silently counting the
    # constant a single time.
    interior_values = np.broadcast_to(f(interior), interior.shape)
    result = 0.5 * (f(a) + f(b)) + np.sum(interior_values)
    result *= h
    return result

# Function to integrate: f(x) = 8x^12 + 11x^10 - 12x^8 + 3
def f(x):
    """Evaluate the polynomial 8x^12 + 11x^10 - 12x^8 + 3 at x."""
    leading = 8 * x**12
    middle = 11 * x**10
    trailing = 12 * x**8
    # Terms are combined left to right in the same order as the formula.
    return leading + middle - trailing + 3

# Interval [a, b] and number of subintervals
a, b = -15, 17
n = 1000000  # Adjust the number of subintervals

# Durations are measured with time.perf_counter(): it is a monotonic,
# high-resolution clock intended for timing short intervals, whereas
# time.time() follows the system wall clock, which can be adjusted while
# the measurement is running.

# Timing the original integration
start_time = time.perf_counter()
result_original = original_integration(f, a, b, n)
end_time = time.perf_counter()
time_original = end_time - start_time

# Timing the vectorized integration
start_time = time.perf_counter()
result_vectorized = vectorized_integration(f, a, b, n)
end_time = time.perf_counter()
time_vectorized = end_time - start_time

# Display results and timings
print("Original Result:", result_original)
print("Vectorized Result:", result_vectorized)
print("Time taken (Original): {:.6f} seconds".format(time_original))
print("Time taken (Vectorized): {:.6f} seconds".format(time_vectorized))


Original Result: 7335495724470886.0
Vectorized Result: 7335495724471518.0
Time taken (Original): 0.583538 seconds
Time taken (Vectorized): 0.156907 seconds


In [39]:
import numpy as np

# a) Use numpy.genfromtxt to read the file into a 2-dimensional numpy array
file_path = './data/prob4_AB0208.csv'
# The values are read as strings so the time column can later hold formatted
# dates; atleast_2d keeps the array 2-dimensional even for a single data row.
data = np.atleast_2d(np.genfromtxt(file_path, delimiter=',', dtype=str, skip_header=1))

# Display the original data
print("Original Data:")
print(data)

# b) Use Boolean masking to drop the rows that contain nan entries
data = data[~np.isnan(data.astype(float)).any(axis=1)]

# Display the data after dropping rows with nan entries
print("\nData after dropping rows with nan entries:")
print(data)

# c) Convert the time entries into a human-readable format (e.g., YYYY-MM-DD HH:mm:ss)
# In the printed data the values that look like Unix epoch seconds
# (e.g. '1672790400.000000') sit in column 3, not column 0.
# NOTE(review): the header row is skipped, so confirm the index against the CSV header.
time_column_index = 3

def convert_to_datetime(x):
    """Convert one time entry to a numpy datetime64 with second precision.

    Numeric input (a number or numeric text such as '1672790400.000000') is
    interpreted as seconds since the Unix epoch. Anything else is handed to
    numpy's own date parser. Values that cannot be interpreted either way
    (including nan) yield NaT.
    """
    try:
        # int() drops the fractional part; nan / inf raise and fall through.
        return np.datetime64(int(float(x)), 's')
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        return np.datetime64(x, 's')
    except (TypeError, ValueError):
        return np.datetime64('NaT')

# str() of a datetime64 gives 'YYYY-MM-DDTHH:MM:SS'; swap the 'T' for a space.
readable_times = np.array(
    [str(convert_to_datetime(entry)).replace('T', ' ') for entry in data[:, time_column_index]],
    dtype=str,
)
# The string array has a fixed item width, so assigning the longer date text
# into the existing column could truncate it. np.where builds a new array
# whose width fits both the original entries and the formatted dates.
is_time_column = np.arange(data.shape[1]) == time_column_index
data = np.where(is_time_column, readable_times[:, np.newaxis], data)

# Display the data after converting time entries
print("\nData after converting time entries:")
print(data)

# d) Add a new row that contains the averages of the numeric columns, skipping the time column
# Rows containing nan were already dropped in b), so a plain mean is sufficient.
numeric_columns = [col for col in range(data.shape[1]) if col != time_column_index]
numeric_data = data[:, numeric_columns].astype(float)
averages = np.full(data.shape[1], np.nan)  # nan stays in the time column's slot
averages[numeric_columns] = numeric_data.mean(axis=0)

# Display the averages
print("\nAverages of numeric columns:")
print(averages)

# Create a new row with the averages
# The averages are converted to text explicitly instead of relying on numpy
# to promote floats to strings while stacking.
data_with_averages = np.vstack([data, averages.astype(str)])

# Display the data with the new row of averages
print("\nData with the new row of averages:")
print(data_with_averages)


Original Data:
[['268.424561' '0.868437' '0.000000' '1672790400.000000' '0.300850'
  '-2.382514']
 ['267.929047' '0.869720' '0.039103' '1672794000.000000' '0.152142'
  '-2.623582']
 ['267.227448' '0.872451' '0.069010' 'nan' '-0.037141' '-2.734714']
 ...
 ['270.075745' '0.896504' '0.000000' '1672693200.000000' '-0.731940'
  '-0.121091']
 ['270.487732' '0.877308' '0.000651' '1672696800.000000' '-1.110839'
  '-0.406309']
 ['270.797119' '0.862045' '0.013021' '1672700400.000000' '-1.196100'
  '-0.412851']]

Data after dropping rows with nan entries:
[['268.424561' '0.868437' '0.000000' '1672790400.000000' '0.300850'
  '-2.382514']
 ['267.929047' '0.869720' '0.039103' '1672794000.000000' '0.152142'
  '-2.623582']
 ['266.758057' '0.873738' '0.095133' '1672801200.000000' '0.443858'
  '-2.607769']
 ...
 ['270.075745' '0.896504' '0.000000' '1672693200.000000' '-0.731940'
  '-0.121091']
 ['270.487732' '0.877308' '0.000651' '1672696800.000000' '-1.110839'
  '-0.406309']
 ['270.797119' '0.862045' '

In [49]:

##### imports #####

import requests
import json
import sys

# deepcopy gives each query its own copy of the nested template dicts/lists,
# so filling in one query never mutates the shared templates
from copy import deepcopy

##### config #####

english = True
# english = False


##### helpers #####


# notebook replacement of sys.exit()
# call with raise StopExecution
class StopExecution(Exception):
    """Exception raised to stop the current cell, used here instead of sys.exit().

    Raise it with ``raise StopExecution``.
    """

    def _render_traceback_(self):
        # Deliberately produces nothing; presumably the notebook front end
        # consults this hook when it would otherwise print a traceback
        # (TODO confirm against the IPython documentation).
        return None

# Skeleton of a complete query: "query" receives one item per requested
# variable, and the response is requested in JSON format.
query_template = dict(
    query=[],
    response=dict(format="json"),
)

# Skeleton of a single query item: "code" names the variable and
# "values" receives the list of value strings selected for it.
query_item_template = dict(
    code="",
    selection=dict(filter="item", values=[]),
)


##### main #####


with requests.Session() as session:

    # Upper bound (in seconds) on how long each request may wait for the
    # server. requests applies no timeout by default, so a stalled connection
    # would otherwise block the notebook indefinitely.
    request_timeout = 30

    # Step 1: some browsing in order to find the correct database.
    # You can do this with a browser too (but translation may become an issue).

    lang_id = 'en' if english else 'fi'
    # Every URL below is derived from the same API root so that the database
    # requests and the configuration request go to the same host.
    api_root = f'https://pxdata.stat.fi/PXWeb/api/v1/{lang_id}'
    base_url = f'{api_root}/StatFin'
    # response = session.get(base_url, timeout=request_timeout)
    #
    # for item in response.json():
    #     print(item['id'], item['text'])
    #
    # # stop execution
    # raise StopExecution

    # Step 2: append the id of the item you are interested in to the URL
    # (EDIT the id below).

    catalogue_url = f'{base_url}/khi'
    # response = session.get(catalogue_url, timeout=request_timeout)

    # Check what .px files are available in the "catalogue".
    # for item in response.json():
    #     print(item['id'], item['text'])
    #
    # # stop execution
    # raise StopExecution

    # Step 3: once you decide which .px file interests you,
    # EDIT it below in order to fetch the available data headers.

    headers_url = f'{base_url}/khi/statfin_khi_pxt_11xc.px'
    response = session.get(headers_url, timeout=request_timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decoding error.
    response.raise_for_status()

    myjson = response.json()
    # print()
    # print('variables:', len(myjson['variables']))
    # print()
    # for var in myjson['variables']:
    #     print(var['text'])
    # print()

    # if english:
    #     tmp_url = headers_url.replace('/en/','/fi/')
    #     response = session.get(tmp_url, timeout=request_timeout)
    #     myjson = response.json()
    #     print()
    #     print('the corresponding variables in finnish (needed in the actual query):')
    #     print()
    #     for var in myjson['variables']:
    #         print(var['text'])
    #     print()
    #
    # # stop execution
    # raise StopExecution

    # Step 4: build the actual query for the data.
    # First, fetch the maximum number of values that one request may download
    # (taken from the API's configuration endpoint).
    response = session.get(f'{api_root}/?config', timeout=request_timeout)
    response.raise_for_status()
    maxvalues = response.json()['maxValues']

    # Query building (nothing is requested yet).
    # Please edit only the "for myvar" line.
    # Look-up table from variable code to its list of available values.
    values_by_code = {v['code']: v['values'] for v in myjson['variables']}

    query = deepcopy(query_template)
    total_values = 1
    for myvar in ['Hyödyke', 'Vuosi', 'Tiedot']: # EDIT this line
        if myvar not in values_by_code:
            # Without this check an unknown code would yield an empty value
            # list, make total_values 0 and slip past the size check below.
            print(f'variable {myvar!r} not found; available codes: {sorted(values_by_code)}')
            raise StopExecution
        myvalues = values_by_code[myvar]
        total_values = total_values * len(myvalues)
        query_item = deepcopy(query_item_template)
        query_item['code'] = myvar
        query_item['selection']['values'] = myvalues
        query['query'].append(query_item)
    if total_values > maxvalues:
        print('your query is too big, try again with fewer variables')
        raise StopExecution

    # Step 5: obtain the actual data with a "post" request.
    # That is like submitting a web form and cannot be done by browsing anymore.
    response = session.post(headers_url, json=query, timeout=request_timeout)
    response.raise_for_status()

    # Finally, dump the data to a file.
    myjson = response.json()
    with open('test.json', 'w') as handle:
        json.dump(myjson, handle, indent=4)

