In [None]:
import pandas as pd

def csv_reader(filename, elements):
    """
    Fetches data from met_data.csv
    filename: Place of file
    elements: List of wanted elements
    return: Dict with filtered data
    """
    df = pd.read_csv(filename, header=None, names=['station', 'timestamp', 'measurements'])
    
    records = []
    
    for _, row in df.iterrows():
        timestamp = row['timestamp']
        try:
            measurements = eval(row['measurements']) 
        except Exception as e:
            print(f"Feil ved eval() for rad {row}: {e}")
            continue
        
        for measurement in measurements:
            element_id = measurement['elementId']
            if element_id in elements:
                records.append({
                    'timestamp': timestamp,
                    'element': element_id,
                    'value': measurement['value'],
                    'unit': measurement['unit']
                })
    
    result_df = pd.DataFrame(records)
    
    if result_df.empty:
        return {}
    
    result = {}
    for element in elements:
        filtered_df = result_df[result_df['element'] == element]
        result[element] = {
            row['timestamp']: {'value': row['value'], 'unit': row['unit']}
            for _, row in filtered_df.iterrows()
        }
    
    return result



{'sum(duration_of_sunshine P1D)': {'2015-01-01T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-02T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-03T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-04T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-05T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-06T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-07T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-08T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-09T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-10T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-11T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-12T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-13T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-14T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-15T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-16T00:00:00.000Z': {'value': 0.0, 'unit':

In [14]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
import ast

def csv_reader(filename, elements):
    
    
    # Les CSV med korrekt format
    df = pd.read_csv(filename, header=None, names=['element_id', 'value','unit', 'timeOffset', 'timeResolution', 'timeSeriesId', 'performanceCategory', 'qualityCode', '..', 'station', 'referenceTimestamp'])
    

    
    # Sjekk at nødvendige kolonner finnes
    if not {'element_id', 'value','unit', 'timeOffset', 'timeResolution', 'timeSeriesId', 'performanceCategory', 'qualityCode', '..', 'station', 'referenceTimestamp'}.issubset(df.columns):
        raise ValueError("CSV-filen har ikke forventede kolonner: 'datatype', 'value','unit', 'timeOffset', 'timeResolution', 'timeSeriesId', 'performanceCategory', 'qualityCode', '..', 'station', 'referenceTimestamp'")
    
    records = []
    
    for _, row in df.iterrows():
        timestamp = row['referenceTimestamp']
        try:
            measurements = ast.literal_eval(row['value', 'unit'])  # Sikrere enn eval
        except Exception as e:
            print(f"Feil ved parsing av målinger for rad {row}: {e}")
            continue

        for measurement in measurements:
            element_id = measurement['elementId']
            if element_id in elements:
                records.append({
                    'referenceTimestamp': timestamp,
                    'datatype': element_id,
                    'value': measurement['value'],
                    'unit': measurement['unit']
                })
    
    result_df = pd.DataFrame(records)
    if result_df.empty:
        return {}
    # Konverter timestamp til datetime
    result_df['timestamp'] = pd.to_datetime(result_df['timestamp'], errors='coerce')
    result_df = result_df.dropna(subset=['timestamp'])  # Fjern rader med ugyldig tid
    
    # Fjern store avvik ved hjelp av Z-score
    result_df['value'] = result_df.groupby('element')['value'].transform(
        lambda x: x.where(np.abs(zscore(x)) < 3)
    )
    
    # Fyll inn manglende verdier ved hjelp av lineær regresjon
    for element in elements:
        subset = result_df[result_df['element'] == element].copy()
        subset = subset.sort_values('timestamp')
        
        # Konverter tid til numerisk verdi for regresjon
        subset['time_numeric'] = (subset['timestamp'] - subset['timestamp'].min()).dt.total_seconds()
        
        missing_mask = subset['value'].isna()
        if missing_mask.any() and not subset['value'].isna().all():
            reg = LinearRegression()
            known_x = subset.loc[~missing_mask, 'time_numeric'].values.reshape(-1, 1)
            known_y = subset.loc[~missing_mask, 'value'].values
            reg.fit(known_x, known_y)
            
            pred_x = subset.loc[missing_mask, 'time_numeric'].values.reshape(-1, 1)
            subset.loc[missing_mask, 'value'] = reg.predict(pred_x)
        
        result_df.update(subset)    
    # Konverter til ønsket format
    result = {}
    for element in elements:
        filtered_df = result_df[result_df['element'] == element]
        result[element] = {
            row['referenceTimestamp']: {'value': row['value'], 'unit': row['unit']}
            for _, row in filtered_df.iterrows()
        }
    
    return result




csv_reader('/Users/kristiansolberg1/Library/CloudStorage/OneDrive-NTNU/Anvendt programmering/Mappeprosjekt/miljodataAnalyse-1/data/met_data.csv', 'datatype')


Feil ved parsing av målinger for rad datatype               sum(precipitation_amount P1D)
value                                            0.2
unit                                              mm
timeOffset                                     PT18H
timeResolution                                   P1D
timeSeriesId                                       0
performanceCategory                                C
qualityCode                                        2
..                                               2.0
station                                    SN18700:0
referenceTimestamp          2015-01-01T00:00:00.000Z
Name: 0, dtype: object: 'key of type tuple not found and not a MultiIndex'
Feil ved parsing av målinger for rad datatype               sum(precipitation_amount P1D)
value                                            0.0
unit                                              mm
timeOffset                                      PT6H
timeResolution                                   P1D
tim

KeyboardInterrupt: 

In [3]:
df = pd.read_csv('/Users/kristiansolberg1/Library/CloudStorage/OneDrive-NTNU/Anvendt programmering/Mappeprosjekt/miljodataAnalyse-1/data/met_data.csv')
print(df.columns)


Index(['sum(precipitation_amount P1D)', '0.2', 'mm', 'PT18H', 'P1D', '0', 'C',
       '2', '2.0', 'SN18700:0', '2015-01-01T00:00:00.000Z'],
      dtype='object')
