In [None]:
import pandas as pd

def csv_reader(filename, elements):
    """
    Fetches data from met_data.csv
    filename: Place of file
    elements: List of wanted elements
    return: Dict with filtered data
    """
    df = pd.read_csv(filename, header=None, names=['station', 'timestamp', 'measurements'])
    
    records = []
    
    for _, row in df.iterrows():
        timestamp = row['timestamp']
        try:
            measurements = eval(row['measurements']) 
        except Exception as e:
            print(f"Feil ved eval() for rad {row}: {e}")
            continue
        
        for measurement in measurements:
            element_id = measurement['elementId']
            if element_id in elements:
                records.append({
                    'timestamp': timestamp,
                    'element': element_id,
                    'value': measurement['value'],
                    'unit': measurement['unit']
                })
    
    result_df = pd.DataFrame(records)
    
    if result_df.empty:
        return {}
    
    result = {}
    for element in elements:
        filtered_df = result_df[result_df['element'] == element]
        result[element] = {
            row['timestamp']: {'value': row['value'], 'unit': row['unit']}
            for _, row in filtered_df.iterrows()
        }
    
    return result



{'sum(duration_of_sunshine P1D)': {'2015-01-01T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-02T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-03T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-04T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-05T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-06T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-07T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-08T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-09T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-10T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-11T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-12T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-13T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-14T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-15T00:00:00.000Z': {'value': 0.0, 'unit': 'hours'}, '2015-01-16T00:00:00.000Z': {'value': 0.0, 'unit':

In [42]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
import ast

def csv_reader(filename, datatyper):
    
    
    # Les CSV med korrekt format
    df = pd.read_csv(filename, header=None, names=['datatype', 'value','unit', 'timeOffset', 'timeResolution', 'timeSeriesId', 'performanceCategory', 'qualityCode', '..', 'station', 'referenceTimestamp'])
    

    
    # Sjekk at nødvendige kolonner finnes
    if not {'datatype', 'value','unit', 'timeOffset', 'timeResolution', 'timeSeriesId', 'performanceCategory', 'qualityCode', '..', 'station', 'referenceTimestamp'}.issubset(df.columns):
        raise ValueError("CSV-filen har ikke forventede kolonner: 'datatype', 'value','unit', 'timeOffset', 'timeResolution', 'timeSeriesId', 'performanceCategory', 'qualityCode', '..', 'station', 'referenceTimestamp'")
    
    records = []
    
    for _, row in df.iterrows():
        datatype = row['datatype']
        value = row['value']
        unit = row['unit']
        station = row['station']
        timestamp = (row['referenceTimestamp'])[:10]
        
        records.append({
                    'referenceTimestamp': timestamp,
                    'datatype': datatype,
                    'value': value,
                    'unit': unit,
                    'station' : station
                })
    result_df = pd.DataFrame(records)
    if result_df.empty:
        return {}
    
    
    # Konverter timestamp til datetime og sørger for at alle verdier under 'value' er tall

    result_df['value'] = pd.to_numeric(result_df['value'], errors='coerce')
    result_df['referenceTimestamp'] = pd.to_datetime(result_df['referenceTimestamp'], errors='coerce')
    result_df = result_df.dropna(subset=['referenceTimestamp'])  # Fjern rader med ugyldig tid
        
        
    # Fjern store avvik ved hjelp av Z-score
    result_df['value'] = result_df.groupby('datatype')['value'].transform(
        lambda x: x.where(np.abs(zscore(x)) < 3)
    )
    
    
    # Fyll inn manglende verdier ved hjelp av lineær regresjon
    for datatype in datatyper:
        subset = result_df[result_df['datatype'] == datatype].copy()
        subset = subset.sort_values('referenceTimestamp')
        
        # Konverter tid til numerisk verdi for regresjon
        subset['time_numeric'] = (subset['referenceTimestamp'] - subset['referenceTimestamp'].min()).dt.total_seconds()
        
        missing_mask = subset['value'].isna()
        if missing_mask.any() and not subset['value'].isna().all():
            reg = LinearRegression()
            known_x = subset.loc[~missing_mask, 'time_numeric'].values.reshape(-1, 1)
            known_y = subset.loc[~missing_mask, 'value'].values
            reg.fit(known_x, known_y)
            
            pred_x = subset.loc[missing_mask, 'time_numeric'].values.reshape(-1, 1)
            subset.loc[missing_mask, 'value'] = reg.predict(pred_x)
        
        result_df.update(subset)  
        
    print(result_df)
    #print(result_df)  
    # Konverter til ønsket format
    result = {}
    for datatype in datatyper:
        filtered_df = result_df[result_df['datatype'] == datatype]
        result[datatype] = {
            row['referenceTimestamp']: {'value': row['value'], 'unit': row['unit']}
            for _, row in filtered_df.iterrows()
        }
        
    return result




print(csv_reader('/Users/kristiansolberg1/Library/CloudStorage/OneDrive-NTNU/Anvendt programmering/Mappeprosjekt/miljodataAnalyse-1/data/met_data.csv', ['sum(precipitation_amount P1D)']))


      referenceTimestamp                       datatype  value   unit  \
0             2015-01-01  sum(precipitation_amount P1D)    0.2     mm   
1             2015-01-01  sum(precipitation_amount P1D)    0.0     mm   
2             2015-01-01  sum(duration_of_sunshine P1D)    0.0  hours   
3             2015-01-02  sum(precipitation_amount P1D)    0.5     mm   
4             2015-01-02  sum(precipitation_amount P1D)    0.7     mm   
...                  ...                            ...    ...    ...   
21904         2024-12-30  sum(precipitation_amount P1D)    7.1     mm   
21905         2024-12-30  sum(duration_of_sunshine P1D)    0.0  hours   
21906         2024-12-31  sum(precipitation_amount P1D)    4.2     mm   
21907         2024-12-31  sum(precipitation_amount P1D)   14.9     mm   
21908         2024-12-31  sum(duration_of_sunshine P1D)    0.0  hours   

         station  
0      SN18700:0  
1      SN18700:0  
2      SN18700:0  
3      SN18700:0  
4      SN18700:0  
...      

In [3]:
df = pd.read_csv('/Users/kristiansolberg1/Library/CloudStorage/OneDrive-NTNU/Anvendt programmering/Mappeprosjekt/miljodataAnalyse-1/data/met_data.csv')
print(df.columns)


Index(['sum(precipitation_amount P1D)', '0.2', 'mm', 'PT18H', 'P1D', '0', 'C',
       '2', '2.0', 'SN18700:0', '2015-01-01T00:00:00.000Z'],
      dtype='object')
