In [184]:
import netCDF4 as nc
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split


In [185]:
# Attempting to load the netCDF file again
try:
    dataset = nc.Dataset('AR_TS_MO_BSO3B.nc')
    # Extract all available columns (variables)
    variables = list(dataset.variables.keys())
except Exception as e:
    variables = str(e)

variables


['TIME',
 'TIME_QC',
 'LATITUDE',
 'LONGITUDE',
 'POSITION_QC',
 'DEPH',
 'DEPH_QC',
 'TEMP',
 'PSAL',
 'PRES',
 'HCDT',
 'SVEL',
 'NSCT',
 'EWCT',
 'TEMP_QC',
 'PSAL_QC',
 'PRES_QC',
 'HCDT_QC',
 'SVEL_QC',
 'NSCT_QC',
 'EWCT_QC']

In [186]:
variables.index('TIME')

0

In [188]:
# Function to extract values from a specific variable in the netCDF dataset
def extract_variable_values(nc_file, variable_name):
    # Load the dataset
    dataset = nc.Dataset(nc_file)
    
    # Extract values of the specified variable
    values = None
    if variable_name in dataset.variables:
        values = dataset.variables[variable_name][:]
    
    return values


In [189]:


def extract_data_to_dataframe(file_name, variables):
    full = []

    for var in variables:
        psal_values = extract_variable_values(file_name, var).data
        full.append(np.array(psal_values[:]))

    df = pd.DataFrame(full, variables)
    df = df.T
    return df


In [190]:
def convert_time_to_year_week(df, time_column_name, reference_day):
    # Define the reference date
    reference_date = datetime.strptime(reference_day, '%B %Y')

    # Convert the time values to datetime objects
    df['Datetime'] = reference_date + df[time_column_name].apply(lambda x: timedelta(days=x))

    # Extract the year and week from the datetime objects
    df['Year'] = df['Datetime'].dt.year
    df['Week'] = df['Datetime'].dt.strftime('%U')
    df['Week'] = df['Week'].astype(int)

    # Drop the intermediate 'Datetime' column
    #df.drop(columns=['Datetime'], inplace=True)

    return df

In [191]:
def extract_first_two(lst):
    if len(lst) >= 2:
        return lst[:2]
    else:
        return lst


In [192]:
# Attempt to extract values from the  variables into list
'''full=[]
files=['AR_TS_MO_BSO1.nc','AR_TS_MO_BSO2.nc','AR_TS_MO_BSO3.nc','AR_TS_MO_BSO4.nc','AR_TS_MO_BSO5.nc','AR_TS_MO_BSO2B.nc','AR_TS_MO_BSO3B.nc']
mark=0
for doc in files:
    for var in variables:
        
        if mark<len(variables):
            psal_values = extract_variable_values(doc, var).data
            full.append(list(psal_values[:]))
            mark=mark+1  
        else:
            psal_values = extract_variable_values(doc, var).data
            print(full[variables.index(var)])
            print(extract_variable_values(doc, var))
            print('h')
            full[variables.index(var)].append(psal_values)'''

"full=[]\nfiles=['AR_TS_MO_BSO1.nc','AR_TS_MO_BSO2.nc','AR_TS_MO_BSO3.nc','AR_TS_MO_BSO4.nc','AR_TS_MO_BSO5.nc','AR_TS_MO_BSO2B.nc','AR_TS_MO_BSO3B.nc']\nmark=0\nfor doc in files:\n    for var in variables:\n        \n        if mark<len(variables):\n            psal_values = extract_variable_values(doc, var).data\n            full.append(list(psal_values[:]))\n            mark=mark+1  \n        else:\n            psal_values = extract_variable_values(doc, var).data\n            print(full[variables.index(var)])\n            print(extract_variable_values(doc, var))\n            print('h')\n            full[variables.index(var)].append(psal_values)"

In [193]:
df1=extract_data_to_dataframe('AR_TS_MO_BSO1.nc',variables)
df2=extract_data_to_dataframe('AR_TS_MO_BSO2.nc',variables)
df3=extract_data_to_dataframe('AR_TS_MO_BSO3.nc',variables)
df4=extract_data_to_dataframe('AR_TS_MO_BSO4.nc',variables)
df5=extract_data_to_dataframe('AR_TS_MO_BSO5.nc',variables)
df6=extract_data_to_dataframe('AR_TS_MO_BSO2B.nc',variables)
df7=extract_data_to_dataframe('AR_TS_MO_BSO3B.nc',variables)


In [194]:
df_main= pd.concat([df1,df2,df3,df4,df5,df6,df7], ignore_index=True)

In [195]:
df_main

Unnamed: 0,TIME,TIME_QC,LATITUDE,LONGITUDE,POSITION_QC,DEPH,DEPH_QC,TEMP,PSAL,PRES,...,SVEL,NSCT,EWCT,TEMP_QC,PSAL_QC,PRES_QC,HCDT_QC,SVEL_QC,NSCT_QC,EWCT_QC
0,17437.215278,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.68, 6.04, 5.84]","[34.909, 35.294000000000004, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.11, 0.159, 0.116]","[-0.058, -0.126, -0.112]","[-0.093, -0.097, -0.027]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
1,17437.229167,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.43, 6.04, 5.84]","[34.923, 35.212, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.15, 0.11900000000000001, 0.116]","[0.035, -0.084, -0.116]","[-0.146, -0.083, 0.005]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
2,17437.243056,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.23, 6.08, 5.84]","[34.97, 35.247, 35.064]","[0.5082, 1.4104, 0.0]",...,"[0.052000000000000005, 0.069, 0.113]","[-0.018000000000000002, 0.009000000000000001, ...","[-0.048, -0.069, -0.02]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
3,17437.256944,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.23, 6.08, 5.84]","[34.891, 35.247, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.034, 0.034, 0.098]","[0.034, 0.034, -0.098]","[0.003, -0.007, -0.009000000000000001]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
4,17437.270833,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.2, 6.04, 5.84]","[34.914, 35.212, 35.064]","[0.5082, 1.4177, 0.0]",...,"[0.028, 0.031, 0.084]","[0.015, 0.031, -0.084]","[0.024, 0.006, -0.002]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238132,19060.194444,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.88, 3.67]","[35.147, 55.0]","[-0.0937, 0.0]",...,"[0.043000000000000003, 0.136]","[-0.005, 0.025]","[0.043000000000000003, 0.134]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2238133,19060.208333,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.9, 3.67]","[35.123, 55.0]","[-0.0937, 0.0]",...,"[0.046, 0.124]","[-0.002, 0.028]","[0.046, 0.121]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2238134,19060.222222,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.88, 3.67]","[35.147, 55.0]","[-0.0937, 0.0]",...,"[0.055, 0.124]","[0.013000000000000001, 0.021]","[0.053, 0.122]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2238135,19060.236111,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.8500000000000005, 3.67]","[35.171, 55.0]","[-0.0937, 0.0]",...,"[0.058, 0.121]","[0.002, 0.022]","[0.057, 0.11900000000000001]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"


In [196]:
df_main

Unnamed: 0,TIME,TIME_QC,LATITUDE,LONGITUDE,POSITION_QC,DEPH,DEPH_QC,TEMP,PSAL,PRES,...,SVEL,NSCT,EWCT,TEMP_QC,PSAL_QC,PRES_QC,HCDT_QC,SVEL_QC,NSCT_QC,EWCT_QC
0,17437.215278,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.68, 6.04, 5.84]","[34.909, 35.294000000000004, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.11, 0.159, 0.116]","[-0.058, -0.126, -0.112]","[-0.093, -0.097, -0.027]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
1,17437.229167,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.43, 6.04, 5.84]","[34.923, 35.212, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.15, 0.11900000000000001, 0.116]","[0.035, -0.084, -0.116]","[-0.146, -0.083, 0.005]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
2,17437.243056,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.23, 6.08, 5.84]","[34.97, 35.247, 35.064]","[0.5082, 1.4104, 0.0]",...,"[0.052000000000000005, 0.069, 0.113]","[-0.018000000000000002, 0.009000000000000001, ...","[-0.048, -0.069, -0.02]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
3,17437.256944,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.23, 6.08, 5.84]","[34.891, 35.247, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.034, 0.034, 0.098]","[0.034, 0.034, -0.098]","[0.003, -0.007, -0.009000000000000001]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
4,17437.270833,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.2, 6.04, 5.84]","[34.914, 35.212, 35.064]","[0.5082, 1.4177, 0.0]",...,"[0.028, 0.031, 0.084]","[0.015, 0.031, -0.084]","[0.024, 0.006, -0.002]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238132,19060.194444,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.88, 3.67]","[35.147, 55.0]","[-0.0937, 0.0]",...,"[0.043000000000000003, 0.136]","[-0.005, 0.025]","[0.043000000000000003, 0.134]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2238133,19060.208333,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.9, 3.67]","[35.123, 55.0]","[-0.0937, 0.0]",...,"[0.046, 0.124]","[-0.002, 0.028]","[0.046, 0.121]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2238134,19060.222222,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.88, 3.67]","[35.147, 55.0]","[-0.0937, 0.0]",...,"[0.055, 0.124]","[0.013000000000000001, 0.021]","[0.053, 0.122]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2238135,19060.236111,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.8500000000000005, 3.67]","[35.171, 55.0]","[-0.0937, 0.0]",...,"[0.058, 0.121]","[0.002, 0.022]","[0.057, 0.11900000000000001]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"


In [197]:


# Reference time earliest found date 
reference_time = 17398.750694

# Create the time_flattened column
df_main['TIME_flat_rf'] = df_main['TIME'] - reference_time

# Display the DataFrame
pd.DataFrame(df_main)


Unnamed: 0,TIME,TIME_QC,LATITUDE,LONGITUDE,POSITION_QC,DEPH,DEPH_QC,TEMP,PSAL,PRES,...,NSCT,EWCT,TEMP_QC,PSAL_QC,PRES_QC,HCDT_QC,SVEL_QC,NSCT_QC,EWCT_QC,TIME_flat_rf
0,17437.215278,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.68, 6.04, 5.84]","[34.909, 35.294000000000004, 35.064]","[0.5044, 1.4104, 0.0]",...,"[-0.058, -0.126, -0.112]","[-0.093, -0.097, -0.027]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]",38.464584
1,17437.229167,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.43, 6.04, 5.84]","[34.923, 35.212, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.035, -0.084, -0.116]","[-0.146, -0.083, 0.005]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]",38.478473
2,17437.243056,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.23, 6.08, 5.84]","[34.97, 35.247, 35.064]","[0.5082, 1.4104, 0.0]",...,"[-0.018000000000000002, 0.009000000000000001, ...","[-0.048, -0.069, -0.02]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]",38.492362
3,17437.256944,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.23, 6.08, 5.84]","[34.891, 35.247, 35.064]","[0.5044, 1.4104, 0.0]",...,"[0.034, 0.034, -0.098]","[0.003, -0.007, -0.009000000000000001]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]",38.50625
4,17437.270833,1,71.50528,19.706944,7,"[53.0, 126.0, 212.0]","[7, 7, 7]","[7.2, 6.04, 5.84]","[34.914, 35.212, 35.064]","[0.5082, 1.4177, 0.0]",...,"[0.015, 0.031, -0.084]","[0.024, 0.006, -0.002]","[1, 1, 1]","[1, 1, 1]","[1, 4, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]",38.520139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238132,19060.194444,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.88, 3.67]","[35.147, 55.0]","[-0.0937, 0.0]",...,"[-0.005, 0.025]","[0.043000000000000003, 0.134]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]",1661.44375
2238133,19060.208333,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.9, 3.67]","[35.123, 55.0]","[-0.0937, 0.0]",...,"[-0.002, 0.028]","[0.046, 0.121]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]",1661.457639
2238134,19060.222222,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.88, 3.67]","[35.147, 55.0]","[-0.0937, 0.0]",...,"[0.013000000000000001, 0.021]","[0.053, 0.122]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]",1661.471528
2238135,19060.236111,1,72.747223,19.489166,7,"[123.0, 383.0]","[7, 7]","[4.8500000000000005, 3.67]","[35.171, 55.0]","[-0.0937, 0.0]",...,"[0.002, 0.022]","[0.057, 0.11900000000000001]","[1, 1]","[1, 4]","[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]",1661.485417


In [198]:
print( datetime.strptime('August 1997', '%B %Y')+timedelta(days=0.1))

1997-08-01 02:24:00


In [199]:
print( datetime.strptime('August 1997', '%B %Y'))

1997-08-01 00:00:00


In [200]:

reference_day = 'August 1997'

result_df = convert_time_to_year_week(df_main, 'TIME_flat_rf', reference_day)



In [201]:
Standard_df=pd.DataFrame(result_df)

In [202]:
df_sorted = Standard_df.sort_values(by='TIME')

In [203]:
columns_to_drop = ['TIME', 'TIME_QC','TIME_flat_rf',
 'POSITION_QC',
 'DEPH_QC',
 'HCDT',
 'SVEL',
 'NSCT',
 'EWCT',
 'TEMP_QC',
 'PSAL_QC',
 'PRES_QC',
 'HCDT_QC',
 'SVEL_QC',
 'NSCT_QC',
 'EWCT_QC']

# Drop the specified columns
df_sorted.drop(columns=columns_to_drop, inplace=True)

In [204]:
df_sorted

Unnamed: 0,LATITUDE,LONGITUDE,DEPH,TEMP,PSAL,PRES,Datetime,Year,Week
136659,71.980278,19.653334,"[53.0, 126.0, 225.0, 294.0, 9.96921e+36]","[6.72, 6.46, 5.55, 4.49, -2147483647.0]","[34.92, 35.01, 35.216, 35.044000000000004, -21...","[0.5459, 2.1351, 0.0, 0.0, 9.96921e+36]",1997-08-01 00:00:00.038400,1997,30
136660,71.980278,19.653334,"[53.0, 126.0, 225.0, 294.0, 9.96921e+36]","[6.65, 6.46, 5.62, 4.5200000000000005, -214748...","[34.989, 35.092, 35.228, 35.106, -2147483647.0]","[0.5459, 2.1351, 0.0, 0.0, 9.96921e+36]",1997-08-01 00:20:00.038400,1997,30
136661,71.980278,19.653334,"[53.0, 126.0, 225.0, 294.0, 9.96921e+36]","[6.63, 6.46, 5.65, 4.5200000000000005, -214748...","[35.012, 35.01, 35.204, 35.021, -2147483647.0]","[0.5459, 2.1351, 0.0, 0.0, 9.96921e+36]",1997-08-01 00:40:00.038400,1997,30
136662,71.980278,19.653334,"[53.0, 126.0, 225.0, 294.0, 9.96921e+36]","[6.65, 6.48, 5.62, 4.54, -2147483647.0]","[34.989, 35.068, 35.228, 35.082, -2147483647.0]","[0.5423, 2.1351, 0.0, 0.0, 9.96921e+36]",1997-08-01 01:00:00.038400,1997,30
136663,71.980278,19.653334,"[53.0, 126.0, 225.0, 294.0, 9.96921e+36]","[6.65, 6.51, 5.74, 4.5200000000000005, -214748...","[34.989, 34.964, 35.192, 35.106, -2147483647.0]","[0.5423, 2.1351, 0.0, 0.0, 9.96921e+36]",1997-08-01 01:20:00.038400,1997,30
...,...,...,...,...,...,...,...,...,...
2133247,73.5,19.346666,"[58.0, 464.0, 9.96921e+36, 9.96921e+36]","[5.41, 3.5700000000000003, -2147483647.0, -214...","[0.0, 0.0, -2147483647.0, -2147483647.0]","[0.5549, 489.0778, 9.96921e+36, 9.96921e+36]",2017-03-13 16:58:00.038400,2017,11
2133248,73.5,19.346666,"[58.0, 464.0, 9.96921e+36, 9.96921e+36]","[5.39, 3.5500000000000003, -2147483647.0, -214...","[0.0, 0.0, -2147483647.0, -2147483647.0]","[0.5549, 489.0778, 9.96921e+36, 9.96921e+36]",2017-03-13 17:18:00.038400,2017,11
2133249,73.5,19.346666,"[58.0, 464.0, 9.96921e+36, 9.96921e+36]","[5.39, 3.5700000000000003, -2147483647.0, -214...","[0.0, 0.0, -2147483647.0, -2147483647.0]","[0.5585, 489.0778, 9.96921e+36, 9.96921e+36]",2017-03-13 17:38:00.038400,2017,11
2133250,73.5,19.346666,"[58.0, 464.0, 9.96921e+36, 9.96921e+36]","[5.39, 3.5500000000000003, -2147483647.0, -214...","[0.0, 0.0, -2147483647.0, -2147483647.0]","[0.5585, 489.0778, 9.96921e+36, 9.96921e+36]",2017-03-13 17:58:00.038400,2017,11


In [205]:

# Apply the function to the specified columns to reduce cell length
columns_to_process = ['DEPH', 'TEMP', 'PSAL', 'PRES']
for column in columns_to_process:
    df_sorted[column] = df_sorted[column].apply(extract_first_two)

In [206]:
df_sorted

Unnamed: 0,LATITUDE,LONGITUDE,DEPH,TEMP,PSAL,PRES,Datetime,Year,Week
136659,71.980278,19.653334,"[53.0, 126.0]","[6.72, 6.46]","[34.92, 35.01]","[0.5459, 2.1351]",1997-08-01 00:00:00.038400,1997,30
136660,71.980278,19.653334,"[53.0, 126.0]","[6.65, 6.46]","[34.989, 35.092]","[0.5459, 2.1351]",1997-08-01 00:20:00.038400,1997,30
136661,71.980278,19.653334,"[53.0, 126.0]","[6.63, 6.46]","[35.012, 35.01]","[0.5459, 2.1351]",1997-08-01 00:40:00.038400,1997,30
136662,71.980278,19.653334,"[53.0, 126.0]","[6.65, 6.48]","[34.989, 35.068]","[0.5423, 2.1351]",1997-08-01 01:00:00.038400,1997,30
136663,71.980278,19.653334,"[53.0, 126.0]","[6.65, 6.51]","[34.989, 34.964]","[0.5423, 2.1351]",1997-08-01 01:20:00.038400,1997,30
...,...,...,...,...,...,...,...,...,...
2133247,73.5,19.346666,"[58.0, 464.0]","[5.41, 3.5700000000000003]","[0.0, 0.0]","[0.5549, 489.0778]",2017-03-13 16:58:00.038400,2017,11
2133248,73.5,19.346666,"[58.0, 464.0]","[5.39, 3.5500000000000003]","[0.0, 0.0]","[0.5549, 489.0778]",2017-03-13 17:18:00.038400,2017,11
2133249,73.5,19.346666,"[58.0, 464.0]","[5.39, 3.5700000000000003]","[0.0, 0.0]","[0.5585, 489.0778]",2017-03-13 17:38:00.038400,2017,11
2133250,73.5,19.346666,"[58.0, 464.0]","[5.39, 3.5500000000000003]","[0.0, 0.0]","[0.5585, 489.0778]",2017-03-13 17:58:00.038400,2017,11


In [207]:

df_sorted = df_sorted.rename(columns={'LATITUDE': 'Lat', 'LONGITUDE': 'Lon', 'TEMP': 'Sea Temperature','DEPH': 'Depth'})	


In [208]:
df_sorted

Unnamed: 0,Lat,Lon,Depth,Sea Temperature,PSAL,PRES,Datetime,Year,Week
136659,71.980278,19.653334,"[53.0, 126.0]","[6.72, 6.46]","[34.92, 35.01]","[0.5459, 2.1351]",1997-08-01 00:00:00.038400,1997,30
136660,71.980278,19.653334,"[53.0, 126.0]","[6.65, 6.46]","[34.989, 35.092]","[0.5459, 2.1351]",1997-08-01 00:20:00.038400,1997,30
136661,71.980278,19.653334,"[53.0, 126.0]","[6.63, 6.46]","[35.012, 35.01]","[0.5459, 2.1351]",1997-08-01 00:40:00.038400,1997,30
136662,71.980278,19.653334,"[53.0, 126.0]","[6.65, 6.48]","[34.989, 35.068]","[0.5423, 2.1351]",1997-08-01 01:00:00.038400,1997,30
136663,71.980278,19.653334,"[53.0, 126.0]","[6.65, 6.51]","[34.989, 34.964]","[0.5423, 2.1351]",1997-08-01 01:20:00.038400,1997,30
...,...,...,...,...,...,...,...,...,...
2133247,73.5,19.346666,"[58.0, 464.0]","[5.41, 3.5700000000000003]","[0.0, 0.0]","[0.5549, 489.0778]",2017-03-13 16:58:00.038400,2017,11
2133248,73.5,19.346666,"[58.0, 464.0]","[5.39, 3.5500000000000003]","[0.0, 0.0]","[0.5549, 489.0778]",2017-03-13 17:18:00.038400,2017,11
2133249,73.5,19.346666,"[58.0, 464.0]","[5.39, 3.5700000000000003]","[0.0, 0.0]","[0.5585, 489.0778]",2017-03-13 17:38:00.038400,2017,11
2133250,73.5,19.346666,"[58.0, 464.0]","[5.39, 3.5500000000000003]","[0.0, 0.0]","[0.5585, 489.0778]",2017-03-13 17:58:00.038400,2017,11


In [209]:
# Split the 'temperature' column into two separate columns
df_sorted[['Temperature', 'Temperature2']] = df_sorted['Sea Temperature'].apply(pd.Series)
df_sorted[['Salinity', 'Salinity2']] = df_sorted['PSAL'].apply(pd.Series)



# Drop the original 'temperature' column if needed
df_sorted.drop(columns=['Sea Temperature'], inplace=True)
df_sorted.drop(columns=['PSAL'], inplace=True)

In [210]:
df_sorted

Unnamed: 0,Lat,Lon,Depth,PRES,Datetime,Year,Week,Temperature,Temperature2,Salinity,Salinity2
136659,71.980278,19.653334,"[53.0, 126.0]","[0.5459, 2.1351]",1997-08-01 00:00:00.038400,1997,30,6.72,6.46,34.920,35.010
136660,71.980278,19.653334,"[53.0, 126.0]","[0.5459, 2.1351]",1997-08-01 00:20:00.038400,1997,30,6.65,6.46,34.989,35.092
136661,71.980278,19.653334,"[53.0, 126.0]","[0.5459, 2.1351]",1997-08-01 00:40:00.038400,1997,30,6.63,6.46,35.012,35.010
136662,71.980278,19.653334,"[53.0, 126.0]","[0.5423, 2.1351]",1997-08-01 01:00:00.038400,1997,30,6.65,6.48,34.989,35.068
136663,71.980278,19.653334,"[53.0, 126.0]","[0.5423, 2.1351]",1997-08-01 01:20:00.038400,1997,30,6.65,6.51,34.989,34.964
...,...,...,...,...,...,...,...,...,...,...,...
2133247,73.5,19.346666,"[58.0, 464.0]","[0.5549, 489.0778]",2017-03-13 16:58:00.038400,2017,11,5.41,3.57,0.000,0.000
2133248,73.5,19.346666,"[58.0, 464.0]","[0.5549, 489.0778]",2017-03-13 17:18:00.038400,2017,11,5.39,3.55,0.000,0.000
2133249,73.5,19.346666,"[58.0, 464.0]","[0.5585, 489.0778]",2017-03-13 17:38:00.038400,2017,11,5.39,3.57,0.000,0.000
2133250,73.5,19.346666,"[58.0, 464.0]","[0.5585, 489.0778]",2017-03-13 17:58:00.038400,2017,11,5.39,3.55,0.000,0.000


In [211]:
#Barens watch is only between 2012 upward to
df_sorted_2012_2017=df_sorted[(df_sorted['Year'] >= 2012) & (df_sorted['Year'] <= 2017)]

In [212]:
y=df_sorted_2012_2017.iloc[:, 9:10]
X= pd.concat([df_sorted_2012_2017.iloc[:, :9], df_sorted_2012_2017.iloc[:, 10:]], axis=1)

In [213]:
y

Unnamed: 0,Salinity
2021100,283.000
1046866,34.537
537907,35.040
2021101,282.000
1046867,34.550
...,...
2133247,0.000
2133248,0.000
2133249,0.000
2133250,0.000


In [214]:
#Split train test data in 0.75 to 0.25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [256]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
# Create polynomial features (you can adjust the degree)
#poly = PolynomialFeatures(degree=2)
# Combine latitude, longitude, and temperature into a feature matrix
X_train_needed = X_train[['Lat', 'Lon', 'Temperature','Year','Week']]
X_test_needed=X_test[['Lat', 'Lon', 'Temperature','Year','Week']]
#X_poly = poly.fit_transform(X_train)


# Fit a multivariable decison tree regression model
model = DecisionTreeRegressor()
model.fit(X_train_needed, y_train)

DecisionTreeRegressor()

In [257]:
# Predict salinity for dataset2 based on latitude, longitude, and temperature
predicted_salinity_train_data = model.predict(X_train_needed)
predicted_salinity_test_data= model.predict(X_test_needed)


In [263]:
model2 = LinearRegression()
model2.fit(X_train_needed, y_train)

LinearRegression()

In [264]:
# Predict salinity for dataset2 based on latitude, longitude, and temperature
predicted_salinity_train_data2 = model2.predict(X_train_needed)
predicted_salinity_test_data2= model2.predict(X_test_needed)

In [266]:
def CalcAccuracy_Regression_dt(Actual_Y, Predicted_y):
    error= abs(Actual_Y.to_numpy().flatten()-Predicted_y)
    mse=np.mean(np.square(error))
    rel_err=mse/np.mean(np.square(Actual_Y.to_numpy()))
    accuracy=1-rel_err
    return(accuracy)

In [267]:
def CalcAccuracy_Linear_Regression(Actual_Y, Predicted_y):
    error= abs(Actual_Y-Predicted_y)
    mse=np.mean(np.square(error))
    rel_err=mse/np.mean(np.square(Actual_Y.to_numpy()))
    accuracy=1-rel_err
    return(accuracy)

In [261]:
print("Train data accuracy")
CalcAccuracy_Regression_dt(y_train, predicted_salinity_train_data)

Train data accuracy


0.999929713741398

In [262]:
print("Test data accuracy")
CalcAccuracy_Regression_dt(y_test, predicted_salinity_test_data)

Test data accuracy


0.9998709077374296

In [268]:
print("Train data accuracy")
CalcAccuracy_Linear_Regression(y_train, predicted_salinity_train_data2)

Train data accuracy


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Salinity    0.292605
dtype: float64

In [269]:
print("Test data accuracy")
CalcAccuracy_Linear_Regression(y_test, predicted_salinity_test_data2)

Test data accuracy


Salinity    0.293512
dtype: float64

In [None]:

# Close the NetCDF file when done
dataset.close()

# Now you have a Pandas DataFrame containing the entire NetCDF file's data

In [None]:
df_sorted_2012_2017.to_csv('salinity_temp_press_2012_2017.csv', index=False)