# Machine Learning for Data Science - 2019-II
## Test 1 - Question 1

* Student: Luis Vasquez Espinoza
* Code: 20152231J

__________

**Q1:** Read the NetCDF file: sst1.cdf. Fill the missing temperatures by the
mean of the non-missing ones. Finally, serialize and deserialize the new –ready
to process– data structure. You can read the sst1.txt file for a perspective of the
previous file.

**Answer:**

In [18]:
# Libraries to be used
import pandas as pd
import xarray as xr
import numpy as np

# Generic loading function
def load_nautic_df(path):
    try:
        ds = xr.open_dataset(path)
    except:
        ds = xr.open_dataset(path, decode_times=False)
    df = ds.to_dataframe()
    return df

In [19]:
data_path_1 = 'data/sst1.cdf' # Relative path in my pc

In [20]:
sst1_df = load_nautic_df(data_path_1)

In [21]:
# Showing first values of dataframe
sst1_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,T_20,QT_5020,ST_6020
depth,lat,lon,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,-2.0,265.0,2018-01-01 12:00:00,21.726934,1.0,5.0
0.0,-2.0,265.0,2018-01-02 12:00:00,21.908794,1.0,5.0
0.0,-2.0,265.0,2018-01-03 12:00:00,21.714407,1.0,5.0
0.0,-2.0,265.0,2018-01-04 12:00:00,21.667702,1.0,5.0
0.0,-2.0,265.0,2018-01-05 12:00:00,21.641754,1.0,5.0


In [22]:
# Resetting indices for working
sst1_df = sst1_df.reset_index()
sst1_df

Unnamed: 0,depth,lat,lon,time,T_20,QT_5020,ST_6020
0,0.0,-2.0,265.0,2018-01-01 12:00:00,21.726934,1.0,5.0
1,0.0,-2.0,265.0,2018-01-02 12:00:00,21.908794,1.0,5.0
2,0.0,-2.0,265.0,2018-01-03 12:00:00,21.714407,1.0,5.0
3,0.0,-2.0,265.0,2018-01-04 12:00:00,21.667702,1.0,5.0
4,0.0,-2.0,265.0,2018-01-05 12:00:00,21.641754,1.0,5.0
...,...,...,...,...,...,...,...
309,0.0,-2.0,265.0,2018-11-06 12:00:00,23.252274,2.0,1.0
310,0.0,-2.0,265.0,2018-11-07 12:00:00,23.307045,2.0,1.0
311,0.0,-2.0,265.0,2018-11-08 12:00:00,23.375366,2.0,1.0
312,0.0,-2.0,265.0,2018-11-09 12:00:00,23.396191,2.0,1.0


In [23]:
# Checking for useless values in temperature feature
print("----- Direct search for None and Nan values -----")
print("Null values un T_20 feature: {}".format(sst1_df.T_20.isnull().sum()))
print("Nan values un T_20 feature: {}".format(sst1_df.T_20.isna().sum()))
print("\n----- Checking if non-considered values are making false quality -----")
print("Total values in dataframe: {}".format(len(sst1_df)))
print("Total considered values in T_20 feature: {}".format(len(sst1_df.T_20)))

----- Direct search for None and Nan values -----
Null values un T_20 feature: 0
Nan values un T_20 feature: 0

----- Checking if non-considered values are making false quality -----
Total values in dataframe: 314
Total considered values in T_20 feature: 314


_______
**Comment:** With the professor's indication, the missing data of that year will be generated.
_______

In [24]:
#  Generating date range until 2018-12-31

missing_dates = pd.date_range(start='2018-11-10 12:00:00', end='2018-12-31 12:00:00', freq='d')
missing_depth = [0.0]*len(missing_dates)
missing_lat = [np.nan]*len(missing_dates)
missing_lon = [np.nan]*len(missing_dates)
missing_T20 = [sst1_df.T_20.mean()]*len(missing_dates) # Calculating mean on the run
missing_QT = [1.0]*len(missing_dates)
missing_ST = [5.0]*len(missing_dates)

new_data = {
    'depth':missing_depth,
    'lat':missing_depth,
    'lon':missing_lon,
    'time':missing_dates,
    'T_20':missing_T20,
    'QT_5020':missing_QT,
    'ST_6020':missing_ST
}

new_data = pd.DataFrame(new_data)
new_data.head()

Unnamed: 0,depth,lat,lon,time,T_20,QT_5020,ST_6020
0,0.0,0.0,,2018-11-10 12:00:00,23.744432,1.0,5.0
1,0.0,0.0,,2018-11-11 12:00:00,23.744432,1.0,5.0
2,0.0,0.0,,2018-11-12 12:00:00,23.744432,1.0,5.0
3,0.0,0.0,,2018-11-13 12:00:00,23.744432,1.0,5.0
4,0.0,0.0,,2018-11-14 12:00:00,23.744432,1.0,5.0


In [25]:
# Joining new data with original in a final dataset

sst1_df = sst1_df.append(new_data).sort_index()
sst1_df = sst1_df.reset_index()
sst1_df

Unnamed: 0,index,depth,lat,lon,time,T_20,QT_5020,ST_6020
0,0,0.0,-2.0,265.0,2018-01-01 12:00:00,21.726934,1.0,5.0
1,0,0.0,0.0,,2018-11-10 12:00:00,23.744432,1.0,5.0
2,1,0.0,0.0,,2018-11-11 12:00:00,23.744432,1.0,5.0
3,1,0.0,-2.0,265.0,2018-01-02 12:00:00,21.908794,1.0,5.0
4,2,0.0,-2.0,265.0,2018-01-03 12:00:00,21.714407,1.0,5.0
...,...,...,...,...,...,...,...,...
361,309,0.0,-2.0,265.0,2018-11-06 12:00:00,23.252274,2.0,1.0
362,310,0.0,-2.0,265.0,2018-11-07 12:00:00,23.307045,2.0,1.0
363,311,0.0,-2.0,265.0,2018-11-08 12:00:00,23.375366,2.0,1.0
364,312,0.0,-2.0,265.0,2018-11-09 12:00:00,23.396191,2.0,1.0


In [26]:
# Serializing dataframe
import pickle

saving_path = 'data/serial_clean_sst1.df' # Relative path in my pc

pickle.dump(sst1_df, open(saving_path, 'wb'))

In [27]:
# Loading the data to confirm

loaded_df1 = pickle.load(open(saving_path, 'rb'))

loaded_df1

Unnamed: 0,index,depth,lat,lon,time,T_20,QT_5020,ST_6020
0,0,0.0,-2.0,265.0,2018-01-01 12:00:00,21.726934,1.0,5.0
1,0,0.0,0.0,,2018-11-10 12:00:00,23.744432,1.0,5.0
2,1,0.0,0.0,,2018-11-11 12:00:00,23.744432,1.0,5.0
3,1,0.0,-2.0,265.0,2018-01-02 12:00:00,21.908794,1.0,5.0
4,2,0.0,-2.0,265.0,2018-01-03 12:00:00,21.714407,1.0,5.0
...,...,...,...,...,...,...,...,...
361,309,0.0,-2.0,265.0,2018-11-06 12:00:00,23.252274,2.0,1.0
362,310,0.0,-2.0,265.0,2018-11-07 12:00:00,23.307045,2.0,1.0
363,311,0.0,-2.0,265.0,2018-11-08 12:00:00,23.375366,2.0,1.0
364,312,0.0,-2.0,265.0,2018-11-09 12:00:00,23.396191,2.0,1.0
