In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Load the four ICES source CSV files, one per export folder.
# Paths are relative to the notebook's working directory. Forward slashes are
# used because they resolve on Windows as well as Linux/macOS, whereas the
# previous backslash-separated paths only resolved on Windows.
df1 = pd.read_csv('ICES_bottlelowresctd/bottle_lowres_ctd/bottle_lowres_temp.csv')
df2 = pd.read_csv('ICES_Expendabledatac/xbt_data/xbt_data_surfaceonly.csv')
df3 = pd.read_csv('ICES_high_resolution/high_res_ctd/high_res_data.csv')
df4 = pd.read_csv('ICES_OceanSurfacedat/Surface_data/oceansurfacedat_alter.csv')

In [None]:
# Stack the four source tables row-wise into a single frame. ignore_index=True
# discards each file's own row labels and renumbers the result from 0.
ices_merged_data = pd.concat(
    objs=[df1, df2, df3, df4],
    axis=0,
    ignore_index=True,
)

In [None]:
# Persist the merged table to disk without writing the row index as a column.
ices_merged_data.to_csv('ices_merged_data.csv', index=False)

In [None]:
# Read the merged file back from disk; the inspection cells below work on this copy.
df = pd.read_csv('ices_merged_data.csv')

In [None]:
# (rows, columns) of the merged dataset as reloaded from ices_merged_data.csv.
df.shape

In [None]:
# Preview the first rows to check column names and value formats.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts; shows how many values are missing per column.
df.info()

In [None]:
# Discard every row whose temperature reading is missing.
# `blank` is a boolean mask aligned with ices_merged_data: True where
# 'Temperature [degC]' is null. Only this column is checked; nulls in other
# columns do not cause a row to be removed.
blank = ices_merged_data['Temperature [degC]'].isna()
ices_merged_data = ices_merged_data.loc[~blank]

In [None]:
# (rows, columns) after the rows with a null temperature were dropped.
ices_merged_data.shape

In [None]:
# Write the null-free table to the working EDA file, omitting the row index.
ices_merged_data.to_csv('ices_merged_eda.csv', index=False)

In [None]:
# Rebind df to the null-free data just written to ices_merged_eda.csv.
df = pd.read_csv('ices_merged_eda.csv')

In [None]:
# (rows, columns) of the data reloaded from ices_merged_eda.csv.
df.shape

In [None]:
# Dtypes and non-null counts of the reloaded EDA data.
df.info()

In [None]:
# Collect every row that has an exact twin elsewhere in the frame. All columns
# are compared, and keep=False flags every member of a duplicate group rather
# than only the repeats.
duplicates = df.loc[df.duplicated(keep=False)]

# Report what was found; nothing is removed from df in this cell.
if duplicates.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate rows found:")
    print(duplicates)

In [None]:
# Row count before de-duplication (the previous cell only reports duplicates).
df.shape

### Removal of duplicates

In [None]:
# Collapse fully identical rows (all columns compared), retaining the first
# occurrence of each; 'first' is also the pandas default for `keep`.
df_no_duplicates = df.drop_duplicates()

# Overwrite the working EDA file with the de-duplicated rows, without the row index.
df_no_duplicates.to_csv('ices_merged_eda.csv', index=False)

In [None]:
# Rebind df to the de-duplicated data just written to ices_merged_eda.csv.
df = pd.read_csv('ices_merged_eda.csv')

In [None]:
# Row count after de-duplication; compare with the shape shown before the removal.
df.shape

In [None]:
# Box plot of the temperature column, used to eyeball outliers.
plt.figure(figsize=(18, 10))
sns.set(style="darkgrid")
plt.rcParams.update({'font.size': 20, 'font.weight': 'bold'})

# seaborn draws on the current Axes and returns it, so labels are set on that object.
ax = sns.boxplot(
    x=df['Temperature [degC]'],
    width=0.5,
    linewidth=2,
    color='darkgoldenrod',
)
ax.set_xlabel('Temperature', fontsize=14, fontweight='bold')
ax.set_title('Boxplot of Temperature', fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Histogram of the temperature column, showing its overall distribution
# across 5 equal-width bins.
plt.figure(figsize=[18,10])
plt.rcParams.update({'font.size': 20, 'font.weight': 'bold'})
plt.hist(df['Temperature [degC]'], bins=5, color='darkgoldenrod')
plt.xlabel('Temperature', fontsize=14, fontweight='bold')
plt.ylabel('Count', fontsize=14, fontweight='bold')
# Title added so the figure is self-describing, matching the boxplot cell.
plt.title('Histogram of Temperature', fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Parse the 'Date' column (read from CSV, so not yet datetime) into pandas
# datetimes so that the .dt accessor can be used on it later.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [None]:
# Re-check dtypes after the Date column was passed through pd.to_datetime.
df.info()

In [None]:
#Data points after merging all the data (ICES)
import cartopy.crs as ccrs
import cartopy as cart
import cartopy.feature as cfeature
%matplotlib inline


# Map of every sampling position in the merged dataset on a PlateCarree projection.
fig = plt.figure(figsize=(18, 10))
ax = plt.axes(projection=ccrs.PlateCarree())

# Longitude/latitude columns are plotted with a PlateCarree transform.
ax.scatter(
    df['Longitude [degrees_east]'],
    df['Latitude [degrees_north]'],
    s=10,
    color='darkgoldenrod',
    marker='*',
    transform=ccrs.PlateCarree(),
)

# Background layers. LAND is given a high zorder, so it is drawn above the
# scatter layer and hides any points that fall on land.
ax.add_feature(cfeature.LAND, zorder=100, edgecolor='k')
ax.add_feature(cfeature.OCEAN)
ax.add_feature(cfeature.COASTLINE)
ax.add_feature(cfeature.BORDERS, linestyle=':')

ax.set_ylabel('Latitude')
ax.set_xlabel('Longitude')

plt.show()

### Spatial plots (one map per YYYY/MM, showing that month's daily data points)

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Draw one map per (year, month) with the sampling positions coloured by
# temperature, and save each map as mergedTemperature<YYYY>_<MM>.png.

# Reload the cleaned dataset so this cell starts from the file on disk.
df = pd.read_csv('ices_merged_eda.csv')

# Parse the Date column so the .dt accessor can be used below.
df['Date'] = pd.to_datetime(df['Date'])

# Derive the year and month grouping keys from the parsed dates.
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

# One group per calendar month that has data.
grouped_data = df.groupby(['Year', 'Month'])

# Map extent passed to set_extent: [lon_min, lon_max, lat_min, lat_max] in degrees.
bounding_box = [-14, -5, 49, 56]

# Figure size (width, height) in inches.
figure_size = (15, 8)

for (year, month), group in grouped_data:
    # The group keys are floats when any Date failed to parse (NaN forces a
    # float dtype); cast so the ':02d' format specs below cannot raise.
    year, month = int(year), int(month)

    fig = plt.figure(figsize=figure_size)
    ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=':')

    # State the coordinate system of the bounding box explicitly instead of
    # relying on set_extent's default.
    ax.set_extent(bounding_box, crs=ccrs.PlateCarree())

    scatter = ax.scatter(
        group['Longitude [degrees_east]'],
        group['Latitude [degrees_north]'],
        c=group['Temperature [degC]'],
        cmap='coolwarm',
        transform=ccrs.PlateCarree()
    )

    # The point colours encode temperature, so a colorbar is needed to read
    # them. The colour scale is fitted per figure, not shared across months.
    fig.colorbar(scatter, ax=ax, label='Temperature [degC]')

    ax.set_title(f"Temperature Data Points for {year}-{month:02d}",fontsize=15,fontweight='bold')

    # Save before showing: some backends clear the figure on show().
    fig.savefig(f"mergedTemperature{year}_{month:02d}.png")

    plt.show()

    # Release the figure so memory does not grow with the number of months.
    plt.close(fig)

### (X,Y) plots for each YYYY/MM, X: Day of Month, Y: Temperature

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# For every year, draw a 3x4 grid of scatter plots (one panel per month) with
# day of month on the x-axis and temperature on the y-axis.
# This cell uses df['Date'] through the .dt accessor, so the column must
# already hold datetimes when the cell runs.

# Derive the year and month grouping keys from the parsed dates.
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

# Global temperature range, used as a common y-axis range for every panel.
global_min_temp = df['Temperature [degC]'].min()
global_max_temp = df['Temperature [degC]'].max()

year_grouped_data = df.groupby('Year')

# Figure size (width, height) in inches.
figure_size = (20, 12)

for year, year_group in year_grouped_data:
    # The group key is a float when any Date is missing (NaN forces a float
    # dtype); cast so pd.Timestamp and the f-strings below get a plain int.
    year = int(year)

    # The x-axes are deliberately NOT shared. With sharex='col' the three
    # months in a column share one tick locator/formatter, so each
    # set_xticks/set_xticklabels call below would overwrite the previous
    # month's date labels and the upper rows would show no labels at all.
    fig, axes = plt.subplots(nrows=3, ncols=4, figsize=figure_size, sharey='row')
    axes = axes.flatten()

    month_grouped_data = year_group.groupby('Month')

    for month in range(1, 13):
        # Panels are filled row by row: index 0 is January, index 11 is December.
        ax = axes[month - 1]

        # Months without data keep an empty but fully labelled panel.
        if month in month_grouped_data.groups:
            group = month_grouped_data.get_group(month).sort_values('Date')
            ax.scatter(group['Date'].dt.day, group['Temperature [degC]'], color='darkgoldenrod', marker='o')

        ax.set_title(f"{year}-{month:02d}",fontsize=15, fontweight='bold')
        ax.set_xlabel('Day of the Month', fontsize=15, fontweight='bold')
        ax.set_ylabel('Temperature [degC]', fontsize=15, fontweight='bold')

        # Every calendar day of this month; index i holds day i + 1.
        days_in_month = pd.Timestamp(year, month, 1).days_in_month
        date_range = pd.date_range(start=f"{year}-{month:02d}-01", periods=days_in_month)

        # Eight roughly evenly spaced day numbers from 1 to the last day of the month.
        tick_values = np.linspace(1, days_in_month, num=8, dtype=int)

        # Tick positions are day numbers; labels are the matching full dates.
        ax.set_xticks(tick_values)
        ax.set_xticklabels(date_range[tick_values - 1].strftime('%Y-%m-%d'), rotation=45, ha='right', fontsize=15)

        # Same y-range on every panel so months and years are directly comparable.
        ax.set_ylim(global_min_temp, global_max_temp)

    plt.tight_layout()

    # Saving is left disabled, as in the original cell.
    #plt.savefig(f"Temperature_{year}.png")

    plt.show()

    # Release the figure so memory does not grow with the number of years.
    plt.close(fig)
