In [None]:
!pip install sweetviz


Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sweetviz
Successfully installed sweetviz-2.3.1


In [None]:
import sweetviz as sv
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

def process_data(file_path):
    # Load the dataset
    df = pd.read_csv(file_path)

    # Remove irrelevant rows and columns
    df = df[~df['station'].isin(['?', 'Tz', 'TZ'])]
    df.drop(['Date', 'Bluegreen', 'Green Algae', 'Diatoms', 'Depth_inst',
             'Cryptophyta', 'Yellow substances', 'Transmission'], axis=1, inplace=True)

    # Find combinations of Date_time, station, and depth that appear more than once
    combination_counts = df.reset_index()[['Date_time', 'station', 'depth']].value_counts()
    combinations_more_than_one = combination_counts[combination_counts > 1]
    df_combinations_more_than_one = combinations_more_than_one.reset_index()
    df_combinations_more_than_one.columns = ['Date_time', 'station', 'depth', 'count']

    # Drop duplicates based on the above combinations
    df_without_duplicates = df.drop(df_combinations_more_than_one.index)

    # Filter rows where depth <= 15
    top_15 = df_without_duplicates[df_without_duplicates['depth'] <= 15]

    # Group by 'depth', 'station', and 'Date_time', and get the minimum 'Pressure' for each group
    df_min_pressure = top_15.groupby(['depth', 'station', 'Date_time'], as_index=False)['Pressure'].min()

    # Merge to keep only rows with the minimum pressure values
    filtered_top_15 = pd.merge(top_15, df_min_pressure, on=['depth', 'station', 'Date_time', 'Pressure'], how='inner')

    # Remove duplicate rows if created during the merge
    filtered_top_15 = filtered_top_15.drop_duplicates(subset=['depth', 'station', 'Date_time', 'Pressure'])

    # Find the minimum pressure for each (station, depth) combination
    min_pressure_by_depth_station = filtered_top_15.groupby(['station', 'depth'])['Pressure'].min().reset_index()
    min_pressure_by_depth_station.rename(columns={'Pressure': 'min_Pressure'}, inplace=True)

    # Merge this back with the filtered DataFrame
    filtered_top_15 = pd.merge(filtered_top_15, min_pressure_by_depth_station, on=['station', 'depth'], how='left')

    # Update pressure values to be the minimum pressure for that depth and station
    filtered_top_15['Pressure'] = filtered_top_15['min_Pressure']

    # Drop the extra column used for the minimum pressure
    filtered_top_15.drop(columns=['min_Pressure'], inplace=True)

    return filtered_top_15

# Example usage
file_path = '/content/drive/MyDrive/DSSG/FP_2017_2023_DS_Project.csv'
cleaned_data = process_data(file_path)
print(cleaned_data[['depth', 'station', 'Date_time', 'Pressure']])


       depth station   Date_time  Pressure
0       0.06       K   6/11/2017      1.16
1       0.12       K   6/11/2017      1.16
2       0.10       K   6/11/2017      1.16
3       0.15       K   6/11/2017      1.17
4       0.03       K   6/11/2017      1.15
...      ...     ...         ...       ...
77170  12.00       K  12/31/2023      2.33
77171  12.75       K  12/31/2023      2.40
77172  13.44       K  12/31/2023      2.47
77173  14.16       K  12/31/2023      2.54
77174  14.94       K  12/31/2023      2.62

[77175 rows x 4 columns]


In [None]:
def create_station_subsets(cleaned_data):
    # Get the unique stations
    unique_stations = cleaned_data['station'].unique()

    # Create a dictionary to store subsets for each station
    station_subsets = {}

    # Loop over each unique station and create a subset
    for station in unique_stations:
        station_subset = cleaned_data[cleaned_data['station'] == station]
        station_subsets[station] = station_subset

    return station_subsets

# Example usage
station_subsets = create_station_subsets(cleaned_data)



In [9]:
import pandas as pd
import sweetviz as sv

for key in station_subsets.keys():
  df = station_subsets[key]
  # Create a Sweetviz report
  report = sv.analyze(df)
  # Display the report in the notebook or save it to an HTML file
  report.show_html(f'EDA_report_for_station{key}.html')
  #save the report in my google drive
  report.show_html(f'/content/drive/MyDrive/DSSG/Sweetviz_EDA_report_for_station{key}.html')


                                             |          | [  0%]   00:00 -> (? left)

Report EDA_report_for_stationK.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Report /content/drive/MyDrive/DSSG/Sweetviz_EDA_report_for_stationK.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |          | [  0%]   00:00 -> (? left)

Report EDA_report_for_stationG.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Report /content/drive/MyDrive/DSSG/Sweetviz_EDA_report_for_stationG.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |          | [  0%]   00:00 -> (? left)

Report EDA_report_for_stationH.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Report /content/drive/MyDrive/DSSG/Sweetviz_EDA_report_for_stationH.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |          | [  0%]   00:00 -> (? left)

Report EDA_report_for_stationA.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Report /content/drive/MyDrive/DSSG/Sweetviz_EDA_report_for_stationA.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |          | [  0%]   00:00 -> (? left)

Report EDA_report_for_stationD.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Report /content/drive/MyDrive/DSSG/Sweetviz_EDA_report_for_stationD.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
