# Top

* [Setup](#Setup)
* [Get Random Samples (with training features)](#Get-Random-Samples-(with-training-features))
* [Agglomerate annotated subfiles to create RS dataset](#Agglomerate-annotated-subfiles-to-create-RS-dataset)
* [Plot Random Sampling labeled data](#Plot-Random-Sampling-labeled-data)
* [Statistics of RS labeled set](#Statistics-of-RS-labeled-set)

[Bottom](#Bottom)

______________________

# Setup

[Back to Top](#Top)

In [None]:
##
##/////////////////////////////////////
## Imports and Util

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from tabulate import tabulate

import geopandas as gpd
import rioxarray
import rasterio as rio
from shapely.geometry import Point, box

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedGroupKFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score

from joblib import load
import os
import re

import random

# Visualization parameters for the binary (cashew / non-cashew) setting.
# NOTE(review): the palette lists 7 colours while class_names has 2 entries and
# vmin/vmax span 1-2 — confirm which colours are actually meant to be used here.
palette = ['#006600', '#99ff33', '#2d8659', '#c6538c', '#808000', '#804000', '#0000ff']
class_names = ['Cashew', 'Non-cashew']
# Keyword arguments intended to be unpacked into a matplotlib image call (see plt.imshow(..., **vis_params) later on).
vis_params = {
        "cmap": ListedColormap(palette),
        "vmin": 1,
        "vmax": 2,
        "alpha": 0.9,
}

# All relative paths in this notebook are resolved against the current working directory.
base_directory = os.getcwd()
# CSV with per-band statistics (min / max among others) used for normalization.
# NOTE(review): despite its name, normalizer_df holds a file PATH, not a dataframe;
# the file name is left blank ("") here and must be filled in before running.
normalizer_df = os.path.join(base_directory, "")
norm = pd.read_csv(normalizer_df)
# Keep only the columns used downstream; "band" is matched against feature/band names.
norm = norm[["band","mean","std","min","max"]]

In [None]:
##
##/////////////////////////////////////
## Auxiliary functions to the important ones

def get_normalized_X(X, norm):
    """
    Min-max scale the band columns of a feature dataframe.

    X: feature dataframe; it is copied, never modified.
    norm: dataframe with columns "band", "min" and "max"; every value in "band"
          must be a column of X.
    Returns: a deep copy of X where each listed band is mapped to (value - min) / (max - min).
    """
    scaled = X.copy(deep=True)
    for band_name in norm["band"]:
        #rows of the normalization table describing this band (the first match is used)
        band_rows = norm["band"] == band_name
        lower = norm.loc[band_rows, "min"].iloc[0]
        upper = norm.loc[band_rows, "max"].iloc[0]
        scaled[band_name] = (scaled[band_name] - lower) / (upper - lower)
    return scaled

##############################################

def transform_y_2classes(y):
    """
    Collapse the 7-class labels into the binary cashew / non-cashew labels.

    y: array or Series of class labels in which 5 means cashew.
    Returns: integer Series with 1 where y == 5 (cashew) and 2 everywhere else (non-cashew).
    """
    #1 where the label is cashew, 0 otherwise
    cashew_flag = pd.Series(y == 5, dtype="int")
    #2 - 1 = 1 keeps cashew as class 1, 2 - 0 = 2 sends every other label to class 2
    return 2 - cashew_flag

##############################################

def balanced_accuracy_scorer(estimator, X_true, y_true):
    """
    Scorer with the (estimator, X, y) signature: balanced accuracy of the estimator's predictions.

    estimator: fitted model exposing predict().
    X_true: features to predict on.
    y_true: ground-truth labels for X_true.
    Returns: balanced_accuracy_score(y_true, y_pred).
    Prints a warning when the set of predicted classes differs from the set of true classes.
    """
    y_pred = estimator.predict(X_true)
    pred_classes = np.unique(y_pred)
    true_classes = np.unique(y_true)
    #array_equal also copes with class sets of different sizes, where an elementwise != cannot be evaluated
    if not np.array_equal(pred_classes, true_classes):
        print("Classes in true labels and predictions mismatch!")
        print("Unique preds: ", pred_classes)
        print("Unique trues", true_classes)
    return balanced_accuracy_score(y_true,y_pred)

def f1_cashew_scorer(estimator, X_true, y_true):
    """
    Return F1 score for cashew. Adapted to the 7-class system where cashew is class 5, or to the binary system where cashew is class 1

    estimator: fitted model exposing predict().
    X_true: features to predict on.
    y_true: ground-truth labels for X_true.
    Returns: the cashew F1 score, or -1 when the per-class score vector has fewer than 5 entries
             in the multi-class case.
    Prints a warning when the set of predicted classes differs from the set of true classes.
    """
    y_pred = estimator.predict(X_true)
    pred_classes = np.unique(y_pred)
    true_classes = np.unique(y_true)
    #array_equal also copes with class sets of different sizes, where an elementwise != cannot be evaluated
    if not np.array_equal(pred_classes, true_classes):
        print("Classes in true labels and predictions mismatch!")
        print("Unique preds: ", pred_classes)
        print("Unique trues", true_classes)

    #one F1 value per label present in y_true/y_pred, in sorted label order
    per_class_f1 = f1_score(y_true,y_pred,average=None)
    if true_classes.shape[0] == 2:
        return per_class_f1[0] #assumes 1st class is cashew, 2nd is non
    #NOTE(review): position 4 is cashew only if classes 1-4 all appear in y_true/y_pred — confirm for small folds
    try:
        return per_class_f1[4] #in the 7 total classes, cashew is 5th
    except IndexError: #other number of classes, could be just 1, if cashew doesn't appear for example
        return -1

##############################################

def plot_multipolygon_boundaries(multi,color="black",linewidth=0.2,alpha=1):
    """
    Works with polygons or multipolygons, plots just the outlines of the shape

    multi: geometry to outline; either a multi-part geometry (exposing .geoms) or a single polygon.
    color, linewidth, alpha: forwarded to plt.plot for every outline.
    Only the exterior ring of each part is drawn; parts without an exterior are skipped.
    """
    #a single polygon has no .geoms attribute, so treat it as a one-element collection
    parts = getattr(multi, "geoms", None)
    if parts is None:
        parts = [multi]
    for geom in parts:
        exterior = getattr(geom, "exterior", None)
        if exterior is None: #e.g. a line or point inside a mixed collection
            continue
        xs, ys = exterior.xy
        plt.plot(xs,ys, color=color, linewidth=linewidth,alpha=alpha)

##############################################

def filterOverlappingPoints(df1, df2):
    """
    Returns: the index labels of the df1 rows whose (x, y) position does not appear in df2.

    df1, df2: dataframes with "x" and "y" columns.
    The labels come from df1.index, so they can be passed straight to df1.loc[...].
    When df2 is empty every label of df1 is returned.
    """
    if df2.empty:
        return df1.index
    xy_columns = ["x", "y"]
    #compare positions as (x, y) pairs; unlike a merge this never duplicates rows nor renumbers df1
    df1_positions = pd.MultiIndex.from_frame(df1[xy_columns])
    df2_positions = pd.MultiIndex.from_frame(df2[xy_columns])
    return df1.index[~df1_positions.isin(df2_positions)]

##############################################

def find_latest_file(directory):
    '''
    Get directory+filename of a file in a directory with highest number.
    Filenames follow the form: someName_nº.extension
    Generalization of savePickle.py's find_latest_pickle()

    Entries whose suffix is not an integer (for example hidden files or sub-folders) are ignored.
    Returns: (path, number) of the entry with the highest suffix number, or (None, -1) when
             the directory holds no entry following the naming scheme.
    '''
    latest_name, latest_number = None, None
    for name in os.listdir(directory):
        #the number sits between the last "_" and the first "." that follows it
        try:
            number = int(name.split("_")[-1].split(".")[0])
        except ValueError:
            continue
        #strict comparison keeps the first entry found when two share the same number
        if latest_number is None or number > latest_number:
            latest_name, latest_number = name, number

    if latest_name is None:
        return None, -1
    return os.path.join(directory, latest_name), latest_number

##############################################

def find_n_lowest_difference_index(df, n_points):
    """
    Given a pandas dataframe df with column "Difference", return the indexes of points with lowest "Difference" between first and second classes

    df: dataframe with a numeric "Difference" column; it is left untouched.
    n_points: how many rows to select.
    Returns: index labels of the n_points rows with the smallest "Difference", smallest first.
    """
    #nsmallest selects the rows directly, so the caller's column never has to be sign-flipped
    return df.nsmallest(n_points, "Difference").index

###############################################

def get_files_in_directory(directory_path):
    """
    List the names of the regular files located directly inside directory_path.

    Sub-directories are left out; the names are returned without the directory prefix.
    """
    regular_files = []
    for entry_name in os.listdir(directory_path):
        #keep the entry only when it resolves to a file, not to a folder
        if os.path.isfile(os.path.join(directory_path, entry_name)):
            regular_files.append(entry_name)
    return regular_files

###############################################

def extract_number_from_filename(file_name):
    """
    Return the integer that ends a file name, right before its (optional) extension.

    Returns None when the name does not end with digits followed by at most one extension.
    """
    trailing_number = re.search(r'(\d+)(\.\w+)?$', file_name)
    return int(trailing_number.group(1)) if trailing_number else None

____

# Get Random Samples (with training features)

[Back to top](#Top)

In [None]:
import os
import random
import rasterio
import pandas as pd
#seed Python's random module for this cell; random_coefs_from_geotiffs re-seeds it with its own random_seed argument
random.seed(42)

def geotiff_to_df(image_path, column_order=None, standardize=False, norm_dataframe=None):
    """
    Converts a geotiff file into a pandas dataframe with one row per pixel.
    CORRECTED v3 - changed standardization to be global (using min max parameters from the entire region of interest) instead of batch-based normalization.
    Assumes the images are on crs "EPSG:4326".

    image_path: path of the geotiff to read.
    column_order: the order of bands in xds is different than the band order in model training. This parameter is to give the correct training column order.
    standardize: when truthy, every band is min-max scaled with the parameters found in norm_dataframe.
    norm_dataframe: dataframe with columns "band", "min" and "max"; required when standardize is truthy.

    Returns: dataframe with one column per band plus "x" and "y" pixel coordinates.
             Single-band rasters return columns "x", "y", "band_0" and are never normalized.
    Raises: ValueError when standardize is requested without norm_dataframe.
    """
    xds = rioxarray.open_rasterio(image_path, masked=True)
    x_coords, y_coords = xds.coords['x'].values, xds.coords['y'].values
    #each band has format (y_size, x_size) and flatten() walks it row by row,
    #so x cycles through every column while y stays constant along a row
    pixel_x = np.tile(x_coords, len(y_coords))
    pixel_y = np.repeat(y_coords, len(x_coords))

    #for geotiffs with only one band, typically the class band, simpler transformation, without normalization
    if xds.shape[0] == 1:
        return pd.DataFrame({'x': pixel_x,
                             'y': pixel_y,
                             'band_0': xds[0].values.flatten()})

    if standardize and norm_dataframe is None:
        raise ValueError("standardize=True requires norm_dataframe with 'band', 'min' and 'max' columns")

    names = xds.attrs["long_name"]
    df_final = pd.DataFrame(columns=names)

    #correct column order if needed
    if column_order is not None:
        df_final = df_final[column_order]

    for i, name in enumerate(names):
        values = xds[i].values.flatten() #1d array with size (y_size * x_size)
        if standardize:
            #look the band up in the table that was passed in, not in a notebook-level global
            norm_params = norm_dataframe.loc[norm_dataframe["band"]==name]
            band_min = norm_params["min"].iloc[0] #iloc[0] due to FutureWarning; though pd.Series only has 1 element
            band_max = norm_params["max"].iloc[0]
            values = (values - band_min) / (band_max - band_min)
        df_final[name] = values

    #merge lon and lat columns
    if "x" not in df_final.columns:
        df_final["x"] = pixel_x
        df_final["y"] = pixel_y

    return df_final

###############################################

def obtain_patch_coordinates(tif_to_df_unique_x, tif_to_df_unique_y, point_coordinates, polygonID, verbose=False):
    """
    Given coordinates for a point (central pixel), returns a dataframe with the coordinates of the
    central pixel and of its direct neighboring pixels (8 of them away from the raster borders).
    Works for a list of point_coordinates.

    tif_to_df_unique_x, tif_to_df_unique_y: sorted 1d arrays with the possible raster x and y coordinates.
    point_coordinates: dataframe with "x" and "y" columns, one row per central pixel; every value
                       must be present in the corresponding coordinate array.
    polygonID: id given to the first patch; it is incremented by one per patch. Every patch has a
               unique polygonID for identification and data split purposes.
    verbose: when True, prints a marker whenever a central pixel lies on a raster border.

    Returns: (dataframe with columns "x", "y", "polygonID", next unused polygonID).
             polygonID is also returned to keep track of the change in unique patches.
    """
    def _centre_and_neighbours(axis_values, centre_value, first_border_msg, last_border_msg):
        #coordinates of the central pixel plus its existing neighbours along one axis (centre, next, previous)
        centre = np.where(axis_values == centre_value)[0][0] #double [0] to obtain the index
        last = axis_values.shape[0] - 1
        if verbose:
            if centre == 0:
                print(first_border_msg)
            elif centre == last:
                print(last_border_msg)
        picked = [axis_values[centre]]
        #on a border (or on a one-pixel-wide raster) the missing neighbour is simply left out
        if centre < last:
            picked.append(axis_values[centre + 1])
        if centre > 0:
            picked.append(axis_values[centre - 1])
        return picked

    final_x = []
    final_y = []
    final_polygonID = []
    for x_centre, y_centre in zip(point_coordinates["x"], point_coordinates["y"]):
        x_values = _centre_and_neighbours(tif_to_df_unique_x, x_centre, "entered 1", "entered 2")
        y_values = _centre_and_neighbours(tif_to_df_unique_y, y_centre, "entered 4", "entered 5")
        #every (x, y) combination of the selected columns and rows forms the patch
        xy_coords = np.array(np.meshgrid(x_values, y_values)).T.reshape(-1,2) #from https://stackoverflow.com/questions/1208118/using-numpy-to-build-an-array-of-all-combinations-of-two-arrays
        final_x.append(xy_coords[:,0])
        final_y.append(xy_coords[:,1])
        final_polygonID.append(np.full(xy_coords.shape[0], polygonID))
        polygonID += 1

    #no central pixels: np.concatenate cannot work on empty lists, return an empty frame instead
    if not final_x:
        empty_patches = pd.DataFrame({"x": np.array([], dtype=float),
                                      "y": np.array([], dtype=float),
                                      "polygonID": np.array([], dtype=int)})
        return empty_patches, polygonID

    df_return = pd.DataFrame({"x": np.concatenate(final_x), "y": np.concatenate(final_y), "polygonID": np.concatenate(final_polygonID)})
    return df_return, polygonID

###############################################

def random_coefs_from_geotiffs(geotiffs_path, num_points_per_geotiff=1000, num_indexes_to_return=None, check_inside_multipolygon=True,
                               column_order=None, standardize=None, norm_dataframe=None, random_seed=0, polygonID=1000):
    """
    Randomly selects central pixels from every geotiff found under a directory and returns them
    together with their direct neighboring pixels (patches).

    geotiffs_path (str): path to the directory containing the geotiff files (searched recursively).
    num_points_per_geotiff (int): number of central pixels to randomly select from each geotiff
                                  (capped at the number of pixels of the geotiff).
    num_indexes_to_return (int): if given, number of rows randomly kept from the final dataframe.
    check_inside_multipolygon (bool): if True, keeps only the pixels inside the region of interest.
    column_order, standardize, norm_dataframe: forwarded to geotiff_to_df.
    random_seed (int): seed of the random sample selection, for reproducibility.
    polygonID (int): id given to the first patch; each following patch gets the next integer.

    Returns:
    dataframe with the band columns of the selected pixels plus "x", "y", "polygonID" and "geotiff",
    the latter holding the number extracted from the original geotiff file name.
    An empty dataframe is returned when no geotiff is found.
    """
    random.seed(random_seed) #stipulate seed of random sample selection for reproducibility
    patches_per_geotiff = []
    for root, _dirs, files in os.walk(geotiffs_path):
        for file in files:
            if not file.endswith((".tif", ".tiff")): #skip anything that is not a GeoTIFF
                continue
            file_path = os.path.join(root, file)
            print(f"Processing GeoTIFF: {file}")
            #transform geotiff into dataframe
            tif_to_df = geotiff_to_df(file_path, column_order=column_order, standardize=standardize, norm_dataframe=norm_dataframe)
            #sample a number of CENTRAL PIXELS; random.sample raises if asked for more items than available
            num_central_pixels = min(num_points_per_geotiff, tif_to_df.shape[0])
            selected_indexes = random.sample(range(tif_to_df.shape[0]), num_central_pixels)
            tif_to_df_selected = tif_to_df.iloc[selected_indexes]
            #also get the neighboring pixels of each selected central pixel (typically working with just the 8 direct neighboring pixels)
            xypatches, polygonID = obtain_patch_coordinates(np.unique(tif_to_df["x"]), np.unique(tif_to_df["y"]), tif_to_df_selected, polygonID, verbose=False)
            tif_to_df_patches = pd.merge(tif_to_df, xypatches[["x","y","polygonID"]], on=["x","y"], how="inner")
            tif_to_df_patches["geotiff"] = extract_number_from_filename(file)
            #keep pixels (central + neighbors) of this geotiff; everything is concatenated once after the loop
            patches_per_geotiff.append(tif_to_df_patches)
            print("OG tif_df file size: ", tif_to_df.shape)
            print("selected indices: ", tif_to_df_selected.shape)
            print("Size with patches: ", tif_to_df_patches.shape)
            print("polygonID: ", polygonID)

    #nothing to sample from: there are no "x"/"y" columns to filter on
    if not patches_per_geotiff:
        return pd.DataFrame()
    final_df = pd.concat(patches_per_geotiff, ignore_index=True)

    #safety check if pixels are inside ROI multipolygon
    #NOTE(review): point_inside_multi_polygon and multi_polygon are notebook-level names that are not
    #passed in as arguments; they must be defined before this function is called
    if check_inside_multipolygon:
        print("Before inside_multi_polygon: ", final_df.shape)
        inside_multi_polygon = [point_inside_multi_polygon(x, y, multi_polygon) for x, y in zip(final_df["x"], final_df["y"])]
        final_df = final_df.loc[inside_multi_polygon]
        print("After inside_multi_polygon: ", final_df.shape)

    #final cut of points from final_df, if shape is still higher than the desired number of samples to annotate
    final_df = final_df.reset_index(drop=True)
    if num_indexes_to_return is not None:
        print("num_indexes_to_return is not None, cutting until desired number")
        #NOTE(review): rows (pixels) are sampled here, not whole patches, so patches may end up incomplete
        num_rows_to_keep = min(num_indexes_to_return, len(final_df))
        selected_indexes = random.sample(range(len(final_df)), num_rows_to_keep)
        final_df = final_df.iloc[selected_indexes].reset_index(drop=True)

    return final_df

* __Obtain randomly selected patches (before checking inside multipolygon)__

In [None]:
%%time

#path of the GeoJSON with the region of interest (left blank here, fill in before running)
geojson_file = ""
gdf = gpd.read_file(geojson_file)

#single geometry with the union of every feature; random_coefs_from_geotiffs reads it as the global multi_polygon
multi_polygon = gdf.unary_union

base_directory = os.getcwd()
#directory with the geotiffs to sample from (sub-path left blank here)
geotiffs_path = os.path.join(base_directory, "")
#NOTE(review): random_path is not used anywhere in the visible part of this notebook
random_path = os.path.join(base_directory, "")

#10 central pixels per geotiff, no final cut of the number of rows
num_points_per_geotiff = 10
num_indexes_to_return = None
#polygonID=1000 is the id given to the first sampled patch; pixels outside the multipolygon are filtered out
random_points_df = random_coefs_from_geotiffs(geotiffs_path, num_points_per_geotiff, num_indexes_to_return, random_seed=0, polygonID = 1000, check_inside_multipolygon=True)
print("Randomly selected points DataFrame:")
print(random_points_df)

* __Split dataframe into multiple smaller dataframes__

__Disclaimer:__ This step is not strictly necessary. However, the randomly selected samples are later imported into Google Earth Pro, which does not handle large datasets well, so splitting them into smaller sub-files lets us import and annotate one fraction of the samples at a time.

In [None]:
def split_dataframe_by_polygon(df, num_subfiles, output_folder=None, output_folder_gep=None):
    """
    Shuffle polygonID's, and get random patches for each sub-file.
    Subfiles are stored in two different folders:
    output_folder_gep, where files will be imported into Google Earth Pro for the class annotation process and will thus be edited;
    output_folder, safety save where files will not be edited

    df: dataframe with a "polygonID" column (plus "x", "y" and "geotiff" when output_folder_gep is given).
    num_subfiles: number of sub-files to create (at least 1). Patches are split evenly and the
                  last sub-file also receives the remaining ones.
    output_folder: folder receiving the full sub-dataframes; nothing is written there when None.
    output_folder_gep: folder receiving only the "x", "y", "polygonID", "geotiff" columns; nothing is written there when None.

    Files are named randomsub_<i>.csv and keep the row index as first column.
    The shuffle uses Python's random module, seed it beforehand for reproducibility.
    Returns: None.
    Raises: ValueError when num_subfiles is lower than 1.
    """
    if num_subfiles < 1:
        raise ValueError("num_subfiles must be at least 1")

    #create output folders if needed
    if output_folder is not None:
        os.makedirs(output_folder, exist_ok=True)
    if output_folder_gep is not None:
        os.makedirs(output_folder_gep, exist_ok=True)

    #shuffle randomly selected patches (uniquely identified by polygonID) and split them throughout a number of sub-files
    unique_polygon_ids = list(df['polygonID'].unique())
    random.shuffle(unique_polygon_ids)
    samples_per_subfile = len(unique_polygon_ids) // num_subfiles
    for i in range(num_subfiles):
        #calculate start and end indices for slicing
        start_index = i * samples_per_subfile
        #if it's the last subfile, include remaining IDs
        end_index = None if i == num_subfiles - 1 else start_index + samples_per_subfile

        subset_polygon_ids = unique_polygon_ids[start_index:end_index]
        sub_df = df[df['polygonID'].isin(subset_polygon_ids)].reset_index(drop=True)

        subfile_name = f'randomsub_{i+1}.csv'
        #a folder left as None is simply skipped
        if output_folder is not None:
            sub_df.to_csv(os.path.join(output_folder, subfile_name))
        if output_folder_gep is not None:
            sub_df[["x", "y", "polygonID", "geotiff"]].to_csv(os.path.join(output_folder_gep, subfile_name))

    if output_folder is not None:
        print(f"{num_subfiles} sub-files created successfully in '{output_folder}' folder.")
    if output_folder_gep is not None:
        print(f"{num_subfiles} sub-files created successfully in '{output_folder_gep}' folder.")
    return

In [None]:
#folders receiving the sub-files (left blank here, fill in before running)
output_folder = ""
output_folder_gep = ""
#random_points_df was already restricted to the multipolygon by random_coefs_from_geotiffs(check_inside_multipolygon=True)
#split_dataframe_by_polygon only writes files and returns None, so test_df is always None
test_df = split_dataframe_by_polygon(random_points_df, 30, output_folder=output_folder, output_folder_gep=output_folder_gep)

___

# Agglomerate annotated subfiles to create RS dataset

[Back to top](#Top)

In [None]:
directory_with_classes = os.path.join(base_directory, "") #directory of GEP imported files, where we added a Class column
directory_with_coefs = os.path.join(base_directory, "") #directory with the dataframes with training coefficients

#labeled sub-files are collected here and concatenated once, after the loop
labeled_subfiles = []

for file in os.listdir(directory_with_classes):
    file_classes = os.path.join(directory_with_classes, file)
    classes_df = pd.read_csv(file_classes, encoding="latin-1") #due to special characters like "?"
    #if dataset has no "Class" column, we haven't labeled this sub-file yet, ignore it
    if "Class" not in classes_df.columns:
        continue
    #"Unnamed: 0" holds the row index written when the sub-file was saved
    if not (classes_df["Unnamed: 0"] == classes_df.index).all(): #order rows has been altered, revert it
        classes_df = classes_df.sort_values(by="Unnamed: 0", axis=0, ignore_index=True)

    #add class and class certainty columns to coefs_df (the sub-file with the same name in the coefficients directory)
    #NOTE(review): columns are matched by row position, which assumes no row was added or removed while annotating
    file_coefs = os.path.join(directory_with_coefs, file)
    coefs_df = pd.read_csv(file_coefs)
    coefs_df["Class"] = classes_df["Class"]
    coefs_df["Pred certainty"] = classes_df["Pred certainty"]

    labeled_subfiles.append(coefs_df)

#concatenate to final manually labeled dataset; stays empty while no sub-file has been labeled
agg_df = pd.concat(labeled_subfiles, ignore_index=True) if labeled_subfiles else pd.DataFrame()
agg_df = agg_df.drop(columns=["Unnamed: 0"], errors="ignore")
#save fully labeled Random Sampling dataset!!!
rs_save_file=os.path.join(base_directory, "")
agg_df.to_pickle(rs_save_file)

____

## __Plot Random Sampling labeled data__

[Back to top](#Top)

In [None]:
#reload the labeled Random Sampling dataset saved by the previous section
rs_df = pd.read_pickle(rs_save_file)

#samples that were observed but could not be attributed a class were labeled with -1. Clean dataset of those samples
clean_rs_df = rs_df[rs_df["Class"] > 0]
clean_rs_df.reset_index(drop=True, inplace=True)

#region of interest file (sub-path left blank here, fill in before running)
roi_directory = os.path.join(base_directory, "")
region_of_interest = gpd.read_file(roi_directory)
multipolygon = region_of_interest.unary_union #for plotting, working with just a single multipolygon is simpler

#raster opened later on with rasterio to draw the land cover map
s2cc_sklearn_preds_file = os.path.join(base_directory, "") #first predictions land cover map we produced

#the second size is the one of the clean dataset (same Class > 0 filter as above)
print("Total dataset size: ", rs_df.shape)
print("Class>0 dataset size: ", rs_df.loc[rs_df["Class"]>0].shape)

In [None]:
#visualization parameters
#colors maps each class label (1-8) to its scatter color; class_names[label-1] is the matching legend entry
colors = {1: '#006600', 2: '#99ff33', 3: "#2d8659", 4: '#c6538c', 5: '#808000', 6: '#804000', 7: '#0000ff', 8:'orange'}
palette = ['#006600', '#99ff33', '#2d8659', '#c6538c', '#808000', '#804000', '#0000ff', 'orange']
class_names = ['Closed Forest', 'Open Forest', 'Mangrove', 'Savanna', 'Cashew', 'Non-Forest', 'Water', 'Clearly non-cashew']
#NOTE(review): palette and vis_params are redefined here but not used by the scatter plots of this cell
vis_params = {
    "cmap": ListedColormap(palette),
    "vmin": 1,
    "vmax": 7,
    "alpha": 0.9,
}

#one figure with two side-by-side panels
plt.figure(figsize=(18,6))

#plot every Random Sampling sample, even those with Class -1 (that could not be attributed a land cover class)
plt.subplot(1,2,1)
plot_multipolygon_boundaries(multipolygon,color="black",linewidth=0.6)
#one scatter call per class so that each class gets its own color and legend entry
for al_class, color in colors.items():
    subset = rs_df[rs_df['Class'] == al_class]
    plt.scatter(subset['x'], subset['y'], s=10, color=color, label=class_names[al_class-1])
#samples without an attributed class (negative label) are drawn in black
subset = rs_df[rs_df['Class'] < 0]
plt.scatter(subset['x'], subset['y'], s=10, color="black", label="Not classified")
#NOTE(review): titles say "AL samples" although the data plotted is the Random Sampling set — confirm wording
plt.title('AL samples - Classes')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()

#plot the CLEAN Random Sampling dataset, without the Class -1 samples
plt.subplot(1,2,2)
plot_multipolygon_boundaries(multipolygon,color="black",linewidth=0.6)
for al_class, color in colors.items():
    subset = clean_rs_df[clean_rs_df['Class'] == al_class]
    plt.scatter(subset['x'], subset['y'], s=10, color=color, label=class_names[al_class-1])
plt.title('CLEAN AL samples - Classes')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
#similar to previous notebook cell, but instead of plotting every class, just plots points according to whether they were attributed a class (Class > 0) or not (Class = -1)
#NOTE(review): the filters below use Class >= 0 while clean_rs_df uses Class > 0 — equivalent only if no sample is labeled 0

plt.figure(figsize=(18,6))

#left panel: classified samples in green, not classified samples in red
plt.subplot(1,2,1)
plot_multipolygon_boundaries(multipolygon,color="black",linewidth=0.6)
subset = rs_df[rs_df['Class'] >= 0]
plt.scatter(subset['x'], subset['y'], s=10, color="green", label="Classified")
subset = rs_df[rs_df['Class'] < 0]
plt.scatter(subset['x'], subset['y'], s=10, color="red", label="Not classified")
plt.title('AL samples - Classified vs. not classified')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()


#right panel: classified samples only
plt.subplot(1,2,2)
plot_multipolygon_boundaries(multipolygon,color="black",linewidth=0.6)
subset = rs_df[rs_df['Class'] >= 0]
plt.scatter(subset['x'], subset['y'], s=10, color="green", label="Classified")
plt.title('AL samples - Just Classified')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
#plot the land cover map with overlay of the Random Sampling dataset

#read the first band and the raster extent while the dataset is still open
with rasterio.open(s2cc_sklearn_preds_file) as src:
    image = src.read(1)
    left, bottom, right, top = src.bounds

masked_image = np.ma.masked_outside(image, 1, 7) #set values outside the range [1, 7] to be transparent

#plot land cover map

palette = ['#006600', '#99ff33', '#2d8659', '#c6538c', '#808000', '#804000', '#0000ff']
class_names = ['Closed Forest', 'Open Forest', 'Mangrove', 'Savanna', 'Cashew', 'Non-Forest', 'Water']
vis_params = {
    "cmap": ListedColormap(palette),
    "vmin": 1,
    "vmax": 7,
    "alpha": 0.5,
}

plt.figure(figsize=(15,12), facecolor='white') #white background color
#extent places the image in map coordinates so that the sample points drawn below line up with it
img = plt.imshow(masked_image, **vis_params, extent=(left, right, bottom, top))

#one colorbar tick per land cover class (1 to 7), labeled with the class names
cbar = plt.colorbar(img, ticks=np.arange(1, 8), label="Class", orientation="vertical", shrink=0.4)
cbar.set_ticklabels(class_names)
plt.title("Land cover map 2021")
plt.axis("off")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

#plot boundaries and random sampling points

plot_multipolygon_boundaries(multipolygon,color="black",linewidth=0.6)

#class 8 only exists in the sampled points, not in the land cover map, hence the extended color/name lists
colors = {1: '#006600', 2: '#99ff33', 3: "#2d8659", 4: '#c6538c', 5: '#808000', 6: '#804000', 7: '#0000ff', 8: 'orange'}
class_names = ['Closed Forest', 'Open Forest', 'Mangrove', 'Savanna', 'Cashew', 'Non-Forest', 'Water', 'Clearly not cashew']
for al_class, color in colors.items():
    subset = rs_df[rs_df['Class'] == al_class]
    plt.scatter(subset['x'], subset['y'], s=20, color=color, label=class_names[al_class-1], linewidths=0.5, edgecolors="black")

plt.legend()
plt.show()

___

# Statistics of RS labeled set

[Back to top](#Top)

* __PolygonID sizes (nº of samples per polygon)__

24 of the 130 patches are not fully annotated (nº pixels < 9).

In [None]:
#number of clean (Class > 0) pixels of every patch, one entry per polygonID
group_counts = clean_rs_df["polygonID"].value_counts()
#how many patches share each pixel count (index: pixels per patch, value: number of patches)
unique_counts = group_counts.value_counts()

plt.bar(unique_counts.index, unique_counts.values)
plt.title("Barplot of nº pixels per patch")
plt.xlabel("Nº pixels per patch")
plt.ylabel("Patch count")
plt.show()

* __Pixel counts per class__

In [None]:
#one figure with two side-by-side bar plots
plt.figure(figsize=(15, 5))

#bar color of every class label, including -1 (not classified) in black; a label missing from this dict raises a KeyError below
colors = {-1: "black", 1: '#006600', 2: '#99ff33', 3: "#2d8659", 4: '#c6538c', 5: '#808000', 6: '#804000', 7: '#0000ff', 8:'orange'}

#counts for entire RS dataset (with Class -1 points)
plt.subplot(1,2,1)

#number of samples per class, ordered by class label
class_counts = rs_df['Class'].value_counts()
class_counts = class_counts.sort_index()
class_counts.plot(kind='bar', color=[colors[cls] for cls in class_counts.index])

plt.title('Class counts - entire RS dataset')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)

#counts for CLEAN RS dataset (without Class -1 points)
plt.subplot(1,2,2)

class_counts = clean_rs_df['Class'].value_counts()
class_counts = class_counts.sort_index()
class_counts.plot(kind='bar', color=[colors[cls] for cls in class_counts.index])

plt.title('Class counts - CLEAN RS dataset')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

____

# Bottom

[Back to Top.](#Top)