# 1. Import the dataset

In [None]:
import pandas as pd

# Load the raw crop dataset from disk into a DataFrame.
csv_path = "../Data/Clean/Manikanta_CR.csv"
csv_df = pd.read_csv(csv_path)

# Last expression of the cell: the notebook renders the first 10 rows.
csv_df.head(n=10)

# 2. Number of rows, columns, classes/labels

In [None]:
# Dataset overview: row/column counts, column names and the distinct crops.
# Missing labels are dropped before counting and sorting: NaN is not a crop,
# and sorted() raises TypeError when NaN (a float) is compared with str labels.
crop_labels = csv_df["label"].dropna().unique()

print("Total rows:", len(csv_df))
print("Total columns:", len(csv_df.columns))
print("Columns:", csv_df.columns.values)
print("Total crops:", len(crop_labels))
print("Crops:", sorted(crop_labels))

# 3. Has duplicate rows? How many?

In [None]:
# Boolean mask flagging every row that repeats an earlier row (first kept).
duplicate_mask = csv_df.duplicated()

print("Has duplicates rows:", duplicate_mask.any())
# Each flagged row is exactly one row drop_duplicates() would remove.
print("Total duplicates rows:", duplicate_mask.sum())

# 4. Has rows with null values? How many?

In [None]:
# Cell-wise mask of missing values across the whole raw dataset.
null_mask = csv_df.isna()

print("Has null values:", null_mask.to_numpy().any())
# A row counts once, no matter how many of its cells are missing.
print("Total rows with null values:", null_mask.any(axis=1).sum())

# 5. Clean up the dataset for further analysis
Additionally, show the row count of the cleaned dataset

In [None]:
# Build the cleaned dataset: remove rows with missing values first, then
# exact duplicate rows. Both calls return new frames, so csv_df is untouched.
clean_df = csv_df.dropna().drop_duplicates()

print(clean_df)
print("Cleaned dataset row count:", clean_df.shape[0])

# 6. Column value types

In [None]:
# Show the pandas dtype of every column in the cleaned dataset.
column_dtypes = clean_df.dtypes
print(column_dtypes)

# 7. Statistics of each column of the dataset

In [None]:
# Summary statistics produced by DataFrame.describe() for the cleaned data.
stats_table = clean_df.describe()

# Last expression of the cell: rendered by the notebook, rounded to 4 decimals.
# round() returns a new frame, so stats_table keeps full precision.
stats_table.round(decimals=4)

# 8. Dataset column instance distribution

In [None]:
import matplotlib.pyplot as plt

# One 50-bin histogram per column that DataFrame.hist() plots.
features_histograms = clean_df.hist(
    bins=50,
    figsize=(16, 10),
    color="orange",
    edgecolor="black",
)

# hist() returns a grid of axes; enlarge the title of every subplot.
for subplot in features_histograms.ravel():
    subplot.title.set_fontsize(14)

plt.show()

# 9. Dataset crops distribution
Includes before and after cleanup

In [None]:
# Rows per crop, ordered by crop name, before and after the cleanup step.
original_label_count = csv_df["label"].value_counts().sort_index()
cleaned_label_count = clean_df["label"].value_counts().sort_index()

# Two bar charts side by side sharing the y axis so bar heights are comparable.
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(16, 4))
fig.suptitle("Crops instance distribution", fontsize=18, fontweight="bold")

panels = (
    (ax1, original_label_count, "Original distribution"),
    (ax2, cleaned_label_count, "After cleanup distribution"),
)
for panel_ax, label_count, panel_title in panels:
    panel_ax.bar(label_count.index, height=label_count.values, edgecolor="black", color="orange")
    panel_ax.set_title(panel_title, fontsize=14)
    # Crop names are drawn vertically on the x axis.
    panel_ax.tick_params(axis="x", rotation=90, labelsize=12)

plt.show()

## 9.1 Side-by-side crop instance counts

In [None]:
# Side-by-side table of per-crop row counts before and after cleanup.
# Name the column explicitly: the Series returned by value_counts() is named
# after the source column before pandas 2.0 and "count" from 2.0 on, so
# renaming a "count" column silently does nothing on older releases.
distribution_df = original_label_count.to_frame(name="original_count")

# Align on the original crop index. A crop with no rows left after cleanup
# gets an explicit 0 instead of NaN, which also keeps the column integer-typed.
distribution_df["cleaned_count"] = cleaned_label_count.reindex(
    distribution_df.index, fill_value=0
)
print(distribution_df)

# 10. Scatter plots

In [None]:
import seaborn as sns

# One color per crop, shared by the scatter plots below.
# "tab10" defines only 10 colors and seaborn cycles a named palette when more
# are requested, which would give several crops the same color. Keep "tab10"
# for up to 10 crops and switch to the evenly spaced "husl" palette beyond that.
crops_total = len(clean_df["label"].unique())
crops_palette = sns.color_palette("tab10" if crops_total <= 10 else "husl", crops_total)

In [None]:
# Pairwise scatter-plot grid of the cleaned data, colored by crop.
crops_pairplot = sns.pairplot(clean_df, hue="label", palette=crops_palette)

# Enlarge every entry of the figure legend.
for legend_text in crops_pairplot.legend.get_texts():
    legend_text.set_fontsize("12")

# Re-apply each subplot's existing axis labels with a larger font.
for grid_ax in crops_pairplot.axes.ravel():
    grid_ax.set_xlabel(grid_ax.get_xlabel(), fontsize=14)
    grid_ax.set_ylabel(grid_ax.get_ylabel(), fontsize=14)

plt.show()

## 10.1. Single scatter plot

In [None]:
# Scatter plot of a single feature pair, colored by crop.
# Change these two column names to inspect another pair.
x_feature = "K"
y_feature = "B"

# Pass the frame as data=: before seaborn 0.12 the first positional parameter
# of scatterplot() is `x`, not `data`, so a positional DataFrame is misread.
sp = sns.scatterplot(data=clean_df, x=x_feature, y=y_feature, hue="label", palette=crops_palette)
# Anchor the legend's upper-left corner at the axes' upper-right corner so it
# sits outside the plotting area.
sp.legend(bbox_to_anchor=(1, 1), loc="upper left", fontsize="12")
sp.set_xlabel(x_feature, fontsize="12")
sp.set_ylabel(y_feature, fontsize="12")

plt.show()