In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Step 2: Load the chocolate-ratings CSV and tidy its header row
file_path = r"C:\Users\marca\Downloads\flavors_of_cacao.csv"
df = pd.read_csv(file_path)

# Trim leading/trailing whitespace from every column name so that later
# lookups by name are not tripped up by padding in the header.
# NOTE(review): only the ends of each name are trimmed -- whitespace or line
# breaks *inside* a header are left as they are; compare the names printed
# below with the column names used in the following steps.
df.columns = df.columns.map(str.strip)

# Show the resulting column names as a sanity check
print(df.columns)

# Step 3: Remove rows with missing values
# dropna() discards every row that has a missing value in any column.
# The explicit .copy() makes df_cleaned an independent DataFrame, so the
# column assignments performed on it in later steps modify it directly and
# do not trigger pandas' SettingWithCopyWarning.
df_cleaned = df.dropna().copy()

# Step 4: Basic facts about the dataset

# Row count of the cleaned data (rows with missing values already removed)
num_tuples = len(df_cleaned)
print(f"Number of tuples in dataset: {num_tuples}")

# Distinct company names; the column name must match what print(df.columns) showed
unique_companies = df_cleaned["Company"].nunique()
print(f"Number of unique company names: {unique_companies}")

# Reviews dated 2013: count the rows where the comparison is True
reviews_2013 = int((df_cleaned["Review Date"] == 2013).sum())
print(f"Number of reviews in 2013: {reviews_2013}")

# Missing Bean Type entries are counted on the raw frame `df`, because
# df_cleaned has had every row with a missing value removed.
# NOTE(review): this counts NaN/None only -- blank or whitespace-only
# placeholders in the CSV would not be counted; confirm against the data.
missing_bean_type = df["Bean Type"].isnull().sum()
print(f"Number of missing values in BeanType column: {missing_bean_type}")

# Step 5: Histogram of the Rating column
# Create the figure and its axes explicitly, then draw and label on that axes.
fig, ax = plt.subplots(figsize=(8, 5))

# 10 bins with a kernel-density curve drawn over the bars
sns.histplot(data=df_cleaned, x="Rating", bins=10, kde=True, ax=ax)

ax.set_title("Distribution of Chocolate Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()

# Step 6: Convert and visualize the Cocoa Percent column

# The column is expected to hold strings containing a '%' sign; remove the
# sign and convert what is left to float.
# - regex=False treats '%' as a literal character, independent of the pandas
#   version's default for `regex`.
# - The dtype guard makes this step safe to re-run (e.g. re-executing the
#   cell): once the column is numeric, the .str accessor would raise.
cocoa_percent = df_cleaned["Cocoa Percent"]
if not pd.api.types.is_numeric_dtype(cocoa_percent):
    cocoa_percent = cocoa_percent.str.replace("%", "", regex=False).astype(float)
df_cleaned["Cocoa Percent"] = cocoa_percent

# Scatter plot of Cocoa Percent vs Rating.
# The low alpha makes overlapping points show up as darker regions.
plt.figure(figsize=(8, 5))
sns.scatterplot(data=df_cleaned, x="Cocoa Percent", y="Rating", alpha=0.1)
plt.title("Cocoa Percentage vs Rating")
plt.xlabel("Cocoa Percent")
plt.ylabel("Rating")
plt.show()

# Step 7: Min-max scale the Rating column into a new column
# The scaler takes 2-D input, hence the double brackets that select a
# one-column DataFrame rather than a Series.
scaler = MinMaxScaler()
scaler.fit(df_cleaned[["Rating"]])

# Apply the fitted scaling and store the result next to the original ratings
df_cleaned["Normalized Rating"] = scaler.transform(df_cleaned[["Rating"]])

# Show the original and scaled values side by side for the first rows
print(df_cleaned.loc[:, ["Rating", "Normalized Rating"]].head())

# Step 8: Encode categorical columns (Company and Company Location)
# Each column gets its own LabelEncoder. Refitting one shared encoder on the
# second column would overwrite the label mapping learned for the first, so
# the Company codes could no longer be decoded with inverse_transform.
company_encoder = LabelEncoder()
location_encoder = LabelEncoder()

df_cleaned["Company_Encoded"] = company_encoder.fit_transform(df_cleaned["Company"])
df_cleaned["Location_Encoded"] = location_encoder.fit_transform(
    df_cleaned["Company Location"]
)

# Backward-compatible alias: `encoder` previously ended up fitted on
# "Company Location", so keep that name pointing at the location encoder.
encoder = location_encoder

# Print first few rows to verify encoding
print(df_cleaned[["Company", "Company_Encoded", "Company Location", "Location_Encoded"]].head())

