In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Read the mall-customer records into a DataFrame.
df = pd.read_csv('Mall_Customers.csv')

# Every statistic below comes from this one column, so select it once.
scores = df['Spending Score (1-100)']

# a) Box plot of the spending score, one box per Gender value.
sns.boxplot(data=df, x='Gender', y='Spending Score (1-100)')
plt.show()

# b) Highest, mean and lowest spending score.
highest_score = scores.max()
average_score = scores.mean()
lowest_score = scores.min()

print(f'Highest Score: {highest_score}, Average Score: {average_score}, Lowest Score: {lowest_score}')

# c) Number of rows strictly above and strictly below the mean score.
# Both comparisons are strict, so a row exactly equal to the mean is counted
# in neither total.
above_average = int((scores > average_score).sum())
below_average = int((scores < average_score).sum())

print(f'Entries with score above average: {above_average}, Entries with score below average: {below_average}')

# d) Encode the Gender column numerically: Female -> 1, Male -> 2.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the df['Gender'] selection: that chained
# in-place form raises a FutureWarning in recent pandas and leaves df
# unchanged once Copy-on-Write is enabled.
gender_codes = {'Female': 1, 'Male': 2}
df['Gender'] = df['Gender'].replace(gender_codes)

# e) Find the number of clusters using the elbow method.
# Features are taken by position (columns 3 and 4), intended to be
# 'Annual Income' and 'Spending Score'.
X = df.iloc[:, [3, 4]].values
k_values = range(1, 11)  # candidate cluster counts: 1..10

# Within-Cluster-Sum-of-Squares (inertia) for each candidate k.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 vs 'auto'); pinning it keeps the curve the same across
# versions and avoids the FutureWarning older releases emit.
wcss = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X).inertia_
    for k in k_values
]

# The "elbow" is the k after which WCSS stops dropping sharply.
plt.plot(k_values, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# f) Identify groups of similar customers with k-means on the features in X.
# The cluster count is read off the elbow plot; 5 is assumed here, one plot
# colour per cluster.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']

# n_init is set explicitly because its default differs between scikit-learn
# releases (10 vs 'auto'); pinning it keeps the clustering the same across
# versions.
kmeans = KMeans(n_clusters=len(cluster_colors), init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

# Visualizing the clusters: one scatter series per cluster label, then the
# centroids drawn larger on top.
for cluster_id, color in enumerate(cluster_colors):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=f'Cluster {cluster_id + 1}')

centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

# Read the fish measurements into a DataFrame.
df = pd.read_csv('Fish.csv')

# A. Count the rows belonging to each species and print the tally.
species_counts = df['Species'].value_counts()
print(species_counts)

# B. Show the same tally as a bar chart, one bar per species.
ax = species_counts.plot.bar()
ax.set_xlabel('Species')
ax.set_ylabel('Count')
ax.set_title('Number of fishes in each species')
plt.show()

# C. Replace the Species labels with the integer codes LabelEncoder assigns.
le = LabelEncoder()
le.fit(df['Species'])
df['Species'] = le.transform(df['Species'])

# D. Hold out 25% of the rows for testing (75:25 split).
# 'Weight' is the target; every remaining column is used as a feature.
y = df['Weight']
X = df.drop(columns='Weight')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# E. Predict the weights of the fish in the test split.
# The model used is RandomForestRegressor, i.e. a regressor rather than a
# classifier. random_state is fixed (matching the seeded train/test split)
# so the printed predictions are the same on every run.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Print the predicted weights
print(y_pred)
