# Exercises

## A. Basics

### Exercise 1

Load the Iris data into a `DataFrame` and print the shape of the data, the type of the data, and the first 3 rows.

In [None]:
import pandas as pd

# Read the raw Iris CSV; header=None because column names are supplied
# explicitly through `columns`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
iris = pd.read_csv(url, names=columns, header=None)

# Report the shape, the Python type, and the first 3 rows, one labelled
# line (or table) per item.
for label, value in (
    ("Shape of the data:", iris.shape),
    ("Type of the data:", type(iris)),
    ("First 3 rows of the data:\n", iris.head(3)),
):
    print(label, value)


### Exercise 2

Print the keys of the Iris data.

In [None]:
# Fetch the keys of the Iris data, then display them.
iris_keys = iris.keys()
print(iris_keys)


### Exercise 3

Get the number of observations and missing values of the Iris data.

In [None]:
# info() writes its own report (entry count, per-column non-null counts,
# dtypes) and returns None, so wrapping it in print() would emit a stray
# "None" line after the report.
iris.info()

# Explicit count of missing values in each column.
print("Missing values per column:\n", iris.isnull().sum())


### Exercise 4

View basic statistical details like percentiles, mean, standard deviation, etc., of the Iris data.

In [None]:
# Build the summary-statistics table first, then display it.
summary_stats = iris.describe()
print(summary_stats)


### Exercise 5

Get observations of each species (setosa, versicolor, virginica) from the Iris data.

In [None]:
# Split the observations by species. The frame was loaded with named
# columns, so the label column is 'class'; the integer key 4 is not a
# column label and raises KeyError.
setosa = iris[iris['class'] == 'Iris-setosa']
versicolor = iris[iris['class'] == 'Iris-versicolor']
virginica = iris[iris['class'] == 'Iris-virginica']

# Show each subset in turn.
print(setosa)
print(versicolor)
print(virginica)


### Exercise 6

Access the first four columns for the first observation from the Iris data.

In [None]:
# Positional lookup: row 0, columns 0 through 3.
first_four_values = iris.iloc[0, 0:4]
print(first_four_values)


## B. Visualization

### Exercise 1

Create a bar plot to get the frequency of the 3 species of the Iris data.

In [None]:
import matplotlib.pyplot as plt

# Count the observations per species. The label column is named 'class';
# the integer key 4 is not a column label and raises KeyError.
species_count = iris['class'].value_counts()

# One bar per species, with the bar height equal to its frequency.
plt.bar(species_count.index, species_count.values)
plt.xlabel('Species')
plt.ylabel('Count')
plt.show()


### Exercise 2

Create a graph to find the relationship between the sepal length and sepal width of each species.

In [None]:
# Subset the data by species. Columns are addressed by name because the
# frame was loaded with named columns; integer keys such as 4, 0 or 1
# raise KeyError.
setosa = iris[iris['class'] == 'Iris-setosa']
versicolor = iris[iris['class'] == 'Iris-versicolor']
virginica = iris[iris['class'] == 'Iris-virginica']

# Sepal length (x) against sepal width (y), one scatter series per species.
plt.scatter(setosa['sepal_length'], setosa['sepal_width'], label='setosa')
plt.scatter(versicolor['sepal_length'], versicolor['sepal_width'], label='versicolor')
plt.scatter(virginica['sepal_length'], virginica['sepal_width'], label='virginica')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.legend()
plt.show()


### Exercise 3

Create a graph to find the relationship between the petal length and petal width of each species.

In [None]:
# Subset the data by species. Columns are addressed by name because the
# frame was loaded with named columns; integer keys such as 4, 2 or 3
# raise KeyError.
setosa = iris[iris['class'] == 'Iris-setosa']
versicolor = iris[iris['class'] == 'Iris-versicolor']
virginica = iris[iris['class'] == 'Iris-virginica']

# Petal length (x) against petal width (y), one scatter series per species.
plt.scatter(setosa['petal_length'], setosa['petal_width'], label='setosa')
plt.scatter(versicolor['petal_length'], versicolor['petal_width'], label='versicolor')
plt.scatter(virginica['petal_length'], virginica['petal_width'], label='virginica')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.legend()
plt.show()


### Exercise 4

Create 4 graphs to visualize the distribution of the sepal length, sepal width, petal length, and petal width.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw Iris CSV, supplying the column names explicitly
iris_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
                      names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
                      header=None)

# 2x2 grid of axes, one per feature
fig, axs = plt.subplots(2, 2, figsize=(10, 8))

# Feature column -> subplot title, listed in the row-major order in which
# axs.flat walks the grid: [0, 0], [0, 1], [1, 0], [1, 1].
histogram_titles = {
    'sepal_length': 'Sepal Length Distribution',
    'sepal_width': 'Sepal Width Distribution',
    'petal_length': 'Petal Length Distribution',
    'petal_width': 'Petal Width Distribution',
}

# Draw one 20-bin histogram per feature on its own axes
for ax, (feature, title) in zip(axs.flat, histogram_titles.items()):
    ax.hist(iris_df[feature], bins=20, edgecolor='black')
    ax.set_title(title)

plt.show()


### Exercise 5

Create a jointplot to describe individual distributions on the same plot between sepal length and sepal width.

In [None]:
import seaborn as sns
import pandas as pd

# Rebind `iris` to the copy of the dataset provided by seaborn's loader
iris = sns.load_dataset("iris")

# Joint view of sepal length against sepal width, using jointplot's default kind
sepal_axes = {"x": "sepal_length", "y": "sepal_width"}
sns.jointplot(data=iris, **sepal_axes)

# Render the figure
plt.show()


### Exercise 6

Create a jointplot using hexagonal bins to describe individual distributions on the same plot between sepal length and sepal width.

In [None]:
import seaborn as sns
import pandas as pd

# Load the iris data through seaborn
iris = sns.load_dataset('iris')

# Sepal length against sepal width, with the joint area drawn as hexagonal bins
sns.jointplot(
    data=iris,
    x='sepal_length',
    y='sepal_width',
    kind='hex',
)

# Render the figure
plt.show()


### Exercise 7

Create a jointplot using kernel density estimation to describe individual distributions between sepal length and sepal width.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris data through seaborn
iris = sns.load_dataset('iris')

# Sepal length against sepal width, drawn with kernel density estimation
kde_options = dict(x='sepal_length', y='sepal_width', kind='kde')
sns.jointplot(data=iris, **kde_options)

# Render the figure
plt.show()


### Exercise 8

Create a jointplot and add regression and KDE to describe individual distributions on the same plot between sepal length and sepal width.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris data through seaborn
iris = sns.load_dataset('iris')

# Styling handed to the joint axes via joint_kws: a red fit line and
# translucent scatter points.
regression_style = {
    'line_kws': {'color': 'red'},
    'scatter_kws': {'alpha': 0.3},
}

# Regression-style jointplot of sepal length against sepal width
sns.jointplot(
    data=iris,
    x='sepal_length',
    y='sepal_width',
    kind='reg',
    color='blue',
    joint_kws=regression_style,
)

# Render the figure
plt.show()


### Exercise 9

Create a pairplot of the Iris data based on species.

In [None]:
import seaborn as sns

# Load the iris data through seaborn
iris = sns.load_dataset('iris')

# Pairwise plots, coloured by the 'species' column
pairplot_options = {'data': iris, 'hue': 'species'}
sns.pairplot(**pairplot_options)

# Render the figure
plt.show()


### Exercise 10

Find the correlation between the independent variables of the Iris data and create a heatmap to present their relationships.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = sns.load_dataset('iris')

# Correlate only the numeric feature columns. The frame also carries the
# string-valued 'species' column, and calling corr() on the whole frame
# raises ValueError on pandas >= 2.0, where numeric_only defaults to False.
corr = iris.select_dtypes(include='number').corr()

# Heatmap of the correlation matrix, with each cell annotated by its value
sns.heatmap(corr, cmap='coolwarm', annot=True)

# Show the plot
plt.show()


### Exercise 11

Conduct a principal component analysis (PCA) of the Iris dataset to reduce the number of dimensions from 4 to 2. Remember to create a features matrix with the independent variables first. Plot the results.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Read the raw Iris CSV, supplying the column names explicitly
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
                   names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
                   header=None)

# Features matrix: every column except the 'class' label
X = iris.drop(columns='class')

# Project the features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# One fixed colour per class label
class_colors = {'Iris-setosa': 'red', 'Iris-versicolor': 'blue', 'Iris-virginica': 'green'}
point_colors = iris['class'].map(class_colors)

# Scatter the first component (x) against the second (y)
first_component, second_component = X_pca[:, 0], X_pca[:, 1]
plt.scatter(first_component, second_component, c=point_colors)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


## C. Machine Learning Algorithms

In this section, utilize at least 3 machine learning algorithms (either discussed in class or from research you've conducted on your own) to create some models for this data.

Some options include:

- Gaussian mixture model (clustering)
- K-nearest neighbors (classification)
- Gaussian naive Bayes (classification)
- Logistic regression (classification)

Split the Iris dataset into a training set and a testing set. Choose a class of model, instantiate the model instance with hyperparameters, fit the model, and produce some output (e.g., clusters, confusion matrices, etc.).