In [1]:
# Import dependencies
import pandas as pd

In [3]:
# Load the iris CSV (path is relative to the notebook's working directory)
# into a DataFrame and preview its first five rows.
file_path = "./iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [9]:
# Build a copy of the iris data without the text label column "class";
# iris_df itself is left unchanged.
new_iris_df = iris_df.drop(columns=["class"])
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Unsupervised ML models work with numerical data only, and the "class" column contains text labels, so this column can be dropped.

In [13]:
# Reorder columns so that the two lengths come first, followed by the two widths
new_iris_df = new_iris_df.loc[
    :, ["sepal_length", "petal_length", "sepal_width", "petal_width"]
]
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [14]:
# Save the reordered iris features to a new CSV file.
# index=False keeps the DataFrame's row index out of the written file.
output_file_path = "./new_iris_data.csv"
new_iris_df.to_csv(output_file_path, index=False)

After data cleaning and transformations, the preprocessed dataset is saved.

In [16]:
# Load the shopping CSV into a DataFrame and preview its first five rows.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
file_path = "./shopping_data.csv"
shopping_df = pd.read_csv(file_path, encoding="ISO-8859-1")
shopping_df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [17]:
# What data is available? List the column labels of the shopping DataFrame.
shopping_df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [18]:
# What type of data is available? Show the dtype pandas assigned to each column.
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

The "Card Member" column has the `object` dtype (it holds text), which cannot be used directly in unsupervised ML, so it will need to be converted to numbers.

In [20]:
# What data is missing? Count the null entries in each column.
shopping_df.isnull().sum()

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64

"Card Member", "Age", and "Spending Score (1-100)" have null values (2, 2, and 1 respectively).

In [26]:
# Report the number of null values per column, one printed line per column.
# The counts are computed once for the whole frame and then iterated.
for column, null_count in shopping_df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values.")

Column CustomerID has 0 null values.
Column Card Member has 2 null values.
Column Age has 2 null values.
Column Annual Income has 0 null values.
Column Spending Score (1-100) has 1 null values.


In [29]:
# What data can be removed? Drop every row that has at least one null value,
# then preview the remaining rows.
shopping_df = shopping_df.dropna(axis="index", how="any")
shopping_df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [30]:
# Re-count null values per column to confirm none remain after the rows
# containing nulls were dropped.
for column, null_count in shopping_df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values.")

Column CustomerID has 0 null values.
Column Card Member has 0 null values.
Column Age has 0 null values.
Column Annual Income has 0 null values.
Column Spending Score (1-100) has 0 null values.


In [31]:
# Find duplicate entries: duplicated() flags each row that exactly repeats an
# earlier row, and sum() counts those flags.
print(f"Duplicate entries: {shopping_df.duplicated().sum()}")

Duplicate entries: 0


In [32]:
# Remove "CustomerID" column.
# The result is reassigned instead of dropping in place, and errors="ignore"
# is passed, so the cell can be re-run safely: once the column is gone a
# second run is a no-op rather than a KeyError.
shopping_df = shopping_df.drop(columns=["CustomerID"], errors="ignore")
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


The CustomerID does not offer any insight into customer shopping habits.

For data processing, the focus is on making sure the data is set up for the unsupervised learning model, which requires the following:
* Null values are handled.
* Only numerical data is used.
* Values are scaled. In other words, data has been manipulated to ensure that the variance between the numbers won’t skew results.

In [34]:
# Transform the string column
def change_string(member):
    """Encode a "Card Member" value as an integer flag.

    Parameters
    ----------
    member : str or number
        A raw value such as "Yes" / "No", or a value that has already been
        encoded as 1 / 0.

    Returns
    -------
    int
        1 for "Yes" (ignoring letter case and surrounding whitespace) or for
        an already-encoded 1; 0 for every other value.
    """
    if isinstance(member, str):
        # Normalise so that variants like "yes" or " Yes " are not silently
        # treated as non-members.
        return 1 if member.strip().lower() == "yes" else 0
    # Already-encoded values pass through unchanged, so applying this function
    # a second time (e.g. when the cell is re-run) does not reset every flag
    # to 0.
    return 1 if member == 1 else 0
    
# Replace each "Card Member" value with the integer returned by change_string,
# then preview the result.
shopping_df["Card Member"] = shopping_df["Card Member"].map(change_string)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [35]:
# Transform the annual income column: divide every value by 1,000.
# NOTE: this overwrites the column it reads from, so running the cell a second
# time divides the values by 1,000 again.
shopping_df["Annual Income"] = shopping_df["Annual Income"].div(1000)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


The values in Annual Income are on a much larger scale than the other columns in the dataset, so they were divided by 1,000 to bring them closer to the range of the other values.

In [36]:
# Rename the columns so the names contain no spaces, digits, or parentheses
shopping_df = shopping_df.rename(
    {
        "Card Member": "Card_Member",
        "Annual Income": "Annual_Income",
        "Spending Score (1-100)": "Spending_Score",
    },
    axis="columns",
)
shopping_df.head()

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [37]:
# Save the cleaned shopping dataset to a new CSV file.
# index=False keeps the DataFrame's row index out of the written file.
output_file_path = "./clean_shopping_data.csv"
shopping_df.to_csv(output_file_path, index=False)