<a href="https://colab.research.google.com/github/pavansai26/perfume-classification-using-machine-learning/blob/main/Perfume_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PERFUME CLASSIFICATION**

# **problem statement**

## classification of perfumes based upon the observation

# **Data Set Information:**

# The data set was gathered while we were working on a project for Bahrain University between 2002 and 2003.

# This data consists of odors of 20 different perfumes. Data was obtained with a handheld odor meter (OMX-GR sensor), once per second over a 28-second period.

# **Attribute Information**

# Names of these perfumes are: ajayeb, ajmal, amreaj, aood, asgar_ali, bukhoor, burberry, dehenalaod, junaid, kausar, rose, solidmusk, TeaTreeOil, raspberry, RoseMusk, strawberry, constrected2, carolina_herrera, oudh_ma'alattar, constrected1.

# **Classification Task**

# This notebook demonstrates how a machine learning model can determine the type of perfume based on the odor meter reading.

# **importing the necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# **ignoring the warnings**

In [None]:
import warnings

# Register an 'ignore' filter for each warning family, in the order listed.
for _category in (DeprecationWarning, UserWarning, RuntimeWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_category)

# **setting the maximum number of rows and columns to display**

In [None]:
# Raise pandas' display limits: show up to 100000 rows and 1000 columns.
for _option, _limit in (('display.max_rows', 100000), ('display.max_columns', 1000)):
    pd.set_option(_option, _limit)

# **accessing the drive**

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at the path used by the data cell.
mount_point = '/gdrive'
drive.mount(mount_point)

# **getting the data**

In [None]:
# Read the spreadsheet from the mounted Drive. header=None tells pandas not to
# treat the first row as column names, so columns get integer labels (0, 1, ...).
excel_path = '/gdrive/My Drive/Colab Notebooks/perfume_data.xlsx'
data = pd.read_excel(excel_path, header=None)

# **printing only the top 5 rows of the data**

In [None]:
# Preview the first five rows of the raw table.
data.head(5)

# **finding the total no.of observations**

In [None]:
# Number of rows in the raw table.
data.shape[0]

# total 20 observations are present in this dataset

# **finding the total no.of features**

In [None]:
# Number of columns in the raw table.
data.shape[1]

# total 29 features are present in this dataset

In [None]:
# (rows, columns) of the raw table in one tuple.
data.shape

# **Renaming the '0' column to 'perfume'**

In [None]:
# Copy the column labelled 0 into a new trailing 'perfume' column on `data`,
# then build `data1` as the same table without the original column 0.
data['perfume'] = data.loc[:, 0]
data1 = data.drop(columns=[0])

In [None]:
# Display the reshaped table: column 0 removed, 'perfume' as the last column.
data1

In [None]:
# Take the last column of `data` (the 'perfume' copy added earlier) as a plain list.
perfumes = data.iloc[:, -1].tolist()

# **names of perfumes presented in this dataset**

In [None]:
# Display the list of perfume names, one entry per row of `data`.
perfumes

# **DATA CLEANING**

# **CHECKING FOR NULL VALUES**

In [None]:
# Count missing values per column (isna is the same check as isnull).
data1.isna().sum()

# there are no null values in the data

# **information about the data**

In [None]:
# Print column dtypes, non-null counts and memory usage for `data1`.
data1.info()

# **Each row in the data frame consists of 28 odor meter measurement values and the corresponding name of the perfume.**

# **This has to be converted into a long format in which each observation is a row and each attribute is a column.**

# For this a new Dataframe is created and the values are appended correspondingly

In [None]:
# Start from an empty frame that only declares the two target columns.
long_columns = ['Observation', 'Perfume']
df = pd.DataFrame([], columns=long_columns)


# printing the df data frame

In [None]:
# Display the frame; at this point it has the two columns and no rows yet.
df

# **collecting every observation together with the name of the perfume it belongs to**

In [None]:
# Flatten the wide table (one row per perfume, one column per reading) into two
# parallel arrays: one reading per element plus the matching perfume name.
# The last column of `data1` holds the name; every column before it is a reading.
# The reading count is derived from the frame instead of being hard-coded: the
# previous `range(27)` skipped the final reading of every perfume.
n_readings = data1.shape[1] - 1

# Row-major flattening keeps each perfume's readings contiguous and in column
# order, i.e. the same ordering the nested loop produced.
observations = data1.iloc[:, :n_readings].to_numpy(dtype=float).ravel()

# Repeat each name once per reading so both arrays line up element by element.
labels = np.repeat(data1.iloc[:, -1].to_numpy(dtype=str), n_readings)

# Sanity check: exactly one label per reading and nothing dropped.
assert len(observations) == len(labels) == len(data1) * n_readings


# **converting arrays to pandas series**

In [None]:
# Wrap both arrays as pandas Series, rebinding the same two names.
observations, labels = pd.Series(observations), pd.Series(labels)

# **assigning the observations and labels to the `Observation` and `Perfume` columns**

In [None]:
# Fill the two existing columns of `df` from the Series built above.
df['Observation'] = observations
df['Perfume'] = labels

# **printing the filled df data frame**

In [None]:
# Display `df` after the Observation and Perfume columns have been filled.
df

# **re-wrapping df as a pandas data frame (df is already a data frame, so this is only a safeguard)**

In [None]:
# Pass `df` through the DataFrame constructor again. `df` was created with
# pd.DataFrame in an earlier cell, so its type does not change here.
df = pd.DataFrame(df)

# **printing the pandas data frame**

In [None]:
# Display the whole frame (one row per observation).
df

# **no.of observations in df**

In [None]:
# Number of rows in the long-format frame.
df.shape[0]

# **no.of features in df**

In [None]:
# Number of columns in the long-format frame.
df.shape[1]

# **top 5 observations**

In [None]:
# Preview the first five rows of the long-format frame.
df.head(5)

# **information about the data**

In [None]:
# Print column dtypes, non-null counts and memory usage for `df`.
df.info()

# **shuffling the data set to avoid ordering bias**

In [None]:
# Randomly permute all rows (fixed seed, so the order is repeatable), then
# renumber the index from zero and discard the old one.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

# **top 5 observations after shuffling**

In [None]:
# First five rows after the shuffle.
df.head(5)

In [None]:
# Summary statistics per perfume, transposed so each perfume becomes a column.
per_perfume_stats = df.groupby('Perfume').describe()
per_perfume_stats.T

In [None]:
# Bar chart of the mean reading per perfume. The measurement column is selected
# explicitly so the chart stays the same if this cell is re-run after `df` has
# gained further columns (for example an encoded label column).
mean_by_perfume = df.groupby('Perfume')[['Observation']].mean()

# Work on the returned Axes rather than pyplot's implicit "current axes".
ax = mean_by_perfume.plot(kind='bar', figsize=(15, 8))
ax.set_ylabel('mean of measurements')
ax.set_title('Mean measurement per perfume')
plt.show()

In [None]:
# Distribution of all readings, pooled across perfumes.
sns.displot(df['Observation'])

# **Encoding the labels as numeric values**

In [None]:
# Fit the encoder on the perfume names, then store the integer codes in a new
# 'Label' column. `le` is kept so codes can be mapped back to names.
le = LabelEncoder()
le.fit(df['Perfume'])
df['Label'] = le.transform(df['Perfume'])

In [None]:
# Feature matrix: the single Observation column as an (n_rows, 1) array.
x = df[['Observation']].to_numpy()
# Target vector: the encoded perfume label.
y = df.loc[:, 'Label']

# **Splitting into Training and Test sets**

In [None]:
# Hold out 30% of the rows for testing. `random_state` pins the split so the
# reported accuracies are repeatable from run to run, and `stratify=y` keeps
# every perfume represented in the same proportion in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)


# **Building Model and making Predictions with decision tree**

In [None]:
# Decision tree on the single Observation feature. The seed is pinned so the
# fitted tree is repeatable, matching the seeded shuffle earlier in the notebook.
dtc = DecisionTreeClassifier(random_state=42)

dtc.fit(x_train, y_train)

# Predicted label codes for the held-out rows.
y_pred = dtc.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
dt_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score - ', dt_accuracy)

# **Building Model and making Predictions with random forest**

In [None]:
# Random forest of 200 trees. Bootstrapping makes the ensemble stochastic, so
# the seed is pinned to make the fitted model and its accuracy repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=42)

rf.fit(x_train, y_train)

# Predicted label codes for the held-out rows (replaces the earlier `y_pred`).
y_pred = rf.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
rf_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score - ', rf_accuracy)