# Prototype Phase Assignment

## Importing Libraries & Dataset

In [3]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Read the customer records from the CSV file (path is relative to the notebook)
df = pd.read_csv('Customers.csv')
# Preview the first five rows to confirm the columns were parsed as expected
df.head(n=5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6


## Preprocessing Data

In [5]:
# Turn the raw customer table into an all-numeric frame for the classifiers.
# NOTE: this cell rebinds `df`, so re-running it without first re-running the
# loading cell fails (the 'CustomerID' column is already gone by then).

# Drop the 'CustomerID' column as it is not needed for classification
df = df.drop(columns='CustomerID')

# Convert 'Gender' column to numerical values (0 for Female, 1 for Male).
# An explicit mapping is used so that a missing or unexpected label is reported
# instead of being silently encoded as Male.
gender_codes = {'Female': 0, 'Male': 1}
unexpected_gender = ~df['Gender'].isin(list(gender_codes))
if unexpected_gender.any():
    raise ValueError(
        "Unexpected values in 'Gender': "
        f"{df.loc[unexpected_gender, 'Gender'].unique().tolist()}"
    )
df['Gender'] = df['Gender'].map(gender_codes).astype(int)

# Convert 'Age', 'Annual Income ($)' and 'Spending Score (1-100)' columns to integer values
df = df.astype({
    'Age': int,
    'Annual Income ($)': int,
    'Spending Score (1-100)': int,
})

# Convert 'Profession' column to numerical values using one-hot encoding.
# dtype=int keeps the indicator columns as 0/1 integers on every pandas version
# (newer releases default to booleans). Rows with a missing Profession get 0 in
# every Profession_* column because dummy_na is left at its default (False).
df = pd.get_dummies(df, columns=['Profession'], dtype=int)

df.head()

Unnamed: 0,Gender,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,1,19,15000,39,1,4,0,0,0,0,0,1,0,0,0
1,1,21,35000,81,3,3,0,0,1,0,0,0,0,0,0
2,0,20,86000,6,1,1,0,0,1,0,0,0,0,0,0
3,0,23,59000,77,0,2,0,0,0,0,0,0,0,1,0
4,0,31,38000,40,2,6,0,0,0,1,0,0,0,0,0


## Training Models

In [6]:
# Separate the target ('Gender') from the features, then hold out 30% of the
# rows for testing; the fixed random_state makes the split reproducible.
y = df['Gender']
X = df.drop(columns=['Gender'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Let's choose two classification algorithms to compare their performance

# Decision Tree Classifier
# Tree splits are threshold-based, so the raw (unscaled) features are used as-is.
dtc = DecisionTreeClassifier(random_state=42)

# Logistic Regression
# The features live on very different scales (Annual Income is in the tens of
# thousands, the Profession_* dummies are 0/1). That can keep the default solver
# from converging and makes the default L2 penalty weigh features unevenly, so
# the inputs are standardised first. Doing it inside a pipeline means the scaler
# is fitted on the training data only; `lr` still exposes fit() and predict().
lr = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))

In [8]:
# Train both classifiers on the same training split

# Decision Tree Classifier
dtc.fit(X=X_train, y=y_train)

# Logistic Regression (kept as the cell's last expression, so its repr is what
# the notebook displays for this cell)
lr.fit(X=X_train, y=y_train)

## Testing Models

In [9]:
# Score both fitted models on the held-out test split

# Class predictions for the test features (decision tree, logistic regression)
y_pred_dtc, y_pred_lr = dtc.predict(X_test), lr.predict(X_test)

# Accuracy = fraction of test rows whose predicted label matches the true label
accuracy_dtc = accuracy_score(y_true=y_test, y_pred=y_pred_dtc)
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)

## Calculating Accuracy of Models

In [10]:
# Report the test-set accuracy of each model, one line per model
for label, score in (
    ('Decision Tree Classifier accuracy:', accuracy_dtc),
    ('Logistic Regression accuracy:', accuracy_lr),
):
    print(label, score)

Decision Tree Classifier accuracy: 0.5016666666666667
Logistic Regression accuracy: 0.6
