In [920]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import LabelEncoder , OneHotEncoder ,MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report



In [921]:
df=pd.read_csv('Customer_Segmentations.csv')

# Data Exploration 

In [922]:
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [923]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10695 entries, 0 to 10694
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               10695 non-null  int64  
 1   Gender           10695 non-null  object 
 2   Ever_Married     10505 non-null  object 
 3   Age              10695 non-null  int64  
 4   Graduated        10593 non-null  object 
 5   Profession       10533 non-null  object 
 6   Work_Experience  9597 non-null   float64
 7   Spending_Score   10695 non-null  object 
 8   Family_Size      10247 non-null  float64
 9   Var_1            10587 non-null  object 
 10  Segmentation     10695 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 919.2+ KB


In [924]:
df.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,10695.0,10695.0,9597.0,10247.0
mean,463468.08864,43.511828,2.619777,2.844052
std,2600.966411,16.774158,3.39079,1.536427
min,458982.0,18.0,0.0,1.0
25%,461220.5,30.0,0.0,2.0
50%,463451.0,41.0,1.0,3.0
75%,465733.5,53.0,4.0,4.0
max,467974.0,89.0,14.0,9.0


In [925]:
def identify_columns(df, columns):
    num_cols = []
    cat_cols = []
    for col in columns:
        if df[col].dtype == "object":
            cat_cols.append(col)
        else:
            num_cols.append(col)
    return num_cols, cat_cols

In [926]:
for col in df.columns :
    if df[col].dtype == 'object':
        print(df[col].value_counts())
        print("--------------------------------")


Gender
Male      5841
Female    4854
Name: count, dtype: int64
--------------------------------
Ever_Married
Yes    6163
No     4342
Name: count, dtype: int64
--------------------------------
Graduated
Yes    6570
No     4023
Name: count, dtype: int64
--------------------------------
Profession
Artist           3318
Healthcare       1750
Entertainment    1250
Engineer          935
Doctor            930
Lawyer            844
Executive         775
Marketing         403
Homemaker         328
Name: count, dtype: int64
--------------------------------
Spending_Score
Low        6494
Average    2599
High       1602
Name: count, dtype: int64
--------------------------------
Var_1
Cat_6    6910
Cat_4    1475
Cat_3    1089
Cat_2     563
Cat_7     269
Cat_1     167
Cat_5     114
Name: count, dtype: int64
--------------------------------
Segmentation
D    3027
A    2818
C    2442
B    2408
Name: count, dtype: int64
--------------------------------


# 1- Data Preprocessing

handling columns Name

In [927]:
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...,...
10690,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,B
10691,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,A
10692,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6,C
10693,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,C


In [928]:
df=df.rename(columns={"Var_1":"Category"})
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...,...
10690,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,B
10691,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,A
10692,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6,C
10693,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,C


Handling Duplicate Values

In [929]:
df.duplicated().sum()

38

In [930]:
df.drop_duplicates(inplace=True)

In [931]:
df.duplicated().sum()

0

In [932]:
df=df.set_index('ID')
df

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...
467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,B
467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,A
467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6,C
467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,C


Handling Null Values

In [933]:
No_nulls=df.isna().sum().reset_index().rename(columns={0:'No_nulls'})
No_nulls

Unnamed: 0,index,No_nulls
0,Gender,0
1,Ever_Married,189
2,Age,0
3,Graduated,100
4,Profession,162
5,Work_Experience,1090
6,Spending_Score,0
7,Family_Size,447
8,Category,108
9,Segmentation,0


In [934]:

column_with_nan=list(No_nulls[No_nulls['No_nulls']> 0 ]['index'])
column_with_nan

['Ever_Married',
 'Graduated',
 'Profession',
 'Work_Experience',
 'Family_Size',
 'Category']

In [935]:
cat_cols_withNan = []
num_cols_withNan = []
num_cols_withNan, cat_cols_withNan = identify_columns(df, column_with_nan)

num_cols_withNan, cat_cols_withNan

(['Work_Experience', 'Family_Size'],
 ['Ever_Married', 'Graduated', 'Profession', 'Category'])

In [936]:
# Handling_cat_cols_withNan

df['Profession'] = df.groupby('Spending_Score')['Profession'].transform(lambda x: x.fillna(x.mode()[0]))
df['Graduated'] = df.groupby('Age')['Graduated'].transform(lambda x: x.fillna(x.mode()[0]))
df['Category'] = df.groupby('Segmentation')['Category'].transform(lambda x: x.fillna(x.mode()[0]))
df['Ever_Married'] = df['Ever_Married'].fillna(df['Ever_Married'].mode()[0])

In [937]:
# Handling_num_cols_withNan

for col in num_cols_withNan :   
    df[col] = df.groupby('Profession')[col].transform(lambda x: x.fillna(x.mean().round()))

In [938]:
df.isna().sum()

Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Category           0
Segmentation       0
dtype: int64

Handling Data Type

In [939]:
df.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Category            object
Segmentation        object
dtype: object

In [940]:
df = df.astype({'Family_Size' : int})

In [941]:
df.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size          int32
Category            object
Segmentation        object
dtype: object

Handling Outlier

In [942]:
num_cols,cat_cols=identify_columns(df,df.columns)

In [943]:

for col in num_cols:
    fig = px.box(df, x=col, title=col)
    fig.show()

In [944]:
def Outliers_Handling(data, num_cols):

    for col in num_cols:
        Q1 = np.quantile(data[col], 0.25)
        Q3 = np.quantile(data[col], 0.75)
        IQR = Q3 - Q1

        Upper_Bound = Q3 + 1.5 * IQR
        Lower_Bound = Q1 - 1.5 * IQR
        
        data[col] = np.clip(data[col], Lower_Bound, Upper_Bound)

    return data

df = Outliers_Handling(df, num_cols)


In [945]:
#After Handling

for col in num_cols:
    fig = px.box(df, x=col, title=col)
    fig.show()

# 2- Handling Imbalanced Data


In [946]:
df

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
462809,Male,No,22.0,No,Healthcare,1.0,Low,4,Cat_4,D
462643,Female,Yes,38.0,Yes,Engineer,3.0,Average,3,Cat_4,A
466315,Female,Yes,67.0,Yes,Engineer,1.0,Low,1,Cat_6,B
461735,Male,Yes,67.0,Yes,Lawyer,0.0,High,2,Cat_6,B
462669,Female,Yes,40.0,Yes,Entertainment,3.0,High,6,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...
467954,Male,No,29.0,No,Healthcare,9.0,Low,4,Cat_6,B
467958,Female,No,35.0,Yes,Doctor,1.0,Low,1,Cat_6,A
467960,Female,No,53.0,Yes,Entertainment,3.0,Low,2,Cat_6,C
467961,Male,Yes,47.0,Yes,Executive,1.0,High,5,Cat_4,C


In [947]:
count_df = df["Segmentation"].value_counts().reset_index()
count_df.columns = ["Segmentation", "Count"]

# Plotting the bar plot
fig = px.bar(
    count_df,
    x="Segmentation",
    y="Count",
    color="Segmentation",  
    color_discrete_map={"A": '#DEBB96', "B": '#BF7E78',"C":'#8C543F',"D":'#3A3A3B'}
)
fig.show()


In [948]:
def oversample_Handling_highly_uniform_columns(df, target_col):
    if df[target_col].dtype == "object":
        value_counts = df[col].value_counts()
        max_count = value_counts.max()
        total_count = df.shape[0]  # Total count of all values in the DataFrame
        if max_count >= total_count * 0.8:
            oversample = RandomOverSampler()
            X_resampled, y_resampled = oversample.fit_resample(
                df.drop(columns=[col]), df[col]
            )
            df = pd.concat([X_resampled, y_resampled], axis=1)
    return df

df=oversample_Handling_highly_uniform_columns(df, 'Segmentation')

In [949]:
count_df = df["Segmentation"].value_counts().reset_index()
count_df.columns = ["Segmentation", "Count"]

# Plotting the bar plot
fig = px.bar(
    count_df,
    x="Segmentation",
    y="Count",
    color="Segmentation",  
    color_discrete_map={"A": '#DEBB96', "B": '#BF7E78',"C":'#8C543F',"D":'#3A3A3B'}
)
fig.show()

# Encoding

In [950]:
df

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
462809,Male,No,22.0,No,Healthcare,1.0,Low,4,Cat_4,D
462643,Female,Yes,38.0,Yes,Engineer,3.0,Average,3,Cat_4,A
466315,Female,Yes,67.0,Yes,Engineer,1.0,Low,1,Cat_6,B
461735,Male,Yes,67.0,Yes,Lawyer,0.0,High,2,Cat_6,B
462669,Female,Yes,40.0,Yes,Entertainment,3.0,High,6,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...
467954,Male,No,29.0,No,Healthcare,9.0,Low,4,Cat_6,B
467958,Female,No,35.0,Yes,Doctor,1.0,Low,1,Cat_6,A
467960,Female,No,53.0,Yes,Entertainment,3.0,Low,2,Cat_6,C
467961,Male,Yes,47.0,Yes,Executive,1.0,High,5,Cat_4,C


Lable Encoding

In [951]:
def lable_columns(df, ordinal_cols):
    encoded_df = df.copy()
    label_encoder = LabelEncoder()
    for col in ordinal_cols:
        encoded_df[col] = label_encoder.fit_transform(df[col])
    return encoded_df

ordinal_cols = ['Ever_Married','Graduated','Spending_Score','Category','Segmentation']
df = lable_columns(df, ordinal_cols)
df

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
462809,Male,0,22.0,0,Healthcare,1.0,2,4,3,3
462643,Female,1,38.0,1,Engineer,3.0,0,3,3,0
466315,Female,1,67.0,1,Engineer,1.0,2,1,5,1
461735,Male,1,67.0,1,Lawyer,0.0,1,2,5,1
462669,Female,1,40.0,1,Entertainment,3.0,1,6,5,0
...,...,...,...,...,...,...,...,...,...,...
467954,Male,0,29.0,0,Healthcare,9.0,2,4,5,1
467958,Female,0,35.0,1,Doctor,1.0,2,1,5,0
467960,Female,0,53.0,1,Entertainment,3.0,2,2,5,2
467961,Male,1,47.0,1,Executive,1.0,1,5,3,2


OneHotEncoding

In [952]:
def OneHotEncoding_columns(df, nominal_cols):
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[nominal_cols])
    one_hot_df = pd.DataFrame(
        one_hot_encoded, columns=encoder.get_feature_names_out(nominal_cols)
    )

    df = df.reset_index()
    df_encoded = pd.concat([df, one_hot_df], axis=1)
    # Drop the original nominal columns
    df_encoded = df_encoded.drop(nominal_cols, axis=1)

    return df_encoded


nominal_cols = ["Profession", "Gender"]
df = OneHotEncoding_columns(df, nominal_cols)
df.set_index('ID',inplace=True)
df

Unnamed: 0_level_0,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Category,Segmentation,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Gender_Female,Gender_Male
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
462809,0,22.0,0,1.0,2,4,3,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
462643,1,38.0,1,3.0,0,3,3,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
466315,1,67.0,1,1.0,2,1,5,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
461735,1,67.0,1,0.0,1,2,5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
462669,1,40.0,1,3.0,1,6,5,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467954,0,29.0,0,9.0,2,4,5,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
467958,0,35.0,1,1.0,2,1,5,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
467960,0,53.0,1,3.0,2,2,5,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
467961,1,47.0,1,1.0,1,5,3,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# Scaling

In [953]:
num_cols,cat_cols = identify_columns(df,df.columns)
corr_matrix = df[num_cols].corr()
fig = px.imshow(corr_matrix, 
                text_auto=True, 
                aspect="auto", 
                color_continuous_scale='RdBu_r', 
                labels={'color': 'Correlation'},
                template='plotly_dark')

fig.show()


In [954]:
num_cols,cat_cols = identify_columns(df,df.columns)
def min_max_scale(df, num_cols, target_col):
    
    if target_col in num_cols :
        num_cols.remove(target_col)
        
    scaler = MinMaxScaler()
    df_scaled = df.copy()
    df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])
    return df_scaled

df = min_max_scale(df, num_cols, 'Segmentation')
df


Unnamed: 0_level_0,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Category,Segmentation,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Gender_Female,Gender_Male
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
462809,0.0,0.057554,0.0,0.1,1.0,0.500000,0.500000,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
462643,1.0,0.287770,1.0,0.3,0.0,0.333333,0.500000,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
466315,1.0,0.705036,1.0,0.1,1.0,0.000000,0.833333,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
461735,1.0,0.705036,1.0,0.0,0.5,0.166667,0.833333,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
462669,1.0,0.316547,1.0,0.3,0.5,0.833333,0.833333,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467954,0.0,0.158273,0.0,0.9,1.0,0.500000,0.833333,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
467958,0.0,0.244604,1.0,0.1,1.0,0.000000,0.833333,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
467960,0.0,0.503597,1.0,0.3,1.0,0.166667,0.833333,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
467961,1.0,0.417266,1.0,0.1,0.5,0.666667,0.500000,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# Feature Selection

In [955]:
x = df.drop('Segmentation', axis=1)
y = df['Segmentation']


In [956]:
# Model
model = LogisticRegression(max_iter=1000)

# RFE
rfe = RFE(model, n_features_to_select=15)
fit = rfe.fit(x, y)

Filter = rfe.support_
Ranking = rfe.ranking_
# Selected Features.
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (Filter))
print("Feature Ranking: %s" % (Ranking))


Num Features: 15
Selected Features: [False  True  True  True  True  True  True  True False  True  True  True
  True  True  True  True False  True]
Feature Ranking: [3 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 4 1]


In [957]:
data = x[x.columns[Filter]]
data.loc[:, 'Segmentation'] = y
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Category,Profession_Artist,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Gender_Male,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
462809,0.057554,0.0,0.1,1.0,0.500000,0.500000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3
462643,0.287770,1.0,0.3,0.0,0.333333,0.500000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
466315,0.705036,1.0,0.1,1.0,0.000000,0.833333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
461735,0.705036,1.0,0.0,0.5,0.166667,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
462669,0.316547,1.0,0.3,0.5,0.833333,0.833333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467954,0.158273,0.0,0.9,1.0,0.500000,0.833333,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
467958,0.244604,1.0,0.1,1.0,0.000000,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
467960,0.503597,1.0,0.3,1.0,0.166667,0.833333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
467961,0.417266,1.0,0.1,0.5,0.666667,0.500000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2


# train test split

In [958]:
x = data.drop('Segmentation', axis=1)
y = data['Segmentation']
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

# LogisticRegression

In [959]:
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)


In [960]:

# Calculate accuracy
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Calculate precision, recall, and F1 score for multiclass
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')
print("\nPrecision (weighted):", precision)
print("Recall (weighted):", recall)
print("F1 Score (weighted):", f1)


Training Accuracy: 0.46709209209209207
Testing Accuracy: 0.46904315196998125

Confusion Matrix:
[[371  48 123 164]
 [242  75 233  84]
 [132  54 337  79]
 [178  18  60 467]]

Precision (weighted): 0.45864334164075427
Recall (weighted): 0.46904315196998125
F1 Score (weighted): 0.44310824488544404


In [961]:

# Classification report for detailed metrics
class_report = classification_report(y_test, test_predictions)
print("\nClassification Report:")
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.53      0.46       706
           1       0.38      0.12      0.18       634
           2       0.45      0.56      0.50       602
           3       0.59      0.65      0.62       723

    accuracy                           0.47      2665
   macro avg       0.46      0.46      0.44      2665
weighted avg       0.46      0.47      0.44      2665



# SVM

In [962]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state = 1)
clf = SVC(kernel='rbf')
clf = clf.fit(x_train, y_train)

train_predictions = clf.predict(x_train)
test_predictions = clf.predict(x_test)


In [None]:

# Calculate accuracy
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Calculate precision, recall, and F1 score for multiclass
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')
print("\nPrecision (weighted):", precision)
print("Recall (weighted):", recall)
print("F1 Score (weighted):", f1)


Training Accuracy: 0.48998998998999
Testing Accuracy: 0.4619136960600375

Confusion Matrix:
[[312 155 100 146]
 [177 168 157  70]
 [117 126 315  73]
 [216  58  39 436]]

Precision (weighted): 0.4637561980649641
Recall (weighted): 0.4619136960600375
F1 Score (weighted): 0.46196508331685543
