In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# Step 1: Load the datasets
data_old = pd.read_excel('old_data.xlsx')
data_new = pd.read_excel('new_customer_data.xlsx')

In [3]:
copy_old_data = data_old.copy()

# Data row and col formatting

In [4]:
data_old = data_old.drop(['transaction_id','transaction_date', 'product_class','Profit','recency', 'freq', 'monetary','Clusturs'], axis=1)

In [5]:
data_old= data_old[['customer_id', 'gender', 'past_3_years_bike_related_purchases', 'job_industry_category', 'wealth_segment', 'owns_car', 'tenure', 'Age',
       'state', 'property_valuation', 'Customer_segment']]

In [None]:
data_new.head()

In [6]:
data_new= data_new.rename(columns={'index' :'customer_id'})

In [7]:
data_new= data_new[['customer_id','gender', 'past_3_years_bike_related_purchases','job_industry_category', 'wealth_segment', 'owns_car', 'tenure','Age',
       'state', 'property_valuation' ]]

# converting data to numeric form

In [8]:
data_new['gender'] = data_new['gender'].astype('category')
data_new['job_industry_category'] = data_new['job_industry_category'].astype('category')
data_new['wealth_segment'] = data_new['wealth_segment'].astype('category')
data_new[ 'owns_car'] = data_new[ 'owns_car'].astype('category')
data_new['state'] = data_new['state'].astype('category')

In [9]:
data_old['gender'] = data_old['gender'].astype('category')
data_old['job_industry_category'] = data_old['job_industry_category'].astype('category')
data_old['wealth_segment'] = data_old['wealth_segment'].astype('category')
data_old[ 'owns_car'] = data_old[ 'owns_car'].astype('category')
data_old['state'] = data_old['state'].astype('category')
data_old['Customer_segment'] = data_old['Customer_segment'].astype('category')

In [10]:
cat_columns = data_old.select_dtypes(['category']).columns
data_old[cat_columns] =data_old[cat_columns].apply(lambda x: x.cat.codes)
data_old.head()

Unnamed: 0,customer_id,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,owns_car,tenure,Age,state,property_valuation,Customer_segment
0,2950,1,19,2,2,1,10,68,2,6,2
1,2950,1,19,2,2,1,10,68,2,6,2
2,2950,1,19,2,2,1,10,68,2,6,2
3,3120,0,89,3,2,1,10,44,0,5,3
4,3120,0,89,3,2,1,10,44,0,5,3


In [11]:
data_old['Customer_segment'].unique()

array([2, 3, 0, 1], dtype=int8)

In [12]:
data_new.dtypes

customer_id                               int64
gender                                 category
past_3_years_bike_related_purchases       int64
job_industry_category                  category
wealth_segment                         category
owns_car                               category
tenure                                    int64
Age                                       int64
state                                  category
property_valuation                        int64
dtype: object

In [13]:
cat_columns_2 = data_new.select_dtypes(['category']).columns
data_new[cat_columns_2] =data_new[cat_columns_2].apply(lambda x: x.cat.codes)
data_new.head()

Unnamed: 0,customer_id,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,owns_car,tenure,Age,state,property_valuation
0,5667,1,86,5,2,1,14,66,1,6
1,5640,1,69,6,2,0,16,53,0,11
2,5352,0,10,2,0,0,10,49,2,5
3,5424,0,64,5,0,1,5,44,1,1
4,5671,0,34,2,0,0,19,58,0,9


# Labels and featues 

In [14]:
y = data_old['Customer_segment']
X = data_old.drop('Customer_segment', axis=1)

# Training and testing model 

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train =scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [17]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion= 'entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [18]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

precision_metric = precision_score(y_test, y_pred, average= 'macro')
recall_metric = recall_score(y_test, y_pred, average= 'macro')
accuracy_metric = accuracy_score(y_test, y_pred)

print('precision_metric : {0:0.4f}'.format(precision_metric))
print('recall_metric : {0:0.4f}'.format(recall_metric))
print('accuracy_metric : {0:0.4f}'.format(accuracy_metric))

precision_metric : 0.9795
recall_metric : 0.9740
accuracy_metric : 0.9824


# Now passing new Customer data through the model 

In [19]:
X_data_new = scaler.transform(data_new)

In [20]:
y_pred_2 = classifier.predict(X_data_new)

# getting predicted data in excel format 

In [21]:
y_pred = pd.DataFrame(y_pred)

In [22]:
y_pred[0].unique()

array([2, 1, 3, 0], dtype=int8)

In [23]:
results_df = pd.DataFrame(y_pred_2)

In [24]:
results_df[0].unique()

array([2, 0, 3, 1], dtype=int8)

In [28]:
copy_old_data= copy_old_data.drop(['transaction_id','transaction_date', 'product_class',
       'Profit','recency', 'freq', 'monetary',
       'Clusturs'], axis=1)

In [29]:
copy_old_data.columns

Index(['customer_id', 'gender', 'past_3_years_bike_related_purchases',
       'job_industry_category', 'wealth_segment', 'owns_car', 'tenure', 'Age',
       'state', 'property_valuation', 'Customer_segment'],
      dtype='object')

In [30]:
data_old.columns

Index(['customer_id', 'gender', 'past_3_years_bike_related_purchases',
       'job_industry_category', 'wealth_segment', 'owns_car', 'tenure', 'Age',
       'state', 'property_valuation', 'Customer_segment'],
      dtype='object')

In [33]:
cat_columns =['customer_id', 'gender', 'past_3_years_bike_related_purchases',
       'job_industry_category', 'wealth_segment', 'owns_car', 'tenure', 'Age',
       'state', 'property_valuation', 'Customer_segment']
cat_mappings = {}

for column in cat_columns:
    cat_mappings[column] = dict(enumerate(copy_old_data[column].astype('category').cat.categories))


In [48]:
def cluster_name(row):
    if row[0] == 0:
        return 'BROWNZ'
    elif row[0] == 1:
        return 'GOLD'
    elif row[0] == 3:
        return 'SILVER'
    elif row[0] == 2:
        return 'PLATINUM'

In [40]:
print(cat_mappings['Customer_segment'])

{0: 'BROWNZ', 1: 'GOLD', 2: 'PLATINUM', 3: 'SILVER'}


In [49]:
results_df['0'] = results_df.apply(cluster_name, axis=1)

In [50]:
results_df

Unnamed: 0,0,0.1
0,2,PLATINUM
1,2,PLATINUM
2,0,BROWNZ
3,2,PLATINUM
4,3,SILVER
...,...,...
710,3,SILVER
711,3,SILVER
712,2,PLATINUM
713,2,PLATINUM


In [None]:
results_df.to_excel('y_pred_2.xlsx', index=False)