In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# gather the full path of every file found under the input directory, then print one per line
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/persona/persona.csv


# **Rule-Based Classification and Potential Customer Revenue Calculation**

##  **What is Rule-Based Classification?**

Rule-based classification is a fundamental technique in data science that uses predefined “if-then” rules to assign records to distinct classes. The rules are written by people, based on domain knowledge and on patterns observed in the data, which makes the classification process both transparent and interpretable. For example:

* IF a customer's age > 30 AND income > $60,000, THEN classify the customer as “eligible for credit” and approve the loan.

Unlike machine learning (ML), a rule-based approach does not learn its rules from the data automatically; it relies on explicitly stated knowledge, which makes it easy to interpret and quick to implement.

## **Business Problem**

A gaming company wants to use some of its customers' characteristics to create new level-based customer definitions (personas), group these personas into segments, and estimate how much revenue a new customer in each segment is expected to bring the company on average.

For example: the company wants to estimate how much revenue a 25-year-old male iOS user from Turkey is expected to generate on average.

## **The Data Set Story**
The "persona.csv" dataset contains the prices of products sold by an international game company and includes some demographic information about the users who purchase these products. The dataset consists of records generated from each sales transaction. This means that the table is not de-duplicated. In other words, a user with specific demographic characteristics may have made multiple purchases.

**Features in the Dataset:**
* **PRICE:** The amount spent by the customer
* **SOURCE:** The type of device the customer is connected to (e.g. android, ios)
* **SEX:** The gender of the customer
* **COUNTRY:** The country of the customer
* **AGE:** The age of the customer

In [2]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
# NOTE(review): this hides every warning category for the rest of the session,
# including deprecation notices from pandas; consider narrowing it to a specific category.
warnings.simplefilter("ignore")

In [3]:
# display settings: never truncate rows or columns, and render floats with two decimals
def _two_decimal_places(value):
    """Format a number with exactly two digits after the decimal point."""
    return '%.2f' % value


pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.float_format = _two_decimal_places

In [4]:
# read the sales records (one row per transaction) into a DataFrame
DATA_PATH = "/kaggle/input/persona/persona.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# writing a function to look at the overall picture
def check_df(dataframe, head=5):
    """Print a quick overview of ``dataframe`` to standard output.

    The report consists of six titled sections, printed in this order:
    shape, column dtypes, first rows, last rows, missing-value counts per
    column, and the transposed ``describe`` summary at the 0/5/50/95/99/100
    percentiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to summarise.
    head : int, default 5
        Number of rows shown in both the "Head" and the "Tail" sections.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    # Each section is (banner, producer). Producers are callables so that every value is
    # computed only after its banner has been printed, matching a straight-line script.
    sections = (
        ('##################### Shape #####################',
         lambda: dataframe.shape),
        ('##################### Types #####################',
         lambda: dataframe.dtypes),
        ('##################### Head #####################',
         lambda: dataframe.head(head)),
        ('##################### Tail #####################',
         lambda: dataframe.tail(head)),
        ('##################### NA #####################',
         lambda: dataframe.isnull().sum()),
        ('##################### Quantiles #####################',
         lambda: dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T),
    )
    for banner, produce in sections:
        print(banner)
        print(produce())

In [6]:
# print the overview (shape, dtypes, head, tail, NA counts, quantiles) of the loaded data
check_df(df)

##################### Shape #####################
(5000, 5)
##################### Types #####################
PRICE       int64
SOURCE     object
SEX        object
COUNTRY    object
AGE         int64
dtype: object
##################### Head #####################
   PRICE   SOURCE   SEX COUNTRY  AGE
0     39  android  male     bra   17
1     39  android  male     bra   17
2     49  android  male     bra   17
3     29  android  male     tur   17
4     49  android  male     tur   17
##################### Tail #####################
      PRICE   SOURCE     SEX COUNTRY  AGE
4995     29  android  female     bra   31
4996     29  android  female     bra   31
4997     29  android  female     bra   31
4998     39  android  female     bra   31
4999     29  android  female     bra   31
##################### NA #####################
PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64
##################### Quantiles #####################
        count  mean   std   min    

In [7]:
# average price analysis: mean PRICE for every COUNTRY / SOURCE / SEX / AGE combination,
# ordered from the highest average price to the lowest
breakdown_columns = ["COUNTRY", "SOURCE", "SEX", "AGE"]
mean_price_by_breakdown = df.groupby(breakdown_columns)[["PRICE"]].mean()
agg_df = mean_price_by_breakdown.sort_values(by="PRICE", ascending=False)

In [8]:
# preview the rows with the highest average PRICE (agg_df is sorted by PRICE, descending)
agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRICE
COUNTRY,SOURCE,SEX,AGE,Unnamed: 4_level_1
bra,android,male,46,59.0
usa,android,male,36,59.0
fra,android,female,24,59.0
usa,ios,male,32,54.0
deu,android,female,36,49.0


In [9]:
# Move the group-by keys out of the index so that they become ordinary columns again.
agg_df = agg_df.reset_index(drop=False)

## **Age Segmentation** 

In [10]:
# age categorization
# Bins are right-inclusive: (0, 18], (18, 23], (23, 30], (30, 40], (40, oldest age].
# The last edge follows the oldest age present in the data instead of a hard-coded 70, so it
# always agrees with the last label; with a fixed 70 any customer older than 70 would fall
# outside every bin and receive a missing AGE_CAT. The edge is floored at 41 so the bin
# edges stay strictly increasing even if nobody in the data is older than 40.
oldest_age = agg_df["AGE"].max()
age_bins = [0, 18, 23, 30, 40, max(oldest_age, 41)]
age_labels = ['0_18', '19_23', '24_30', '31_40', "41_" + str(oldest_age)]
agg_df["AGE_CAT"] = pd.cut(agg_df["AGE"], bins=age_bins, labels=age_labels)

## **Persona-Based Variable** 

In [11]:
# Build the persona key: put the VALUES of COUNTRY, SOURCE, SEX and AGE_CAT next to each
# other, join them with underscores and upper-case the result,
# e.g. "TUR_ANDROID_FEMALE_31_40".
# The columns are picked by name rather than by position, so the key does not depend on the
# column order of agg_df (an extra or reordered column would otherwise silently shift the
# positions and combine the wrong fields). A missing value still fails loudly here instead
# of producing a malformed key.
persona_columns = ["COUNTRY", "SOURCE", "SEX", "AGE_CAT"]
agg_df["customers_level_based"] = [
    "_".join(parts).upper()
    for parts in agg_df[persona_columns].itertuples(index=False, name=None)
]

In [12]:
# keep only the persona key and its average price; every other column is dropped
agg_df = agg_df.loc[:, ["customers_level_based", "PRICE"]]

In [13]:
# count how many rows share each persona key and show the most frequent keys
agg_df["customers_level_based"].value_counts().head()

customers_level_based
BRA_ANDROID_MALE_24_30      7
USA_ANDROID_MALE_41_66      7
USA_IOS_FEMALE_24_30        7
BRA_ANDROID_FEMALE_24_30    7
USA_ANDROID_MALE_24_30      7
Name: count, dtype: int64

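The counts above show that the same persona appears in several rows: ages that fall into the same age bucket were aggregated separately, so each persona still has to be collapsed into a single row.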

In [14]:
# deduplication of segments: collapse the rows that share a persona key into a single row
# NOTE(review): PRICE is already a per-age average at this point (it comes from the earlier
# group-by on COUNTRY/SOURCE/SEX/AGE), so this is an unweighted mean of means: an age with
# one sale counts as much as an age with many. Confirm that this is the intended estimate.
agg_df = agg_df.groupby("customers_level_based")[["PRICE"]].mean()

In [15]:
# Turn the persona key from the index back into a regular column.
agg_df = agg_df.reset_index(drop=False)

## **Customer Segmentation**  

In [16]:
# Segmentation by PRICE variable: split the personas into four quantile-based groups,
# labelled from the lowest-priced group ("D") up to the highest-priced group ("A")
segment_labels = ["D", "C", "B", "A"]
agg_df["SEGMENT"] = pd.qcut(agg_df["PRICE"], q=4, labels=segment_labels)

In [17]:
# preview the first personas together with their average PRICE and assigned SEGMENT
agg_df.head()

Unnamed: 0,customers_level_based,PRICE,SEGMENT
0,BRA_ANDROID_FEMALE_0_18,35.65,B
1,BRA_ANDROID_FEMALE_19_23,34.08,C
2,BRA_ANDROID_FEMALE_24_30,33.86,C
3,BRA_ANDROID_FEMALE_31_40,34.9,B
4,BRA_ANDROID_FEMALE_41_66,36.74,A


## **New Customer Average Revenue Prediction** 

In [18]:
# 33 year old Turkish woman using ANDROID
# (age 33 falls in the '31_40' age bucket, hence the suffix of the persona key)
new_user_1 = "TUR_ANDROID_FEMALE_31_40"

In [19]:
# Average Revenue Prediction: fetch the row (average PRICE and SEGMENT) of this persona
agg_df.loc[agg_df["customers_level_based"].eq(new_user_1)]

Unnamed: 0,customers_level_based,PRICE,SEGMENT
72,TUR_ANDROID_FEMALE_31_40,41.83,A


In [20]:
# 50 year old Brazilian man using IOS
# (age 50 falls in the last age bucket; its '41_66' label embeds the maximum age found in
# the data, so this key has to be updated if that maximum changes)
new_user_2 = "BRA_IOS_MALE_41_66"

In [21]:
# Average Revenue Prediction: fetch the row (average PRICE and SEGMENT) of this persona
agg_df.loc[agg_df["customers_level_based"].eq(new_user_2)]

Unnamed: 0,customers_level_based,PRICE,SEGMENT
19,BRA_IOS_MALE_41_66,31.08,D
