<a href="https://www.kaggle.com/omerparlak/rule-based-classification?scriptVersionId=88172420" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Business Problem**

### A game company wants to create level-based new customer definitions (personas) by using some features (Country, Source, Age, Sex) of its customers and to create segments according to these new customer definitions and to estimate how much profit can be generated from  the new customers according to these segments.

### In this study, how to do rule-based classification and customer-based revenue calculation have been discussed step by step.

In [1]:
import pandas as pd
df = pd.read_csv("../input/persona1/persona.csv")
df.head()

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [2]:
def check_df(dataframe, head=5):
    print("##################### Shape #####################", end="\n\n")
    print(dataframe.shape, end="\n\n")
    print("##################### Types #####################", end="\n\n")
    print(dataframe.dtypes, end="\n\n")
    print("##################### Head #####################", end="\n\n")
    print(dataframe.head(head), end="\n\n")
    print("##################### Tail #####################", end="\n\n")
    print(dataframe.tail(head), end="\n\n")
    print("##################### NA #####################", end="\n\n")
    print(dataframe.isnull().sum(), end="\n\n")
    print("##################### Quantiles #####################", end="\n\n")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T, end="\n\n")
    
    
def data_analysis(data):
    print("##################### Unique Values of Source #####################", end="\n\n")
    print(data[["SOURCE"]].nunique())

    print("##################### Frequency of Source #####################", end="\n\n")
    print(data[["SOURCE"]].value_counts())

    print("##################### Unique Values of Price #####################", end="\n\n")
    print(data[["PRICE"]].nunique())

    print("##################### Number of product sales by sales price #####################", end="\n\n")
    print(data[["PRICE"]].value_counts())

    print("##################### Number of product sales by country #####################", end="\n\n")
    print(data["COUNTRY"].value_counts(ascending=False, normalize=True))

    print("##################### Total & average amount of sales by country #####################", end="\n\n")
    print(data.groupby("COUNTRY").agg({"PRICE": ["mean", "sum"]}))
 
    print("##################### Average amount of sales by source #####################", end="\n\n")
    print(data.groupby("SOURCE").agg({"PRICE": "mean"}))

    print("##################### Average amount of sales by source and country #####################", end="\n\n")
    print(data.pivot_table(values=['PRICE'], index=['COUNTRY'], columns=["SOURCE"], aggfunc=["mean"]))

    
def create_segments(data):
    agg_df = data.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"]).agg({"PRICE": "mean"}).sort_values(by="PRICE", ascending=False)
    agg_df = agg_df.reset_index()
    # age convert to age_cat
    bins = [0, 19, 25, 31, 41, agg_df["AGE"].max()]
    mylabels = ['0_18', '19_24', '25_30', '31_40', '41_' + str(agg_df["AGE"].max())]
    agg_df["AGE_CAT"] = pd.cut(agg_df["AGE"], bins, labels=mylabels)
    # define persona
    agg_df["CUSTOMERS_LEVEL_BASED"] = [str(row[0]).upper() + "_" + str(row[1]).upper() + "_" + str(row[2]).upper() + "_" + str(row[5]).upper()
                                       for row in agg_df.values]
    agg_df = agg_df[["CUSTOMERS_LEVEL_BASED", "PRICE"]]
    agg_df = agg_df.groupby("CUSTOMERS_LEVEL_BASED").agg({"PRICE": "mean"}).sort_values(by="PRICE", ascending=False).reset_index()
    # create segments
    agg_df["SEGMENT"] = pd.qcut(agg_df["PRICE"], 4, ["D", "C", "B", "A"])
    
    
    return agg_df


def new_user(country, source, sex, age):
    age_cat_list = ["0_18", "19_23", "24_30", "31_40", "41_70"]
    if 18 >= age >= 0:
        return country.upper() + "_" + source.upper() + "_" + sex.upper() + "_" + age_cat_list[0]
    elif 23 >= age >= 19:
        return country.upper() + "_" + source.upper() + "_" + sex.upper() + "_" + age_cat_list[1]
    elif 30 >= age >= 24:
        return country.upper() + "_" + source.upper() + "_" + sex.upper() + "_" + age_cat_list[2]
    elif 40 >= age >= 31:
        return country.upper() + "_" + source.upper() + "_" + sex.upper() + "_" + age_cat_list[3]
    elif 70 >= age >= 41:
        return country.upper() + "_" + source.upper() + "_" + sex.upper() + "_" + age_cat_list[4]



def ruled_based_classification(user):
    print(agg_df[agg_df["CUSTOMERS_LEVEL_BASED"] == user])

# **Data Analysis**

In [3]:
check_df(df)

##################### Shape #####################

(5000, 5)

##################### Types #####################

PRICE       int64
SOURCE     object
SEX        object
COUNTRY    object
AGE         int64
dtype: object

##################### Head #####################

   PRICE   SOURCE   SEX COUNTRY  AGE
0     39  android  male     bra   17
1     39  android  male     bra   17
2     49  android  male     bra   17
3     29  android  male     tur   17
4     49  android  male     tur   17

##################### Tail #####################

      PRICE   SOURCE     SEX COUNTRY  AGE
4995     29  android  female     bra   31
4996     29  android  female     bra   31
4997     29  android  female     bra   31
4998     39  android  female     bra   31
4999     29  android  female     bra   31

##################### NA #####################

PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64

##################### Quantiles #####################

       0.00  0.05  0.50 

In [4]:
data_analysis(df)

##################### Unique Values of Source #####################

SOURCE    2
dtype: int64
##################### Frequency of Source #####################

SOURCE 
android    2974
ios        2026
dtype: int64
##################### Unique Values of Price #####################

PRICE    6
dtype: int64
##################### Number of product sales by sales price #####################

PRICE
29       1305
39       1260
49       1031
19        992
59        212
9         200
dtype: int64
##################### Number of product sales by country #####################

usa    0.4130
bra    0.2992
deu    0.0910
tur    0.0902
fra    0.0606
can    0.0460
Name: COUNTRY, dtype: float64
##################### Total & average amount of sales by country #####################

             PRICE       
              mean    sum
COUNTRY                  
bra      34.327540  51354
can      33.608696   7730
deu      34.032967  15485
fra      33.587459  10177
tur      34.787140  15689
usa      34.007264 

# **Create Personas and Segments**

In [5]:
agg_df = create_segments(df)
agg_df.head()

Unnamed: 0,CUSTOMERS_LEVEL_BASED,PRICE,SEGMENT
0,FRA_ANDROID_FEMALE_19_24,43.375,A
1,TUR_ANDROID_FEMALE_31_40,43.0,A
2,TUR_IOS_MALE_31_40,42.333333,A
3,FRA_IOS_MALE_19_24,42.0,A
4,BRA_ANDROID_MALE_41_66,41.5,A


In [6]:
# Segment Analysis:

agg_df.groupby("SEGMENT").agg({"PRICE": ["mean", "max", "sum"]})

Unnamed: 0_level_0,PRICE,PRICE,PRICE
Unnamed: 0_level_1,mean,max,sum
SEGMENT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
D,29.289408,31.833333,790.814026
C,33.230841,34.072453,897.232718
B,34.867464,35.74215,941.421528
A,38.517678,43.375,1039.977297


## **New Users**

In [7]:
# Country : TURKEY (TUR)
# Source : ANDROID
# Sex : FEMALE
# Age : 33

user1 = new_user("TUR", "ANDROID", "FEMALE", 33)
user1

'TUR_ANDROID_FEMALE_31_40'

In [8]:
# Country : FRANCE (FRA)
# Source : IOS
# Sex : FEMALE
# Age : 35

user2 = new_user("FRA", "IOS", "FEMALE", 35)
user2

'FRA_IOS_FEMALE_31_40'

## **Predict for New Users**

In [9]:
ruled_based_classification(user1)

      CUSTOMERS_LEVEL_BASED  PRICE SEGMENT
1  TUR_ANDROID_FEMALE_31_40   43.0       A


In [10]:
ruled_based_classification(user2)

   CUSTOMERS_LEVEL_BASED  PRICE SEGMENT
70  FRA_IOS_FEMALE_31_40   33.0       C
