# Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Get the data

In [None]:
# Load the persona dataset from the Kaggle input directory.
df = pd.read_csv("../input/persona-data/persona.csv")

In [None]:
# Keep an independent (deep) copy of the freshly loaded frame under a second name.
data = df.copy(deep=True)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Show the column labels.
df.columns

In [None]:
# Show the frame dimensions as (rows, columns).
df.shape

In [None]:
# Show the data type of each column.
df.dtypes

In [None]:
# Total number of missing cells in the whole frame
# (per-column counts first, then summed across columns).
df.isna().sum().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

# EDA

In [None]:
# Names of the numeric columns, in their original column order.
# `select_dtypes` is the public API for this; the previous call used the
# private `_get_numeric_data` helper. "bool" is listed as well so that boolean
# columns keep being treated as numeric, as that helper did.
numeric_cols = df.select_dtypes(include=["number", "bool"]).columns.tolist()
print(numeric_cols)

In [None]:
# Every column that is not numeric is treated as categorical.
# A list comprehension is used instead of a set difference so the columns stay
# in their original order; a set has no defined order, which made the result
# (and anything iterating over it) vary between runs.
categoric_cols = [col for col in df.columns if col not in numeric_cols]
print(categoric_cols)

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Only numeric columns are handed to `corr()`: on pandas >= 2.0 calling it on a
# frame that still holds string columns raises instead of silently dropping them.
f, ax = plt.subplots(figsize=[20, 15])
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", ax=ax, cmap="magma")
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

In [None]:
# Print a frequency table for every categorical column, each under a banner line.
for column in categoric_cols:
    banner = f'###############{column}###############'
    print(banner)
    frequencies = df[column].value_counts()
    print(frequencies)

In [None]:
# Draw one bar chart of category counts per categorical column,
# each on its own figure.
for column in categoric_cols:
    sns.countplot(data=df, x=column)
    plt.show()

In [None]:
# Pie chart of the share of rows per country.
# The counts are computed once and reused for both the wedge sizes and labels.
country_counts = df['COUNTRY'].value_counts()
plt.figure(figsize=[16, 8])
plt.pie(x=country_counts, autopct="%.2f", labels=country_counts.index)
plt.title("Country distribution", fontsize=14)
# Render explicitly, consistent with the other plotting cells, instead of
# leaving the Text object returned by plt.title as the cell output.
plt.show()

In [None]:
# For every numeric column draw a histogram, a box plot and a KDE curve
# side by side on one figure.
# Columns are selected through the public `select_dtypes` API rather than the
# private `_get_numeric_data` helper ("bool" kept for parity with that helper).
for col_name in df.select_dtypes(include=["number", "bool"]).columns:
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
    values = df[col_name]

    # The series is passed as `x=` explicitly: a positional first argument is
    # interpreted differently across seaborn releases, which could flip the
    # box plot orientation depending on the installed version.
    sns.histplot(x=values, bins=10, ax=axes[0])
    sns.boxplot(x=values, ax=axes[1])
    sns.kdeplot(x=values, ax=axes[2])

    for axis in axes:
        axis.set_title(col_name)
    plt.show()

In [None]:
# Total revenue (sum of PRICE) per country.
print(df.groupby('COUNTRY')[['PRICE']].sum())

In [None]:
# Average PRICE per country.
print(df.groupby('COUNTRY')[['PRICE']].mean())

In [None]:
# Average PRICE per SOURCE.
print(df.groupby('SOURCE')[['PRICE']].mean())

In [None]:
# Average PRICE for every COUNTRY / SOURCE combination.
df.groupby(["COUNTRY", "SOURCE"])[["PRICE"]].mean()

In [None]:
# Average PRICE for every COUNTRY / SOURCE / SEX / AGE combination.
# NOTE(review): the original heading asked for "total gains", but the
# aggregation used here is a mean — confirm which one is intended.
df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])[["PRICE"]].mean()

In [None]:
# Same four-key breakdown, ordered from the highest to the lowest average PRICE.
agg_df = (
    df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])[["PRICE"]]
    .mean()
    .sort_values(by="PRICE", ascending=False)
)

In [None]:
# Preview the five rows with the highest average PRICE.
agg_df.head(n=5)

In [None]:
# Inspect the index: after the groupby above the four grouping keys
# (COUNTRY, SOURCE, SEX, AGE) live in the index rather than in columns.
agg_df.index

In [None]:
# Move the grouping keys out of the index and back into regular columns.
agg_df = agg_df.reset_index()

In [None]:
# Preview the first five rows now that the grouping keys are ordinary columns.
agg_df.head(n=5)

In [None]:
# Bucket AGE into labelled ranges and store the result as AGE_CAT.
# pd.cut intervals are right-inclusive by default, so every edge must be the
# *upper* bound of its label: (0, 18], (18, 23], (23, 30], (30, 40], (40, 70].
# The second edge used to be 24, which put 24-year-olds into "19_23" even
# though the next label is "24_30".
# Ages above 70 fall outside the last bin and become NaN.
age_edges = [0, 18, 23, 30, 40, 70]
age_labels = ["0_18", "19_23", "24_30", "31_40", "41_70"]
agg_df["AGE_CAT"] = pd.cut(agg_df["AGE"], bins=age_edges, labels=age_labels)

In [None]:
# Preview the first five rows, now including the AGE_CAT column.
agg_df.head(n=5)

In [None]:
# Build the level-based persona key "<COUNTRY>_<SOURCE>_<SEX>_<AGE_CAT>".
# The concatenation is vectorized over whole columns instead of looking up
# every cell with chained label indexing inside a Python loop; the resulting
# strings are the same. AGE_CAT is categorical, so it is cast to str first.
agg_df["customers_level_based"] = (
    agg_df["COUNTRY"]
    + "_"
    + agg_df["SOURCE"]
    + "_"
    + agg_df["SEX"]
    + "_"
    + agg_df["AGE_CAT"].astype(str)
)

In [None]:
# Preview the first five rows, now including the customers_level_based key.
agg_df.head(n=5)

In [None]:
# Count the rows per persona key. A count above 1 means several distinct AGE
# values were bucketed into the same AGE_CAT, so the key is not yet unique.
agg_df["customers_level_based"].value_counts()

In [None]:
# Average PRICE per persona key (display only; agg_df itself is not changed here).
agg_df.groupby("customers_level_based")[["PRICE"]].mean()

In [None]:
# Collapse agg_df to one row per persona key with its average PRICE,
# keeping the key as a regular column.
persona_means = agg_df.groupby("customers_level_based")[["PRICE"]].mean()
agg_df = persona_means.reset_index()

In [None]:
# Preview the first five personas with their average PRICE.
agg_df.head(n=5)

In [None]:
# Split the personas into four equal-sized groups by average PRICE.
# Labels are given from the lowest quartile ("D") to the highest ("A").
segment_labels = ["D", "C", "B", "A"]
agg_df["SEGMENT"] = pd.qcut(agg_df["PRICE"], q=4, labels=segment_labels)

In [None]:
# Preview the first five personas, now including their SEGMENT.
agg_df.head(n=5)

In [None]:
# Mean, max and sum of the average PRICE within each segment.
# SEGMENT is categorical, so `observed` is set explicitly: relying on its
# implicit default is deprecated in recent pandas. `False` keeps the existing
# behavior of listing every category, including any with no rows.
agg_df.groupby("SEGMENT", observed=False).agg({"PRICE": ["mean", "max", "sum"]})

# Prediction

###### Classify new customers according to their segments and estimate how much income they can generate.

In [None]:
# Example: which segment does a 35-year-old French woman using iOS belong to?
# Both sides of the comparison are lower-cased so the lookup does not depend
# on how the persona keys happen to be capitalised in the data.
new_user1 = "FRA_IOS_FEMALE_31_40"
# Original author's note: segment C — verify against the displayed row.
agg_df[agg_df["customers_level_based"].str.lower() == new_user1.lower()]

In [None]:
# Which segment does a 26-year-old American man using iOS belong to?
# Both sides of the comparison are lower-cased so the lookup does not depend
# on how the persona keys happen to be capitalised in the data.
new_user2 = "USA_IOS_MALE_24_30"
agg_df[agg_df["customers_level_based"].str.lower() == new_user2.lower()]

In [None]:
# Look up the persona row for the key below (Turkey / Android / female / 31-40).
# Both sides of the comparison are lower-cased so the lookup does not depend
# on how the persona keys happen to be capitalised in the data.
new_user3 = "TUR_ANDROID_FEMALE_31_40"
agg_df[agg_df["customers_level_based"].str.lower() == new_user3.lower()]