In [85]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [86]:
# Load the raw sales records (one row per sale) from the comma-separated file.
df = pd.read_csv("persona.csv", sep=",", low_memory=False)


In [87]:
# Peek at the first five sales records.
df.head(n=5)


Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [88]:
# Print the column dtypes, non-null counts and memory footprint of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   PRICE    5000 non-null   int64 
 1   SOURCE   5000 non-null   object
 2   SEX      5000 non-null   object
 3   COUNTRY  5000 non-null   object
 4   AGE      5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 234.4+ KB


In [89]:
# Peek at the last five sales records.
df.tail(n=5)

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
4995,29,android,female,bra,31
4996,29,android,female,bra,31
4997,29,android,female,bra,31
4998,39,android,female,bra,31
4999,29,android,female,bra,31


In [90]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()


Unnamed: 0,PRICE,AGE
count,5000.0,5000.0
mean,34.132,23.5814
std,12.464897,8.995908
min,9.0,15.0
25%,29.0,17.0
50%,39.0,21.0
75%,39.0,27.0
max,59.0,66.0


In [91]:
# (number of rows, number of columns) of the raw data.
df.shape


(5000, 5)

In [92]:
# Column labels of the raw data.
df.columns

Index(['PRICE', 'SOURCE', 'SEX', 'COUNTRY', 'AGE'], dtype='object')

In [93]:
# Row labels of the raw data.
df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            4990, 4991, 4992, 4993, 4994, 4995, 4996, 4997, 4998, 4999],
           dtype='int64', length=5000)

In [94]:
# Number of sales per SOURCE. value_counts() already lists every distinct
# SOURCE value, so the preceding unique() call -- whose result was evaluated
# and discarded, since only a cell's last expression is rendered -- is dropped.
df['SOURCE'].value_counts()

android    2974
ios        2026
Name: SOURCE, dtype: int64

In [95]:
# How many distinct PRICE values occur?
df['PRICE'].nunique()

6

In [96]:
# Number of sales at each PRICE point.
df['PRICE'].value_counts()

29    1305
39    1260
49    1031
19     992
59     212
9      200
Name: PRICE, dtype: int64

In [97]:
# Number of sales per COUNTRY.
df['COUNTRY'].value_counts()



usa    2065
bra    1496
deu     455
tur     451
fra     303
can     230
Name: COUNTRY, dtype: int64

In [98]:
# Question 6: How much was earned in total from sales in each country?

df.groupby('COUNTRY')[['PRICE']].sum()



Unnamed: 0_level_0,PRICE
COUNTRY,Unnamed: 1_level_1
bra,51354
can,7730
deu,15485
fra,10177
tur,15689
usa,70225


In [99]:
# Question 7: How many sales were made through each SOURCE type?

df['SOURCE'].value_counts()




android    2974
ios        2026
Name: SOURCE, dtype: int64

In [100]:
# Question 8: What is the average PRICE in each country?

df.groupby('COUNTRY')[['PRICE']].mean()



Unnamed: 0_level_0,PRICE
COUNTRY,Unnamed: 1_level_1
bra,34.32754
can,33.608696
deu,34.032967
fra,33.587459
tur,34.78714
usa,34.007264


In [101]:
# Question 9: What is the average PRICE for each SOURCE?

df.groupby('SOURCE')[['PRICE']].mean()



Unnamed: 0_level_0,PRICE
SOURCE,Unnamed: 1_level_1
android,34.174849
ios,34.069102


In [102]:
# Question 10: What is the average PRICE in the COUNTRY-SOURCE breakdown?
# (SOURCE is the outer grouping level, COUNTRY the inner one.)

df.groupby(['SOURCE', 'COUNTRY'])[['PRICE']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,PRICE
SOURCE,COUNTRY,Unnamed: 2_level_1
android,bra,34.387029
android,can,33.330709
android,deu,33.869888
android,fra,34.3125
android,tur,36.229437
android,usa,33.760357
ios,bra,34.222222
ios,can,33.951456
ios,deu,34.268817
ios,fra,32.776224


In [103]:
# Average PRICE for every COUNTRY / SOURCE / SEX / AGE combination.
# The four grouping keys end up as the levels of the result's index.
agg_df = df.groupby(['COUNTRY', 'SOURCE', 'SEX', 'AGE'])[['PRICE']].mean()

In [104]:

# A PRICE-sorted view and the column count used to be evaluated here and
# thrown away: sort_values() returns a new frame that was never assigned, and
# only a cell's last expression is rendered. Both dead statements are removed.
# Row order does not affect the later steps, which bin AGE row by row and then
# regroup the rows by persona.

# Turn the COUNTRY / SOURCE / SEX / AGE index levels back into regular columns.
agg_df = agg_df.reset_index()
agg_df.columns

Index(['COUNTRY', 'SOURCE', 'SEX', 'AGE', 'PRICE'], dtype='object')

In [105]:
# Bucket customer ages into five brackets. The top bracket is open-ended and
# closes at the oldest age present in the data.
max_age = agg_df['AGE'].max()  # computed once; used for the last bin edge and its label

# pd.cut requires increasing, unique bin edges, so the oldest age must exceed 40.
if max_age <= 40:
    raise ValueError(
        f"Oldest AGE is {max_age}; it must be greater than 40 to close the last age bin."
    )

my_bins = [0, 18, 23, 30, 40, max_age]
mylabels = ['0_18', '19_23', '24_30', '31_40', '41_' + str(max_age)]

# Bins are right-closed -- (0, 18], (18, 23], ... -- so age 18 falls in '0_18'
# and age 19 in '19_23'.
agg_df['AGE_CAT'] = pd.cut(agg_df['AGE'], bins=my_bins, labels=mylabels)
agg_df.head()



Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CAT
0,bra,android,female,15,38.714286,0_18
1,bra,android,female,16,35.944444,0_18
2,bra,android,female,17,35.666667,0_18
3,bra,android,female,18,32.255814,0_18
4,bra,android,female,19,35.206897,19_23


In [106]:
# Raw array of the columns that remain once AGE and PRICE are left out.
agg_df.drop(columns=['AGE', 'PRICE']).values



array([['bra', 'android', 'female', '0_18'],
       ['bra', 'android', 'female', '0_18'],
       ['bra', 'android', 'female', '0_18'],
       ...,
       ['usa', 'ios', 'male', '41_66'],
       ['usa', 'ios', 'male', '41_66'],
       ['usa', 'ios', 'male', '41_66']], dtype=object)

In [107]:

# str.join demo: glue the items of a list together with a separator.
liste = list('ABC')
'-'.join(liste)


'A-B-C'

In [108]:
# Build the persona key (e.g. BRA_ANDROID_FEMALE_0_18) from explicitly named
# columns. Joining "every column except AGE and PRICE" made this cell
# non-idempotent: on a second run the freshly added CUSTOMERS_LEVEL_BASED
# column would itself be joined into the key.
level_columns = ['COUNTRY', 'SOURCE', 'SEX', 'AGE_CAT']
agg_df["CUSTOMERS_LEVEL_BASED"] = [
    "_".join(levels).upper() for levels in agg_df[level_columns].values
]
agg_df



Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CAT,CUSTOMERS_LEVEL_BASED
0,bra,android,female,15,38.714286,0_18,BRA_ANDROID_FEMALE_0_18
1,bra,android,female,16,35.944444,0_18,BRA_ANDROID_FEMALE_0_18
2,bra,android,female,17,35.666667,0_18,BRA_ANDROID_FEMALE_0_18
3,bra,android,female,18,32.255814,0_18,BRA_ANDROID_FEMALE_0_18
4,bra,android,female,19,35.206897,19_23,BRA_ANDROID_FEMALE_19_23
...,...,...,...,...,...,...,...
343,usa,ios,male,42,30.250000,41_66,USA_IOS_MALE_41_66
344,usa,ios,male,50,39.000000,41_66,USA_IOS_MALE_41_66
345,usa,ios,male,53,34.000000,41_66,USA_IOS_MALE_41_66
346,usa,ios,male,55,29.000000,41_66,USA_IOS_MALE_41_66


In [109]:


# Drop the columns that are no longer needed: keep only the persona key and
# PRICE, then collapse the rows sharing a persona (one row per distinct AGE up
# to this point) into a single row each. The agg_df.head() call whose result
# was evaluated and discarded here has been removed.
# NOTE(review): PRICE is already a per-(COUNTRY, SOURCE, SEX, AGE) mean at this
# point, so the result is an unweighted mean of means, not the mean over the
# individual sales -- confirm that this is the intended metric.
agg_df = (
    agg_df[['CUSTOMERS_LEVEL_BASED', 'PRICE']]
    .groupby('CUSTOMERS_LEVEL_BASED')['PRICE']
    .mean()
    .reset_index()
)

###










In [110]:



#############################################
# TASK 7: Split the new customers (USA_ANDROID_MALE_0_18) into segments.

# Quartile-based segmentation on each persona's average PRICE: the lowest
# quarter is labelled D and the highest A. A stray list literal that was
# evaluated and discarded at this point has been removed.
agg_df['SEGMENT'] = pd.qcut(agg_df['PRICE'], q=4, labels=['D', 'C', 'B', 'A'])
agg_df.head()




Unnamed: 0,CUSTOMERS_LEVEL_BASED,PRICE,SEGMENT
0,BRA_ANDROID_FEMALE_0_18,35.645303,B
1,BRA_ANDROID_FEMALE_19_23,34.07734,C
2,BRA_ANDROID_FEMALE_24_30,33.863946,C
3,BRA_ANDROID_FEMALE_31_40,34.898326,B
4,BRA_ANDROID_FEMALE_41_66,36.737179,A


In [111]:

# Average PRICE of the personas inside each segment.
agg_df.groupby('SEGMENT')[['PRICE']].mean().reset_index()


Unnamed: 0,SEGMENT,PRICE
0,D,29.20678
1,C,33.509674
2,B,34.999645
3,A,38.691234


In [115]:

# Look up the segment and expected average revenue of specific personas.
# A notebook cell renders only its last expression, so the lookups below were
# previously computed and silently discarded; each one is now shown explicitly
# with display() (available as a builtin inside IPython/Jupyter kernels).

new_user = 'TUR_ANDROID_FEMALE_31_40'
display(agg_df[agg_df['CUSTOMERS_LEVEL_BASED'] == new_user])

# Which segment does a 35-year-old French woman using iOS belong to, and how
# much revenue is she expected to bring in on average?
new_user = 'FRA_IOS_FEMALE_31_40'
display(agg_df[agg_df['CUSTOMERS_LEVEL_BASED'] == new_user])
display(agg_df[agg_df['CUSTOMERS_LEVEL_BASED'] == 'BRA_ANDROID_FEMALE_0_18'])

# Linear correlation between PRICE and AGE in the raw sales data.
df[['PRICE', 'AGE']].corr()

Unnamed: 0,PRICE,AGE
PRICE,1.0,-0.010202
AGE,-0.010202,1.0
