# Inferential Statistics

In [1]:
# data manipulation
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
sns.set_palette('muted')

# statistics
from scipy import stats

In [2]:
# load the cleaned review data from a CSV in the working directory
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the file's saved row index is being read as data - consider
# index_col=0 if nothing downstream relies on that column
reviews_df = pd.read_csv('basenotes_reviews_df_clean.csv')

In [3]:
# preview the first rows to check which columns were loaded
reviews_df.head()

Unnamed: 0,rating_value,review_id,user_id,user_name,user_location,scent_id,scent_name,review_text,scent_brand
0,2.0,232695,13378905,Sniffers,United States,26133451.0,Cendres de Thé,On opening I get a cardamom smelling tea with ...,Phaedon
1,1.0,232691,1361,drseid,United States,26158904.0,Salted Green Mango,Salted Green Mango opens with subdued bitter o...,Strangers Parfumerie
2,2.0,232688,26195006,speedracer,United States,26161313.0,Nuit d'Issey Polaris,"Spicy-warm vanilla, green cypress and slightly...",Issey Miyake
3,2.0,232682,13381457,rbaker,United Kingdom,26151148.0,Garden Lilies,"The lily is present form the start, quite nice...",Jo Malone London
4,2.0,232676,28472,rogalal,United States,26135345.0,Santa Subita,"After a shock of mint quickly passes, this mos...",Technique Indiscrete


## 1. Username

In [4]:
def t_test(data, column, value):
    '''Return the p-value of a two-sample t-test on ratings with vs. without `value`.

    The rows of `data` are split into those where `data[column] == value` and
    those where `data[column] != value`; the `rating_value` entries of the two
    groups are compared with `scipy.stats.ttest_ind` using its default settings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `column` and a `rating_value` column.
    column : str
        Name of the column used to split the rows.
    value : object
        Value of `column` that defines the first group.

    Returns
    -------
    float
        The p-value of the test. Only the p-value is returned; the t statistic
        is discarded. The result can still be NaN when a group is too small
        for the test to be computed.
    '''
    # Missing ratings are dropped before testing: a single NaN would otherwise
    # turn the p-value into NaN, and `nan < alpha` is silently False downstream.
    has = data[data[column] == value]['rating_value'].dropna()
    has_not = data[data[column] != value]['rating_value'].dropna()
    _, p_value = stats.ttest_ind(has, has_not)
    return p_value

In [7]:
# significance threshold for the per-username tests
alpha = 0.05

# number of reviews written under each username
user_name_counts = reviews_df['user_name'].value_counts()

# keep only usernames that appear at least 30 times
user_names_30 = user_name_counts.loc[user_name_counts.ge(30)].index

# p-value of a t-test comparing each user's ratings with everyone else's
pvalue_user_names = [
    (name, t_test(reviews_df, 'user_name', name))
    for name in user_names_30
]

# (username, p-value) pairs whose p-value is below alpha
sig_user_names = [
    (name, float(p_value))
    for name, p_value in pvalue_user_names
    if float(p_value) < alpha
]

In [9]:
# number of usernames whose t-test p-value fell below alpha
len(sig_user_names)

415

In [10]:
def sig_avg_diff(data, column, value):
    '''Mean rating of rows where `column` equals `value` minus the mean rating of all other rows.

    A positive result means the rows matching `value` have a higher mean
    `rating_value` than the remaining rows; a negative result means lower.
    '''
    ratings = data['rating_value']
    matches = data[column] == value
    others = data[column] != value
    return np.mean(ratings[matches]) - np.mean(ratings[others])

In [11]:
# keep just the usernames; their p-values are not needed from here on
sig_user_names_list = [name for name, _ in sig_user_names]

# difference in mean rating (this user vs. all other users) per significant username
username_avg_diff = [
    (name, sig_avg_diff(reviews_df, 'user_name', name))
    for name in sig_user_names_list
]

# collect the differences into a two-column dataframe
username_avg_diff_df = pd.DataFrame(
    data=username_avg_diff,
    columns=['user_name', 'sig_avg_diff'],
)

In [12]:
# 10 usernames whose mean rating is furthest above everyone else's
top_user_names_avg_diff = (
    username_avg_diff_df
    .sort_values(by='sig_avg_diff', ascending=False)
    .head(10)
)
print(f"10 users with largest increase in significant avg. difference of rating:\n{top_user_names_avg_diff}")

# 10 usernames whose mean rating is furthest below everyone else's
bottom_user_names_avg_diff = (
    username_avg_diff_df
    .sort_values(by='sig_avg_diff')
    .head(10)
)
print(f"\n10 users with largest decrease in significant avg. difference of rating:\n{bottom_user_names_avg_diff}")

10 users with largest increase in significant avg. difference of rating:
              user_name  sig_avg_diff
297          Oslo-Fjord      0.497505
320             Shahram      0.497493
338        Indagnacious      0.497483
395            prince64      0.497461
399           cologne65      0.497458
43   Redneck Perfumisto      0.487950
295                 wtb      0.475760
193            ericrico      0.471272
364            milamber      0.469689
59           RUDOLFO512      0.469518

10 users with largest decrease in significant avg. difference of rating:
                 user_name  sig_avg_diff
220            photofinish     -1.253150
191          JennieJenJamz     -0.918674
294                 everso     -0.758186
209  Angelo Orazio Pregoni     -0.752966
282                duncanw     -0.742876
254       Allen-on-Holiday     -0.721074
232               calnadur     -0.719581
405     noideawhatimsaying     -0.696326
319            Bo Darville     -0.693301
219            Jay-Lux'ea

## 2. User Country

In [25]:
# significance threshold for the per-country tests
alpha = 0.05

# number of reviews written from each country
country_counts = reviews_df['user_location'].value_counts()

# keep only countries that appear at least 30 times
country_30 = country_counts.loc[country_counts.ge(30)].index

# p-value of a t-test comparing each country's ratings with all other ratings
pvalue_country = [
    (country, t_test(reviews_df, 'user_location', country))
    for country in country_30
]

# (country, p-value) pairs whose p-value is below alpha
sig_country = [
    (country, float(p_value))
    for country, p_value in pvalue_country
    if float(p_value) < alpha
]

In [28]:
# number of countries whose t-test p-value fell below alpha
len(sig_country)

37

In [30]:
# keep just the country names; their p-values are not needed from here on
sig_country_list = [country for country, _ in sig_country]

# difference in mean rating (this country vs. all others) per significant country
country_avg_diff = [
    (country, sig_avg_diff(reviews_df, 'user_location', country))
    for country in sig_country_list
]

# collect the differences into a two-column dataframe
country_avg_diff_df = pd.DataFrame(
    data=country_avg_diff,
    columns=['user_location', 'sig_avg_diff'],
)

In [31]:
# 10 countries whose mean rating is furthest above the rest
top_country_avg_diff = (
    country_avg_diff_df
    .sort_values(by='sig_avg_diff', ascending=False)
    .head(10)
)
print(f"10 countries with largest increase in significant avg. difference of rating:\n{top_country_avg_diff}")

# 10 countries whose mean rating is furthest below the rest
bottom_country_avg_diff = (
    country_avg_diff_df
    .sort_values(by='sig_avg_diff')
    .head(10)
)
print(f"\n10 country with largest decrease in significant avg. difference of rating:\n{bottom_country_avg_diff}")

10 countries with largest increase in significant avg. difference of rating:
   user_location  sig_avg_diff
35        Guyana      0.309923
32         China      0.273596
27       Denmark      0.216525
33      Slovakia      0.205137
9        Nigeria      0.192611
26     Lithuania      0.190755
24     Argentina      0.151406
29     Hong Kong      0.141015
17   New Zealand      0.136561
11         Spain      0.126664

10 country with largest decrease in significant avg. difference of rating:
   user_location  sig_avg_diff
31        Jordan     -0.374621
36        Kuwait     -0.336038
18          Cuba     -0.319398
28       Morocco     -0.261173
34  Saudi Arabia     -0.252743
25       Austria     -0.220089
4    Netherlands     -0.206838
30        Cyprus     -0.180901
10        France     -0.172943
20        Norway     -0.145056


## 3. Scents

In [32]:
# significance threshold for the per-scent tests
alpha = 0.05

# number of reviews written for each scent name
scent_counts = reviews_df['scent_name'].value_counts()

# keep only scents that appear at least 30 times
scent_30 = scent_counts.loc[scent_counts.ge(30)].index

# p-value of a t-test comparing each scent's ratings with all other ratings
pvalue_scent = [
    (scent, t_test(reviews_df, 'scent_name', scent))
    for scent in scent_30
]

# (scent, p-value) pairs whose p-value is below alpha
sig_scent = [
    (scent, float(p_value))
    for scent, p_value in pvalue_scent
    if float(p_value) < alpha
]

In [33]:
# number of scents whose t-test p-value fell below alpha
len(sig_scent)

407

In [35]:
# keep just the scent names; their p-values are not needed from here on
sig_scent_list = [scent for scent, _ in sig_scent]

# difference in mean rating (this scent vs. all others) per significant scent
scent_avg_diff = [
    (scent, sig_avg_diff(reviews_df, 'scent_name', scent))
    for scent in sig_scent_list
]

# collect the differences into a two-column dataframe
scent_avg_diff_df = pd.DataFrame(
    data=scent_avg_diff,
    columns=['scent_name', 'sig_avg_diff'],
)

In [37]:
# 10 scents whose mean rating is furthest above the rest
top_scent_avg_diff = scent_avg_diff_df.sort_values('sig_avg_diff', ascending=False)[:10]
# print the frame built on the line above; the previous name
# `top_scent_names_avg_diff` is never defined in this notebook and raises
# NameError on a fresh kernel
print(f"10 scents with largest increase in significant avg. difference of rating:\n{top_scent_avg_diff}")

# 10 scents whose mean rating is furthest below the rest
bottom_scent_avg_diff = scent_avg_diff_df.sort_values('sig_avg_diff', ascending=True)[:10]
print(f"\n10 scents with largest decrease in significant avg. difference of rating:\n{bottom_scent_avg_diff}")

10 scents with largest increase in significant avg. difference of rating:
                                      scent_name  sig_avg_diff
372                                 Agua Lavanda      0.497461
397                          Moschino pour Homme      0.497455
149                         Bois des Îles Parfum      0.427130
341                                   Polo Crest      0.411737
168                              Italian Cypress      0.402283
284                                         Nemo      0.402229
304                     Gianfranco Ferré for Man      0.397461
339  parfums*PARFUMS Series 3 Incense: Jaisalmer      0.383159
350               Fendi (original) / Fendi Donna      0.379795
357                                       Ténéré      0.379795

10 scents with largest decrease in significant avg. difference of rating:
                              scent_name  sig_avg_diff
363                              cK Free     -1.078624
398                            Hot Water     -0.

## 4. Brands

In [39]:
# significance threshold for the per-brand tests
alpha = 0.05

# number of reviews written for each brand
brand_counts = reviews_df['scent_brand'].value_counts()

# keep only brands that appear at least 30 times
brand_30 = brand_counts.loc[brand_counts.ge(30)].index

# p-value of a t-test comparing each brand's ratings with all other ratings
pvalue_brand = [
    (brand, t_test(reviews_df, 'scent_brand', brand))
    for brand in brand_30
]

# (brand, p-value) pairs whose p-value is below alpha
sig_brand = [
    (brand, float(p_value))
    for brand, p_value in pvalue_brand
    if float(p_value) < alpha
]

In [40]:
# number of brands whose t-test p-value fell below alpha
len(sig_brand)

217

In [21]:
# make list of significant brands
# `sig_brand` is the (brand, p-value) list built by the brand t-test cell; the
# name `sig_scent_brands` used here before is not defined anywhere in the
# notebook and only worked from stale kernel state
sig_scent_brands_list = [brand for brand, _ in sig_brand]

# difference in mean rating (this brand vs. all others) per significant brand
scent_brands_avg_diff = [
    (brand, sig_avg_diff(reviews_df, 'scent_brand', brand))
    for brand in sig_scent_brands_list
]

# create dataframe of significant avg. difference
scent_brands_avg_diff_df = pd.DataFrame(scent_brands_avg_diff, columns=['scent_brand', 'sig_avg_diff'])

In [22]:
# 10 brands whose mean rating is furthest above the rest
top_scent_brands_avg_diff = (
    scent_brands_avg_diff_df
    .sort_values(by='sig_avg_diff', ascending=False)
    .head(10)
)
print(f"10 brands with largest increase in significant avg. difference of rating:\n{top_scent_brands_avg_diff}")

# 10 brands whose mean rating is furthest below the rest
bottom_scent_brands_avg_diff = (
    scent_brands_avg_diff_df
    .sort_values(by='sig_avg_diff')
    .head(10)
)
print(f"\n10 brands with largest decrease in significant avg. difference of rating:\n{bottom_scent_brands_avg_diff}")

10 brands with largest increase in significant avg. difference of rating:
              scent_brand  sig_avg_diff
131          Sultan Pasha      0.399580
197        Fort and Manlé      0.392191
173              Révillon      0.382101
210  Fragrances of France      0.372436
120     Long Lost Perfume      0.367566
192     Fragrance Du Bois      0.351110
175               Sospiro      0.340607
141         Areej le Doré      0.339629
159           Arabian Oud      0.338765
177       Parfums Vintage      0.337467

10 brands with largest decrease in significant avg. difference of rating:
                  scent_brand  sig_avg_diff
148               Exceptional     -0.760121
185                   Porsche     -0.609205
166                 Illuminum     -0.539863
188          Pierre Guillaume     -0.480555
201  People of the Labyrinths     -0.474174
212                  Candie's     -0.470475
138                     Usher     -0.467599
183                     Wings     -0.460224
194        Chri