In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Raw GitHub URL of the review data set (CSV).
# NOTE(review): the file name is spelled "Clkothing" -- confirm it matches the
# file actually stored in the repository before "correcting" it.
url_clothing = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/Womens_Clkothing_Reviews.csv'
# Download and parse the CSV into a DataFrame (requires network access).
data_clothing = pd.read_csv(url_clothing)

### [0]

In [3]:
# Keep only the rows that have a value in every one of the first six columns.
required_cols = data_clothing.columns[:6]
base = data_clothing.dropna(subset=required_cols)

In [4]:
def _age_code(age):
    """Return the age-band code for ``age``.

    'A' = [20, 30), 'B' = [30, 40), 'C' = [40, 50), 'D' = [50, 60);
    every other value (below 20, 60 and above, or NaN) maps to 'E'.
    """
    if 20 <= age < 30:
        return 'A'
    if 30 <= age < 40:
        return 'B'
    if 40 <= age < 50:
        return 'C'
    if 50 <= age < 60:
        return 'D'
    return 'E'


# Add the age-band code as a new column derived from Age.
base = base.assign(ageCD=base['Age'].apply(_age_code))

In [5]:
# Flag a review as positive (1) when its Rating is strictly above 4, else 0.
# NOTE(review): the strict '>' means a Rating of exactly 4 counts as 0 --
# confirm this is the intended cut-off.
base = base.assign(Pos_Neg=(base['Rating'] > 4).astype('int64'))

### [1]

In [6]:
# Section [1] works on complete cases only: drop every row with any missing value.
d1 = base.dropna(axis=0, how='any')

In [7]:
# Per-department standard deviation and mean of Rating, each as a one-column
# frame whose column is still named 'Rating'.
rating_by_dep = d1.groupby('DepartmentName')[['Rating']]
dep_std = rating_by_dep.std()
dep_mean = rating_by_dep.mean()

In [8]:
# Put both statistics side by side, one row per department, with the columns
# named after the statistic they hold.
dep = dep_std.rename(columns={'Rating': 'std'}).join(
    dep_mean.rename(columns={'Rating': 'mean'}))

In [9]:
# Coefficient of variation per department: standard deviation divided by mean.
dep = dep.assign(CV=lambda frame: frame['std'].div(frame['mean']))

In [10]:
# Order departments from the largest CV to the smallest.
dep = dep.sort_values('CV', ascending=False)

In [11]:
# After the descending sort the first row is the department with the largest
# CV; read the value by column label rather than by column position.
top_dev = dep.index[0]
cv = dep['CV'].iloc[0]

print(f"{top_dev}, {cv:.2f}")

Trend, 0.35


### [2]

In [13]:
# Per ClothingID: number of non-null Seq values and the sum of the 0/1
# Pos_Neg flag.
d2 = base.groupby('ClothingID').agg(Seq=('Seq', 'count'),
                                    Pos_Neg=('Pos_Neg', 'sum'))

In [15]:
# Keep only the clothing items with at least 30 counted reviews.
d2 = d2.query('Seq >= 30')

In [17]:
# Share of positive reviews per item: positive count divided by review count.
d2 = d2.assign(PosRatio=lambda frame: frame['Pos_Neg'].div(frame['Seq']))

In [19]:
# Order items from the highest positive ratio to the lowest.
d2 = d2.sort_values('PosRatio', ascending=False)

In [30]:
# Pick the item in the SECOND row of the ranking (position 1) and its ratio.
# NOTE(review): position 1 skips the top-ranked item -- confirm that the
# runner-up, not the best item, is what the question asks for.
cloth = d2.index[1]
posrate = d2.at[cloth, 'PosRatio']

In [32]:
# Report the ratio truncated (floored), not rounded, to three decimals.
posrate_trunc = math.floor(posrate * 1000) / 1000
print(f"{cloth:.0f}, {posrate_trunc:.3f}")

964, 0.794


### [3]

In [42]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Model Rating as a function of the age-band code, treated as categorical.
formula = 'Rating ~ C(ageCD)'

# Fit the linear model on `base` and build the ANOVA table from the fit.
result = ols(formula=formula, data=base).fit()
anova_table = anova_lm(result)

# Last expression of the cell: display the ANOVA table.
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(ageCD),4.0,41.74282,10.435705,8.49895,7.553928e-07
Residual,22083.0,27115.310421,1.227882,,


In [46]:
# Pull the F statistic and its p-value for the ageCD factor out of the table.
f_val = anova_table.loc['C(ageCD)', 'F']
p_val = anova_table.loc['C(ageCD)', 'PR(>F)']

# Report the F statistic truncated (floored), not rounded, to four decimals.
print(f"f-value: {math.floor(f_val * 10000) / 10000:.4f}")

# Derive the conclusion from the computed p-value instead of hard-coding it,
# so the printed text cannot contradict the data at the 0.05 level.
if p_val < 0.05:
    print("p-value is less than 0.05 so H0 is rejected,")
    print("meaning Rating is Different by ageCD")
else:
    print("p-value is not less than 0.05 so H0 is not rejected,")
    print("meaning Rating is not shown to differ by ageCD")

f-value: 8.4989
p-value is less than 0.05 so H0 is rejected,
meanding Rating is Different by ageCD


### [4]

In [121]:
# Drop rows whose ClassName is the literal string '0', then one-hot encode
# ageCD, omitting its first level.
class_not_zero = base['ClassName'].ne('0')
d4 = pd.get_dummies(base[class_not_zero], columns=['ageCD'], drop_first=True)

def getPrior(seq):
    """Return the Rating of the review that precedes ``seq``.

    Looks in the module-level frame ``d4`` for rows whose Seq equals
    ``seq - 1`` and returns the Rating of the first one. When no such row
    exists, returns the Rating of the first row whose Seq equals ``seq``.
    """
    prior_ratings = d4.loc[d4['Seq'] == seq - 1, 'Rating']
    if len(prior_ratings) > 0:
        return prior_ratings.iloc[0]
    # No predecessor in d4: fall back to this review's own rating.
    return d4.loc[d4['Seq'] == seq, 'Rating'].iloc[0]

# prior_Rating = Rating of the first row whose Seq is exactly one less than
# this row's Seq; when no such row exists, the Rating of the first row with
# the same Seq. The Seq -> Rating table is built once, so each row costs one
# dictionary lookup instead of a scan of the whole frame per row (which made
# this step quadratic in the number of rows).
first_rating_by_seq = (d4.drop_duplicates(subset='Seq')  # keeps first match
                         .set_index('Seq')['Rating']
                         .to_dict())

d4 = d4.assign(prior_Rating=d4['Seq'].apply(
    lambda seq: first_rating_by_seq[seq - 1]
    if (seq - 1) in first_rating_by_seq
    else first_rating_by_seq[seq]))

In [122]:
# Hold out every 'Intimates' row as the test set; train on all other rows.
is_intimates = d4['ClassName'] == 'Intimates'
train = d4[~is_intimates]
test = d4[is_intimates]

# Feature columns are chosen by POSITION: columns 3..5 plus the last five.
# NOTE(review): presumably the last five are the ageCD dummies and
# prior_Rating -- confirm, since any change in column order alters the
# feature set silently.
X_var = [*d4.columns[3:6], *d4.columns[-5:]]

In [126]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression of RecommendedIND on the selected features.
model = LogisticRegression(penalty='l2', C=100000, solver='newton-cg',
                           random_state=1234)
model.fit(train[X_var], train['RecommendedIND'])

# Class probabilities for the test rows; column 1 is the second class.
# NOTE(review): labelling by 'Y'/'N' presumes RecommendedIND holds those two
# strings and that 'Y' is the second class -- confirm against model.classes_.
proba = pd.DataFrame(model.predict_proba(test[X_var]))
result = proba.assign(Reco=np.where(proba[1] >= 0.5, 'Y', 'N'))

In [133]:
# Renumber the true labels 0..n-1 (the old index is kept as an 'index'
# column) so they line up row by row with the predictions, then attach them.
test_y = test['RecommendedIND'].reset_index()
result = result.join(test_y)

In [135]:
# 1 where the predicted label equals the true label, 0 otherwise.
result = result.assign(
    accu=result['Reco'].eq(result['RecommendedIND']).astype('int64'))

In [138]:
# Accuracy is the fraction of test rows predicted correctly, i.e. the mean of
# the 0/1 hit flag.
accuracy = result['accu'].mean()
print(f"{accuracy:.3f}")

0.946


In [81]:
#123456789#123456789#123456789#123456789#123456789#123456789#123456789#123456789

In [None]:
#123456789#123456789#123456789#123456789#123456789#123456789#123456789#123456789