<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_09_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Raw CSV location, split across two literals to keep the line short
url = ('https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/'
       'Womens_Clothing_Reviews.csv')

# Read the CSV at that URL into a DataFrame
data = pd.read_csv(url)

In [0]:
# [0]

# dropna: keep only the rows in which every one of these columns is present
required_cols = ['Seq', 'ClothingID', 'Age', 'TitleLength',
                 'ReviewTextLength', 'Rating']
base = data.dropna(subset=required_cols)

# get age category
def getAgeCt(x):
    """Map a single age value to a one-letter age bucket.

    Returns 'A' for [20, 30), 'B' for [30, 40), 'C' for [40, 50) and
    'D' for [50, 60). Every other value (below 20, 60 or above, or NaN)
    falls through to 'E'.
    """
    # each bucket spans ten years starting at its lower bound
    for lower, label in ((20, 'A'), (30, 'B'), (40, 'C'), (50, 'D')):
        if lower <= x < lower + 10:
            return label
    return 'E'

# add both derived columns in one step:
#   ageCD   - age bucket letter produced by getAgeCt
#   Pos_Neg - rating category: 1 when Rating is greater than 4, otherwise 0
base = base.assign(
    ageCD=base.Age.apply(getAgeCt),
    Pos_Neg=base.Rating.gt(4).astype('int64'),
)

In [97]:
# [1] coefficient of variation

# per-department mean and standard deviation of Rating
rating_by_dept = base.groupby('DepartmentName')
dept_mean = rating_by_dept.agg({'Rating': 'mean'})
dept_std = rating_by_dept.agg({'Rating': 'std'})

# put both statistics side by side as Rating_mean / Rating_std
dept = dept_mean.join(dept_std, how='inner', lsuffix='_mean', rsuffix='_std')

# coefficient of variation = std / mean
dept = dept.assign(CV=dept.Rating_std / dept.Rating_mean)

# after a descending sort the largest CV sits in the first row
dept = dept.sort_values(by='CV', ascending=False)
top_dept, top_CV = dept.index[0], dept.CV.iloc[0]

print(f'Answer [1] : {top_dept}, {top_CV:.2f}')

Answer [1] : Trend, 0.35


In [98]:
# [2]

import math

# Per ClothingID: number of ratings ('Rating') and number of positive ratings
# ('Pos_Neg'); keep items with at least 30 ratings, add the positive rate PR
# and order the items from the highest PR to the lowest.
data2 = (
    base.groupby('ClothingID')
    .agg({'Rating': 'count', 'Pos_Neg': 'sum'})
    .loc[lambda counts: counts.Rating >= 30]
    .assign(PR=lambda counts: counts.Pos_Neg / counts.Rating)
    .sort_values(by='PR', ascending=False)
)

# position 1 is the item with the 2nd highest positive rate
top2_clothing, top2_PR = data2.index[1], data2.PR.iloc[1]

print(f'Answer [2] : {top2_clothing:.0f}, {math.floor(top2_PR * 1000) / 1000}')

Answer [2] : 964, 0.794


In [99]:
# [3] ANOVA test

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import math

# fit Rating against the categorical ageCD factor and build the ANOVA table
ols_reg = ols(formula='Rating ~ C(ageCD)', data=base).fit()
anova_table = anova_lm(ols_reg)

# the test result is read from the first row of the table
first_row = anova_table.iloc[0]
p_val = first_row['PR(>F)']
t_val = first_row['F']  # taken from the 'F' column despite the name

# 5% significance level
reject = ('Reject H0, so MUs are not same' if p_val < 0.05
          else 'Cannot reject H0, so MUs are same')

print(f'Answer [3] : {reject}, {math.floor(t_val * 10000) / 10000}')

Answer [3] : Reject H0, so MUs are not same, 8.4989


In [124]:
# [4] LogisticRegression, get_dummies

from sklearn.linear_model import LogisticRegression

# exclude ClassName == '0'
data4 = base[base.ClassName != '0']

# get dummy variables
# `columns` is given as a list rather than a set: get_dummies selects the
# columns to encode by indexing the frame with this value, and newer pandas
# releases no longer accept a set as an indexer.
data4 = pd.get_dummies(data4, columns=['ageCD'], drop_first=True)

# get previous rating
# Seq -> Rating lookup built once from data4 (the first row wins when a Seq
# value repeats), so each getPrevR call is a dictionary lookup instead of a
# scan over the whole frame.
# NOTE: this is a snapshot; rebuild it if data4 changes before getPrevR is
# called again.
_rating_by_seq = (data4.drop_duplicates(subset='Seq')
                  .set_index('Seq')['Rating']
                  .to_dict())


def getPrevR(x):
    """Return the Rating of the row whose Seq equals ``x - 1``.

    When data4 holds no row with Seq == x - 1, the Rating of the row with
    Seq == x is returned instead.

    Raises:
        KeyError: if neither ``x - 1`` nor ``x`` is a Seq value in data4.
    """
    previous_seq = x - 1
    if previous_seq in _rating_by_seq:
        return _rating_by_seq[previous_seq]
    return _rating_by_seq[x]


data4 = data4.assign(prior_Rating=data4.Seq.apply(getPrevR))

# prepare training and test set
# features are chosen by position: columns 3-5 plus everything from column 10
# NOTE(review): this depends on the exact column order of data4 - confirm
X_cols = [*data4.columns[3:6], *data4.columns[10:]]

# 'Intimates' rows form the test set, all remaining rows the training set
is_intimates = data4.ClassName == 'Intimates'
train_X = data4.loc[~is_intimates, X_cols]
train_y = data4.loc[~is_intimates, 'RecommendedIND']
test_X = data4.loc[is_intimates, X_cols]
test_y = data4.loc[is_intimates, 'RecommendedIND']

# train: L2-penalised logistic regression, newton-cg solver, fixed seed
lr = LogisticRegression(penalty='l2', C=100000, solver='newton-cg',
                        random_state=1234)
model = lr.fit(train_X, train_y)

# predict: class probabilities for every test row
pred = model.predict_proba(test_X)

# get accuracy
# the probability columns are labelled N / Y
# NOTE(review): assumes model.classes_ is ['N', 'Y'] in that order - confirm
pred_df = pd.DataFrame(pred, columns=['N', 'Y'])

# predicted label: 'Y' when the Y probability is at least 0.5, else 'N'
pred_df = pred_df.assign(
    esti=pred_df.Y.ge(0.5).map({True: 'Y', False: 'N'}))

# renumber the true labels from 0 so they line up with pred_df row by row
test_y_df = test_y.to_frame().reset_index()
pred_df = pred_df.join(test_y_df.RecommendedIND, how='inner')

# 1 where the estimate matches the true label, 0 otherwise
pred_df = pred_df.assign(
    accuracy=(pred_df.esti == pred_df.RecommendedIND).astype('int64'))
accuracy = pred_df.accuracy.sum() / pred_df.accuracy.count()

print(f'Answer [4] : {accuracy:.3f}')

Answer [4] : 0.946
