In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from skmultilearn.problem_transform import BinaryRelevance
from xgboost import XGBClassifier

In [2]:
def age_bin(data):
    """Replace the numeric ``age`` column with a categorical ``age_bin`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``age`` column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``age`` and with an ``age_bin`` column holding one
        of the labels in ``age_group``. Rows whose age matches none of the
        ranges (negative, NaN, or a fractional value that falls between two
        integer bounds such as 5.5) receive the label ``'0'``.
    """
    age_group = ['balita', 'kanak-kanak', 'remaja awal', 'remaja akhir', 'dewasa awal', 'dewasa akhir', 'lansia awal', 'lansia akhir', 'manula']
    # Inclusive (lower, upper) bounds in the same order as the first eight
    # labels; the last label ('manula') is open-ended and handled below.
    bounds = [(0, 5), (6, 11), (12, 16), (17, 25), (26, 35), (36, 45), (46, 55), (56, 65)]

    age = data['age']
    age_range = [(age >= low) & (age <= high) for low, high in bounds]
    age_range.append(age > 65)

    # np.select's implicit default is the integer 0, which has no common dtype
    # with the string labels (recent NumPy is believed to raise TypeError for
    # that mix — verify against the installed version). Passing the string '0'
    # keeps the result a plain string array on every version.
    labels = np.select(age_range, age_group, default='0')

    # Build a new frame instead of writing 'age_bin' into the caller's frame.
    return data.drop(columns='age').assign(age_bin=labels)

In [3]:
def encoding(x):
    """One-hot encode ``gender``, ``status``, ``occupation`` and ``age_bin``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing the four columns above. Cells may hold a single
        value or a list of values. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``x`` followed by the indicator columns for
        gender, status, occupation and age group (in that order). Every label
        in ``age_group`` is present as a column; labels absent from the data
        are filled with 0.
    """
    age_group = ['balita', 'kanak-kanak', 'remaja awal', 'remaja akhir', 'dewasa awal', 'dewasa akhir', 'lansia awal', 'lansia akhir', 'manula']
    encoded_columns = ['gender', 'status', 'occupation', 'age_bin']

    def _one_hot(column):
        # apply(pd.Series) spreads list-valued cells over several columns,
        # stack() flattens them to one value per (row, position), and the
        # groupby on the outer index level sums the indicators back to one
        # row per original row. groupby(level=0).sum() replaces the
        # `.sum(level=0)` form, which pandas 2.0 removed.
        stacked = x[column].apply(pd.Series).stack()
        return pd.get_dummies(stacked).groupby(level=0).sum()

    gender_encode, status_encode, occ_encode, age_encode = (
        _one_hot(column) for column in encoded_columns
    )

    # Guarantee a stable set of age columns; iterating the label list (rather
    # than a set difference) keeps the order of the added columns deterministic.
    for label in age_group:
        if label not in age_encode.columns:
            age_encode[label] = 0

    remaining = x.drop(columns=encoded_columns)
    return pd.concat([remaining, gender_encode, status_encode, occ_encode, age_encode], axis=1)

In [4]:
def get_interest(row, columns=None):
    """Return the labels whose value in ``row`` is non-zero.

    Parameters
    ----------
    row : mapping or pandas.Series
        Object indexable by label (``row[label]``).
    columns : iterable, optional
        Labels to inspect. When omitted, the columns of the notebook-level
        ``interest`` frame are used; that frame is not defined in the cells
        visible here, so it must exist in the kernel before calling without
        ``columns``.

    Returns
    -------
    list
        Labels, in iteration order, for which ``row[label] != 0``.
    """
    if columns is None:
        # Backward-compatible fallback to the global frame the original
        # implementation relied on.
        columns = interest.columns
    return [label for label in columns if row[label] != 0]

In [5]:
def get_feature(test_df):
    """Replace every cell equal to 1 with the name of its column.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame of indicator columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_df`` in which each cell that compares equal to 1 holds
        its column label instead; all other cells are unchanged. Columns that
        contain no such cell keep their original dtype.
    """
    feat = test_df.copy()
    for column in feat.columns:
        active = feat[column] == 1
        if active.any():
            # Cast to object before inserting the label: writing a string into
            # a numeric column relies on implicit upcasting, which newer
            # pandas versions deprecate.
            feat[column] = feat[column].astype(object).mask(active, column)
    return feat

In [6]:
def preprocessing_test(data):
    """Turn raw profile rows into the fixed-width indicator frame.

    The frame is binned with ``age_bin``, encoded with ``encoding``, its
    ``interests`` column is one-hot encoded, and any label from the fixed
    status/gender/interest/occupation vocabularies that did not occur in the
    data is added as an all-zero column. Columns are returned in sorted order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``gender``, ``age``, ``occupation``, ``status`` and
        ``interests`` columns (``interests`` cells may be lists).

    Returns
    -------
    pandas.DataFrame
        Encoded frame with alphabetically sorted columns.
    """
    fixed_status = ['MARRIED', 'SINGLE']
    fixed_gender = ['MALE', 'FEMALE']
    fixed_inter = ['ART', 'AUTOMOTIVE', 'BOOKS', 'COOKING', 'CULINARY', 'FASHION', 'GADGET', 'GAMING', 'INTERIOR_DESIGN', 'MAKE_UP_AND_SKIN_CARE', 'MOVIES', 'MUSIC', 'PHOTOGRAPHY', 'SPORTS', 'TRAVELLING']
    fixed_occ = ['ENTREPRENEUR', 'FULL_TIME_EMPLOYEE', 'OTHERS', 'PART_TIME_EMPLOYEE', 'RETIRED', 'STUDENT', 'UNEMPLOYED']

    data = age_bin(data)
    x_test = encoding(data)

    # Expand list-valued interests to one indicator column per interest.
    # groupby(level=0).sum() replaces `.sum(level=0)`, removed in pandas 2.0.
    stacked = x_test['interests'].apply(pd.Series).stack()
    inter_encode = pd.get_dummies(stacked).groupby(level=0).sum()
    x_test = pd.concat([x_test.drop(columns=['interests']), inter_encode], axis=1)

    # Add every vocabulary label the input did not produce as a zero column.
    for vocabulary in (fixed_status, fixed_gender, fixed_inter, fixed_occ):
        for label in vocabulary:
            if label not in x_test.columns:
                x_test[label] = 0

    return x_test.reindex(sorted(x_test.columns), axis=1)

In [7]:
def predict(x_test):
    """Load the pickled model and return its class-probability scores.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Encoded feature frame passed straight to ``predict_proba``.

    Returns
    -------
    object
        Whatever the loaded model's ``predict_proba`` returns for ``x_test``.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load a model file that comes from a trusted source.
    # The context manager closes the file handle, which the bare
    # open(...) call left to the garbage collector.
    with open('gift_recommendation_model.sav', 'rb') as model_file:
        gift_model = pickle.load(model_file)
    return gift_model.predict_proba(x_test)

In [8]:
def boost_cat(res, concat_res, feat):
    """Blend per-feature category weights from ``category_c2.csv`` into scores.

    The CSV is read from the working directory, its ``Unnamed: 0`` column is
    dropped, and it is transposed so that each row corresponds to one of the
    CSV's former columns (stored in an ``index`` column) and each remaining
    column to a ``cat_name`` value. For every row of ``concat_res`` and every
    column of ``feat``, if the cell value equals one of those ``index``
    labels, each non-zero weight ``w`` of that label updates the matching
    score as ``0.8 * score + 0.2 * w``. Updates are applied sequentially in
    ``feat.columns`` order, so several matching features compound.

    Parameters
    ----------
    res : pandas.DataFrame
        Score frame; its columns must include every weight column. Not modified.
    concat_res : pandas.DataFrame
        Frame whose rows are aligned positionally with ``res`` and which
        contains the columns of ``feat``.
    feat : pandas.DataFrame
        Only its column labels are used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``res`` with the blended scores.
    """
    add_value = pd.read_csv('category_c2.csv').drop('Unnamed: 0', axis=1)
    add = add_value.set_index('cat_name').T
    add = add.reindex(sorted(add.columns), axis=1).reset_index()
    add_result = res.copy()

    # 'index' holds the row labels, not a weight, so it is excluded from the
    # blending loop instead of relying on how .any() treats a string column.
    weight_columns = [column for column in add.columns if column != 'index']

    for i in range(len(concat_res)):
        row = concat_res.iloc[i]
        for j in feat.columns:
            matches = add['index'] == row[j]
            if not matches.any():
                continue
            a = add.loc[matches]
            for k in weight_columns:
                if a[k].any():
                    position = add_result.columns.get_loc(k)
                    # .iloc[0] extracts the scalar explicitly; float() on a
                    # Series is deprecated in newer pandas.
                    ad = float(a[k].iloc[0]) * 0.2
                    temp = float(add_result.iloc[i, position]) * 0.8
                    # Single positional write: the former chained
                    # `iloc[i:i+1][k] = ...` assigned to a temporary object
                    # and is not guaranteed to reach add_result.
                    add_result.iloc[i, position] = temp + ad

    return add_result

In [27]:
def gift_recommendation(gender, age, occupation, status, interest):
    """Rank gift categories for a single profile.

    The profile is encoded with ``preprocessing_test``, scored with
    ``predict``, adjusted with ``boost_cat``, and the category labels are
    returned ordered by descending score. Column labels for the scores come
    from the sorted ``cat_name`` column of ``order.csv``.

    Parameters
    ----------
    gender, age, occupation, status :
        Scalar profile values placed in a one-row frame.
    interest :
        Value stored in the ``interests`` column of that frame.

    Returns
    -------
    numpy.ndarray
        Category labels sorted by descending ``probability``.
    """
    order = pd.read_csv('order.csv')
    profile = [gender, age, occupation, status, interest]
    test_df = pd.DataFrame(
        [profile],
        columns=['gender', 'age', 'occupation', 'status', 'interests'],
    )
    x_test = preprocessing_test(test_df)

    category_labels = order.cat_name.sort_values(ascending=True)
    scores = pd.DataFrame(predict(x_test).toarray(), columns=category_labels)

    feature = get_feature(x_test)
    combined = pd.concat([feature, scores], axis=1)
    boosted = boost_cat(scores, combined, feature)

    # The two 21+ categories are zeroed for profiles younger than 21.
    if test_df['age'].item() < 21:
        boosted[['bliblimart - 21+', 'kesehatan & kecantikan - 21+ kesehatan seksual' ]] = 0

    # Two labels have their '/' replaced by a space.
    boosted = boosted.rename(columns={
        'home & living - hewan peliharaan/pet supplies': 'home & living - hewan peliharaan pet supplies',
        'mainan & video games - die-cast & r/c': 'mainan & video games - die-cast & r c',
    })

    ranking = boosted.T.reset_index()
    ranking.columns = ['category', 'probability']
    ranked = ranking.sort_values(by=['probability'], ascending=False)
    return np.array(ranked['category'])

In [None]:
# Example: ranked categories for one sample profile; the bare `res` on the
# last line makes the notebook display the returned array.
res = gift_recommendation(
    gender='FEMALE',
    age=32,
    occupation='FULL_TIME_EMPLOYEE',
    status='MARRIED',
    interest=['COOKING', 'TRAVELLING', 'FASHION'],
)
res