In [51]:
import pandas as pd
import re
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import stopwords
import numpy as np

In [52]:
# Load the training set with every column read as text, blank out missing
# values, then lower-case every cell so later string comparisons are
# case-insensitive.
data_df = (
    pd.read_csv("data/training/data_train.csv", dtype='unicode')
    .fillna(value='')
    .apply(lambda column: column.astype(str).str.lower())
)

In [53]:
# The label columns were loaded as text; convert each one back to numbers.
for label_column in ('clarity', 'conciseness'):
    data_df[label_column] = pd.to_numeric(data_df[label_column])

In [54]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag replaced by one space.

    The match is non-greedy and ``.`` does not cross newlines, so a tag
    that spans several lines is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', raw_html)
# Strip HTML tags from the short description. Only one column is needed, so
# map over that Series instead of materialising every row with
# DataFrame.apply(axis=1); the result is the same text, computed faster, and
# it is always a Series (even when the frame is empty).
data_df['short_description_clear_html'] = data_df['short_description'].map(cleanhtml)

In [55]:
# Train on the complete labelled frame and predict on the separate validation
# file. The random hold-out split is kept below for reference only.
# train_df, test_df = train_test_split(data_df, test_size = 0.2)
train_df = data_df
test_df = (
    pd.read_csv("data/validation/data_valid_2.csv")
    .fillna(value='')
    .apply(lambda column: column.astype(str).str.lower())
)

In [56]:
# Preview the first five rows of the normalised validation frame.
test_df.head(n=5)

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type
0,my,ap564elasstwanmy,apple macbook pro mgxc2zp/a 16gb i7 15.4-inch ...,computers & laptops,laptops,macbooks,os x lion<br> intel core i7<br> 15-inch retina...,12550.0,local
1,my,br924hbaa5b3tlanmy,brand's® american ginseng triple pack (3x 6's)...,health & beauty,food supplements,well being,<ul> <li>traditionally used to calm the mind a...,105.0,local
2,my,ca673elaa5ug3xanmy,canon eos m10 mirrorless digital camera 18mp w...,cameras,mirrorless,,<div> <ul> <li>18.0mp aps-c cmos sensor</li> <...,1588.0,local
3,my,de759elaa7qm1xanmy,"dell led monitor 23"" (e2316h)",computers & laptops,computer accessories,monitors,"<div class=""prod_content""> <div class=""prod_de...",565.0,local
4,my,es802otaabhay8anmy,esprit tallac brave nubuck sand es107601001 be...,watches sunglasses jewellery,watches,men,<ul> <li>stainless steel case</li> <li>mineral...,279.0,local


In [57]:
# Distinct "<category_lvl_1>_<category_lvl_2>_<category_lvl_3>" keys; starts
# empty and is filled from train_df by the loop that follows.
full_categories_set = set()

In [58]:
# Build the "<lvl_1>_<lvl_2>_<lvl_3>" key for every training row in a single
# vectorised string concatenation (instead of a Python-level iterrows() loop)
# and add the resulting keys to the set of known categories.
full_categories_set.update(
    train_df['category_lvl_1'] + "_" + train_df['category_lvl_2']
    + "_" + train_df['category_lvl_3']
)

In [59]:
def create_bag_of_word(texts):
    r"""Count the ``\w+`` tokens of each text.

    Returns a list with one ``collections.Counter`` per element of
    ``texts``, in the same order.
    """
    bags = []
    for text in texts:
        tokens = re.findall(r'\w+', text)
        bags.append(collections.Counter(tokens))
    return bags

In [60]:
# A word enters a category vocabulary only if it occurs MORE than
# MIN_WORD_COUNT times in that category's concatenated titles. For the
# clarity / un-clarity vocabularies of the "fashion" top-level category the
# bar is FASHION_MIN_WORD_COUNT and the gender words below are excluded.
MIN_WORD_COUNT = 10
FASHION_MIN_WORD_COUNT = 20
FASHION_EXCLUDED_WORDS = frozenset(['women', 'womens', 'girl', 'girls', 'woman'])
CATEGORY_COLUMNS = ['category_lvl_1', 'category_lvl_2', 'category_lvl_3']

# Per-category vocabularies, keyed by "<lvl_1>_<lvl_2>_<lvl_3>". Each value is
# a list of words ordered from most to least frequent.
clarity_word_most_frequence_dict = {}
un_clarity_word_most_frequency_dict = {}
conciseness_word_most_frequency_dict = {}
un_conciseness_word_most_frequency_dict = {}

stopwords_set = set(stopwords.words('english'))


def _frequent_title_words(products_df, fashion_rule=False):
    """Return the frequent title words of ``products_df``, most common first.

    All titles are joined with spaces and tokenised by ``create_bag_of_word``.
    A word is kept when its count exceeds ``MIN_WORD_COUNT``, it is not purely
    numeric and it is not in ``stopwords_set``. With ``fashion_rule=True`` the
    word must additionally exceed ``FASHION_MIN_WORD_COUNT`` and must not be
    one of ``FASHION_EXCLUDED_WORDS``.
    """
    word_counts = create_bag_of_word([products_df['title'].str.cat(sep=' ')])[0]
    frequent_words = []
    for word, count in word_counts.most_common():
        if count <= MIN_WORD_COUNT or word.isnumeric() or word in stopwords_set:
            continue
        if fashion_rule and (word in FASHION_EXCLUDED_WORDS
                             or count <= FASHION_MIN_WORD_COUNT):
            continue
        frequent_words.append(word)
    return frequent_words


# Group by the three category columns directly rather than re-parsing a
# "_"-joined key: splitting the key on "_" mis-assigns the levels as soon as a
# category name itself contains an underscore, and grouping also avoids
# re-filtering the whole frame four times per category.
for category_levels, category_df in train_df.groupby(CATEGORY_COLUMNS, sort=False):
    full_category = "_".join(category_levels)
    # The fashion-specific rule applies to the clarity vocabularies only.
    is_fashion = category_levels[0] == "fashion"

    # ------------ vocabulary of clear products ------------
    clarity_word_most_frequence_dict[full_category] = _frequent_title_words(
        category_df[category_df['clarity'] == 1], fashion_rule=is_fashion)

    # ------------ vocabulary of unclear products ------------
    un_clarity_word_most_frequency_dict[full_category] = _frequent_title_words(
        category_df[category_df['clarity'] == 0], fashion_rule=is_fashion)

    # ------------ vocabulary of concise products ------------
    conciseness_word_most_frequency_dict[full_category] = _frequent_title_words(
        category_df[category_df['conciseness'] == 1])

    # ------------ vocabulary of non-concise products ------------
    un_conciseness_word_most_frequency_dict[full_category] = _frequent_title_words(
        category_df[category_df['conciseness'] == 0])

# Test model

In [61]:
clarities_pred = np.zeros(len(test_df))
conciseness_pred = np.zeros(len(test_df))

# Not populated in this cell; kept in case other cells reference them.
clarity_wrong_cases_dict = {}
conciseness_wrong_case_dict = {}


def _vocabulary_sets(words_by_category):
    """Return a copy of the mapping with each word list converted to a set.

    Sets give constant-time membership tests in the scoring loop.
    """
    return {category: set(words) for category, words in words_by_category.items()}


def _match_score(title_words, positive_vocab, negative_vocab):
    """Score a title by the share of its vocabulary hits that are positive.

    Every occurrence of a word counts (duplicates included). Returns 0.5 when
    no word hits either vocabulary, otherwise
    ``positive_hits / (positive_hits + negative_hits)``, which is 1.0 with
    only positive hits and 0.0 with only negative hits.
    """
    positive_hits = sum(1 for word in title_words if word in positive_vocab)
    negative_hits = sum(1 for word in title_words if word in negative_vocab)
    total_hits = positive_hits + negative_hits
    if total_hits == 0:
        return 0.5
    return positive_hits / total_hits


# Used for categories that never appeared in training: with no vocabulary
# there is no evidence either way, so the score falls back to 0.5 instead of
# raising TypeError on a membership test against None.
_NO_VOCABULARY = frozenset()

clarity_vocab = _vocabulary_sets(clarity_word_most_frequence_dict)
unclarity_vocab = _vocabulary_sets(un_clarity_word_most_frequency_dict)
conciseness_vocab = _vocabulary_sets(conciseness_word_most_frequency_dict)
unconciseness_vocab = _vocabulary_sets(un_conciseness_word_most_frequency_dict)

# Same "<lvl_1>_<lvl_2>_<lvl_3>" key format as the vocabulary dictionaries.
test_category_keys = (
    test_df['category_lvl_1'] + "_" + test_df['category_lvl_2']
    + "_" + test_df['category_lvl_3']
)

for i, (full_category, title) in enumerate(zip(test_category_keys, test_df['title'])):
    # NOTE(review): titles are split on whitespace here, while the
    # vocabularies are built from r'\w+' tokens (create_bag_of_word), so a
    # token with attached punctuation such as '(e2316h)' can never match.
    # Left unchanged to keep predictions stable; confirm whether intended.
    title_words = title.split()

    clarities_pred[i] = _match_score(
        title_words,
        clarity_vocab.get(full_category, _NO_VOCABULARY),
        unclarity_vocab.get(full_category, _NO_VOCABULARY),
    )

    if clarities_pred[i] == 0:
        # A title predicted as unclear is also predicted as not concise.
        conciseness_pred[i] = 0
    else:
        conciseness_pred[i] = _match_score(
            title_words,
            conciseness_vocab.get(full_category, _NO_VOCABULARY),
            unconciseness_vocab.get(full_category, _NO_VOCABULARY),
        )
    

In [62]:
# from sklearn.metrics import mean_squared_error
# clarity_error = np.sqrt(mean_squared_error(test_df.clarity.as_matrix(), clarities_pred))
# conciseness_error = np.sqrt(mean_squared_error(test_df.conciseness.as_matrix(), conciseness_pred))

# print("clarity_error: ", clarity_error)
# print("conciseness_error: ", conciseness_error)
# print("mean: ", np.mean([clarity_error, conciseness_error]))

In [63]:
# Wrap each prediction vector in a single-column frame and write it to CSV
# without the row index.
clarities_pred_df = pd.DataFrame(clarities_pred.tolist())
conciseness_pred_df = pd.DataFrame(conciseness_pred.tolist())

for predictions_df, output_path in (
    (clarities_pred_df, "clarities_pred.csv"),
    (conciseness_pred_df, "conciseness_pred.csv"),
):
    predictions_df.to_csv(output_path, index = False)