# "Fit", "Dry Clean Only","Category"

This notebook contains functions that generate the "Fit", "Dry Clean Only", and "Category" tags.

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk import punkt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
lem = WordNetLemmatizer()


def lem_sentences(sentence):
    """Lemmatize every token of ``sentence`` and re-join them with single spaces.

    The text is split with ``nltk.word_tokenize`` and each token is passed to
    the module-level ``WordNetLemmatizer`` (no part-of-speech argument is
    supplied, so the lemmatizer's default is used).
    """
    return ' '.join(lem.lemmatize(word) for word in nltk.word_tokenize(sentence))

# The function above is adapted from this Stack Overflow question:
# https://stackoverflow.com/questions/43795310/apply-porters-stemmer-to-a-pandas-column-for-each-word

def cleanHtml(sentence):
    """Replace every ``<...>`` tag-like span in ``sentence`` with one space.

    ``sentence`` is converted with ``str()`` first, so non-string values are
    accepted. The match is non-greedy and does not cross line breaks.
    """
    return re.sub('<.*?>', ' ', str(sentence))

def keepAlpha(sentence):
    """Strip non-word characters from every token of ``sentence``.

    The text is tokenized with ``word_tokenize``. Inside each token, every
    character outside the regex word class (letters, digits, underscore) is
    removed. Tokens that end up empty (e.g. pure punctuation) are dropped, so
    the result contains the surviving tokens separated by exactly one space.
    """
    # Raw string: a plain '[^\w]+' literal is an invalid escape sequence.
    stripped_tokens = (re.sub(r'[^\w]+', '', token) for token in word_tokenize(sentence))
    return ' '.join(token for token in stripped_tokens if token)

def cleanPunc(sentence):
    """Delete or blank out a fixed set of punctuation characters.

    * ``? | ! ' " #`` are removed outright.
    * ``, ) ( /`` are each replaced by one space.
    * Leading/trailing whitespace is then trimmed and every remaining newline
      becomes a space.

    Backslashes are left untouched: ``\\|`` in the second pattern matches a
    literal pipe, not a backslash.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")
# The four functions defined above are based on this GitHub notebook:
# https://github.com/nkartik94/Multi-Label-Text-Classification/blob/master/Mark_6.ipynb

def cleanComma(sentence):
    """Trim leading/trailing commas and collapse each run of commas into one."""
    trimmed = sentence.strip(',')
    return re.sub(r',{2,}', r',', trimmed)

def is_clothing(x):
    """Map ``x`` to a clothing flag using two global lookup collections.

    Returns 1 when ``x`` is in ``cloth_list``, 0 when it is in
    ``non_cloth_list`` and ``None`` when it is in neither.

    NOTE(review): ``cloth_list`` and ``non_cloth_list`` are not defined in any
    cell shown in this notebook; they must exist as globals before this
    function is called.
    """
    if x in cloth_list:
        return 1
    if x in non_cloth_list:
        return 0
    return None

In [3]:
# TF-IDF vectorizer used by every classification model in this notebook.
# NOTE(review): this single instance is passed to every Pipeline below, so each
# pipeline.fit() call re-fits it on that pipeline's training text. Predict with a
# pipeline before fitting another one.
# English stop words from NLTK are removed.
stop_list=stopwords.words('english')
# Features are 1- to 3-grams over alphanumeric tokens of at least 3 characters;
# terms in more than half of the documents or in fewer than 10 documents are dropped.
vectorizer = TfidfVectorizer(ngram_range=(1,3),
                             token_pattern=r'\b[a-zA-Z0-9]{3,}\b',
                             max_df=0.5,
                             min_df=10, stop_words=stop_list)

In [13]:
def text_preprocessing(brand,brand_category,product_full_name,description,details):
    """Build the cleaned model input text for a single product.

    The five fields are joined with spaces, lower-cased and passed through the
    same cleaning steps, in the same order, as the bulk preprocessing of
    ``full_data['input']``: lemmatization, HTML-tag removal, punctuation
    cleaning, then removal of remaining non-word characters.

    Parameters
    ----------
    brand, brand_category, product_full_name, description, details
        Raw field values. Missing values (``None`` / ``NaN``) are treated as
        empty strings; other values are converted with ``str()``.

    Returns
    -------
    pandas.Series
        A one-element Series holding the cleaned text, ready to be passed to
        a fitted pipeline's ``predict``.
    """
    fields = (brand, brand_category, product_full_name, description, details)
    # Missing fields would otherwise make the concatenation raise a TypeError;
    # the bulk path handles them with fillna('').
    input_text = ' '.join('' if pd.isna(field) else str(field) for field in fields)
    input_text = input_text.strip().lower()
    # Lemmatization keeps the dictionary form of each word
    input_text = lem_sentences(input_text)
    # HTML tags carry no information for the classifiers
    input_text = cleanHtml(input_text)
    # Punctuation must be cleaned BEFORE keepAlpha: cleanPunc turns separators
    # such as '/' into spaces, whereas keepAlpha would glue "a/b" into "ab".
    input_text = cleanPunc(input_text)
    # Remove the remaining special characters (e.g. trademark signs)
    input_text = keepAlpha(input_text)
    return pd.Series(input_text)


def clothing_or_not(brand,brand_category,product_full_name,description,details):
    """Predict whether a single product is a clothing item.

    A TF-IDF + ``LinearSVC`` pipeline is fitted on ``is_clothing_df``
    (``input`` -> ``clothing``) on every call and then applied to the
    preprocessed product text.

    Returns the array produced by ``Pipeline.predict`` for the one-row input.
    """
    input_text = text_preprocessing(brand, brand_category, product_full_name, description, details)

    cloth_pipeline = Pipeline([
        ('tfidf1', vectorizer),
        ('clf1', LinearSVC()),
    ])
    cloth_pipeline.fit(is_clothing_df['input'], is_clothing_df['clothing'])
    return cloth_pipeline.predict(input_text)

def clothing_fit_tag(brand,brand_category,product_full_name,description,details):
    """Predict the "fit" tag of a single product, or ``""`` for non-clothing.

    The item is first classified with ``clothing_or_not``. Only when that
    prediction equals 1 is a TF-IDF + one-vs-rest gradient-boosting pipeline
    fitted on ``fit_tags`` (``input`` -> ``attribute_value``) and applied to
    the preprocessed product text. The model is re-fitted on every call.

    Returns the array from ``Pipeline.predict`` for clothing items and an
    empty string otherwise.
    """
    item_is_clothing = clothing_or_not(brand, brand_category, product_full_name, description, details)
    if not (item_is_clothing == 1):
        return ""

    input_text = text_preprocessing(brand, brand_category, product_full_name, description, details)
    fit_pipeline = Pipeline([
        ('tfidf', vectorizer),
        ('clf', OneVsRestClassifier(GradientBoostingClassifier())),
    ])
    fit_pipeline.fit(fit_tags['input'], fit_tags['attribute_value'])
    return fit_pipeline.predict(input_text)
        
def clothing_dryclean_tag(brand,brand_category,product_full_name,description,details):
    """Predict whether a single clothing item is dry-clean-only.

    The item is first classified with ``clothing_or_not``. Only when that
    prediction equals 1 is a TF-IDF + gradient-boosting pipeline fitted on
    ``dryclean_tags`` (``input`` -> ``dry_clean``) and applied to the
    preprocessed product text. The model is re-fitted on every call.

    Returns ``'yes'`` when the prediction equals 1, ``'no'`` for any other
    prediction, and ``""`` when the item is not classified as clothing.
    """
    item_is_clothing = clothing_or_not(brand, brand_category, product_full_name, description, details)
    if not (item_is_clothing == 1):
        return ""

    input_text = text_preprocessing(brand, brand_category, product_full_name, description, details)
    dryclean_pipeline = Pipeline([
        ('tfidf', vectorizer),
        ('clf', GradientBoostingClassifier()),
    ])
    dryclean_pipeline.fit(dryclean_tags['input'], dryclean_tags['dry_clean'])
    prediction = dryclean_pipeline.predict(input_text)
    return 'yes' if prediction == 1 else 'no'
    
    
def item_category_tag(brand,brand_category,product_full_name,description,details):
    """Predict the category of a single product.

    A TF-IDF + ``LinearSVC`` pipeline is fitted on ``category_tags``
    (``input`` -> ``attribute_category``) on every call and applied to the
    preprocessed product text.

    Returns ``""`` when the predicted category is ``'other'`` (items such as
    sunglasses, belts, accessories and cases), otherwise the array from
    ``Pipeline.predict``.
    """
    input_text = text_preprocessing(brand, brand_category, product_full_name, description, details)

    category_pipeline = Pipeline([
        ('tfidf', vectorizer),
        ('clf', LinearSVC()),
    ])
    category_pipeline.fit(category_tags['input'], category_tags['attribute_category'])
    prediction = category_pipeline.predict(input_text)
    return "" if prediction == 'other' else prediction
                       

## Load Data

In [5]:
# Raw inputs: the tagged attribute table and the full product table.
# NOTE(review): tag_data is not referenced by any cell shown below — confirm it is still needed.
tag_data = pd.read_csv('tagged_product_attributes.csv')
full_data = pd.read_csv('full_data.csv')

# The following datasets are derived from the two datasets above.
# They are used as training data when the models are fitted later.
is_clothing_df=pd.read_csv('is_clothing_or_not.csv', index_col=0)
fit_tags=pd.read_csv('fit_tags.csv', index_col=0)
dryclean_tags=pd.read_csv('dryclean_tags.csv', index_col=0)
category_tags=pd.read_csv('category_tags.csv', index_col=0)

## Model Deployment

In [6]:
# This item (row label 4 of full_data) is a pair of sunglasses
brand4, brand_category4, product_full_name4, description4, details4 = full_data.loc[
    4, ['brand', 'brand_category', 'product_full_name', 'description', 'details']
]

In [7]:
# This item (row label 10 of full_data) is a pair of pants
brand10, brand_category10, product_full_name10, description10, details10 = full_data.loc[
    10, ['brand', 'brand_category', 'product_full_name', 'description', 'details']
]

In [8]:
# Fit tag for the sunglasses item; '' is returned when the item is not classified as clothing
clothing_fit_tag(brand4,brand_category4,product_full_name4,description4,details4)

''

In [9]:
# Dry-clean tag for the sunglasses item; '' is returned when the item is not classified as clothing
clothing_dryclean_tag(brand4,brand_category4,product_full_name4,description4,details4)

''

In [10]:
# Category tag for the sunglasses item; '' is returned when the predicted category is 'other'
item_category_tag(brand4,brand_category4,product_full_name4,description4,details4)

''

In [11]:
# Fit tag for the pants item
clothing_fit_tag(brand10,brand_category10,product_full_name10,description10,details10)

array(['straight/regular'], dtype='<U16')

In [14]:
# Dry-clean tag ('yes' / 'no') for the pants item
clothing_dryclean_tag(brand10,brand_category10,product_full_name10,description10,details10)

'yes'

In [15]:
# Category tag for the pants item
item_category_tag(brand10,brand_category10,product_full_name10,description10,details10)

array(['bottom'], dtype=object)

## Generate columns for "Fit", "Dry Clean Only", and "Category"

In [16]:
# (rows, columns) of the full product table before the helper and tag columns are added
full_data.shape

(48979, 13)

In [17]:
# Join the five text fields of every row (missing values as '') into one
# lower-cased string that serves as the model input
full_data['input'] = (
    full_data[['brand', 'brand_category', 'product_full_name', 'description', 'details']]
    .fillna('')
    .agg(' '.join, axis=1)
    .str.lower()
)

In [18]:
# Clean the input text: lemmatize, drop HTML tags, clean punctuation,
# then strip the remaining non-word characters
full_data['input'] = (
    full_data['input']
    .apply(lem_sentences)
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

In [19]:
# Cleaned text of every product; this is what each fitted pipeline below predicts on
full_text=full_data['input']

In [20]:
# Predict for every row whether the item is clothing
# (rows with clothing == 1 are treated as clothing further down)
X = is_clothing_df['input']
y = is_clothing_df['clothing']

SVC_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', LinearSVC()),
])

prediction_cloth = SVC_pipeline.fit(X, y).predict(full_text)
full_data['clothing'] = prediction_cloth

In [21]:
# Predict the "fit" tag for every row (non-clothing rows are blanked in a later cell)
X_fit = fit_tags['input']
y_fit = fit_tags['attribute_value']

GradientFC_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', OneVsRestClassifier(GradientBoostingClassifier())),
])

prediction_fit = GradientFC_pipeline.fit(X_fit, y_fit).predict(full_text)
full_data['fit'] = prediction_fit

In [22]:
# Predict the "dry clean only" tag for every row (non-clothing rows are blanked in a later cell)
X_dryclean = dryclean_tags['input']
y_dryclean = dryclean_tags['dry_clean']

GradientFC_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', GradientBoostingClassifier()),
])

prediction_dryclean = GradientFC_pipeline.fit(X_dryclean, y_dryclean).predict(full_text)

# Store the tag as 'yes' when the prediction equals 1 and 'no' otherwise
full_data['dry_clean_only'] = np.where(prediction_dryclean == 1, 'yes', 'no')

In [23]:
# Predict the "category" tag for every row
X_category = category_tags['input']
y_category = category_tags['attribute_category']

SVC_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', LinearSVC()),
])

prediction_category = SVC_pipeline.fit(X_category, y_category).predict(full_text)

# The catch-all 'other' category is exported as an empty tag
full_data['category'] = np.where(prediction_category != 'other', prediction_category, "")

In [24]:
# "fit" and "dry clean only" only apply to clothing: keep the predictions where
# clothing == 1 and blank them ('') for every other row
full_data['fit']=np.where(full_data['clothing']==1,full_data['fit'],'')
full_data['dry_clean_only']=np.where(full_data['clothing']==1,full_data['dry_clean_only'],'')

In [25]:
# For records that are part of the training data, attach the real (labelled) tags.
# The two 'attribute_value' columns collide on the second merge and are renamed
# to attribute_value_fit / attribute_value_dc via the suffixes.
# NOTE(review): a left merge repeats a full_data row once per matching tag row, so
# product_id must be unique in each tag table to keep the row count — TODO confirm.
full_data2=full_data.merge(fit_tags[['product_id','attribute_value']],on='product_id',how='left')
full_data3=full_data2.merge(dryclean_tags[['product_id','attribute_value']],on='product_id',how='left',suffixes=['_fit','_dc'])
full_data4=full_data3.merge(category_tags[['product_id','attribute_category']],on='product_id',how='left')

In [26]:
# Prefer the labelled tag wherever the training data provides one; otherwise
# keep the model prediction.
full_data4['fit'] = np.where(full_data4['attribute_value_fit'].isna(),
                             full_data4['fit'],
                             full_data4['attribute_value_fit'])
# NOTE(review): attribute_value_dc is copied verbatim — confirm it uses the same
# 'yes' / 'no' vocabulary as the predicted dry_clean_only values.
full_data4['dry_clean_only'] = np.where(full_data4['attribute_value_dc'].isna(),
                                        full_data4['dry_clean_only'],
                                        full_data4['attribute_value_dc'])
full_data4['category'] = np.where(full_data4['attribute_category'].isna(),
                                  full_data4['category'],
                                  full_data4['attribute_category'])
# Labelled categories can be 'other' (the classifier is trained on that label).
# Predicted 'other' is exported as '', so blank labelled 'other' as well;
# otherwise the output mixes 'other' and '' for the same meaning.
full_data4['category'] = np.where(full_data4['category'] == 'other', "", full_data4['category'])

In [30]:
# Drop the merged label columns and the intermediate 'clothing' / 'input' columns,
# leaving the original columns plus fit, dry_clean_only and category
full_data5=full_data4.drop(['attribute_value_fit','attribute_value_dc','attribute_category','clothing','input'],axis=1)

In [32]:
# Write the final table to CSV; the DataFrame index is written as the first column
full_data5.to_csv('full_data_with_3_columns.csv')