### Introduction
LEGO is a popular brand of toy building bricks. They are often sold in sets designed to build a specific object. Each set contains a number of parts in different shapes, sizes and colors. This database contains information on which parts are included in different LEGO sets. It was originally compiled to help people who already owned some LEGO sets figure out what other sets they could build with the pieces they had.

### Acknowledgements
This dataset was compiled by [Rebrickable](https://rebrickable.com/about/), a website that helps identify which LEGO sets can be built from the bricks and pieces of other LEGO sets. You can use these files for any purpose. I obtained this data from [Kaggle](https://www.kaggle.com/rtatman/lego-database).

### Data Description
The dataset consists of 8 data files, organized as dimension and fact tables. The schema information can be found below.

#### Schema Information
![alt text](downloads_schema.png)


### Index
1. [Data Visualizations](#Data-Visualizations)
1. [Data Modeling](#Data-Modeling)

In [1]:
import pandas as pd
import numpy as np
import os

#Bokeh imports
from bokeh.core.properties import value
from bokeh.models import (HoverTool, FactorRange, Plot, LinearAxis, Grid,
                          Range1d)
from bokeh.io import show, output_notebook
from bokeh.models.glyphs import VBar
from bokeh.plotting import figure
from bokeh.embed import components
from bokeh.models.sources import ColumnDataSource
from bokeh.core.properties import value
from bokeh.palettes import viridis, mpl, brewer, d3
output_notebook()

In [4]:
# Load every CSV file found in DATA_DIR into `datasets`, keyed by the file
# name without its extension (e.g. 'sets.csv' -> datasets['sets']).
DATA_DIR = 'datasets'
datasets = {}
for fname in os.listdir(DATA_DIR):
    # splitext removes only the trailing extension, so a name that happens to
    # contain '.csv' elsewhere keeps the rest of its stem intact.
    stem, ext = os.path.splitext(fname)
    if ext == '.csv':
        datasets[stem] = pd.read_csv(os.path.join(DATA_DIR, fname))

## Data Visualizations

These visualizations are the same as the ones shown on the webpage served by app.py.

In [2]:
def plot_bar_chart(df, x_name, y_name, y_label, x_label, width, height):
    """Build a Bokeh vertical bar chart with one bar per row of ``df``.

    The x column is converted to strings and used as the categorical x range.

    Args:
        df: pandas DataFrame holding the data to plot. It is not modified.
        x_name: name of the column providing the bar categories.
        y_name: name of the column providing the bar heights.
        y_label: y-axis title, also used as the hover label for ``y_name``.
        x_label: x-axis title, also used as the hover label for ``x_name``.
        width: value passed to ``figure`` as ``plot_width``.
        height: value passed to ``figure`` as ``plot_height``.

    Returns:
        The configured Bokeh figure; the caller is responsible for showing it.
    """
    # Work on a copy so the string conversion below does not overwrite the
    # x column of the caller's DataFrame.
    df = df.copy()
    df[x_name] = df[x_name].astype(str)
    plot = figure(x_range = df[x_name].values, 
                  plot_width=width, plot_height=height, h_symmetry=False,
                  v_symmetry=False, tools="pan,wheel_zoom,box_zoom,reset",
                  min_border=0, toolbar_location="above",
                  responsive=True, outline_line_color="#666666")
    source = ColumnDataSource(df)
    plot.vbar(x=x_name,top=y_name, width=0.5, source = source)
    # Extra grid layers driven by the tickers of two stand-alone linear axes.
    xaxis, yaxis = LinearAxis(), LinearAxis()
    plot.add_layout(Grid(dimension=0, ticker=xaxis.ticker))
    plot.add_layout(Grid(dimension=1, ticker=yaxis.ticker))
    plot.toolbar.logo = None
    plot.min_border_top = 0
    # Hide the vertical grid lines; keep faint horizontal ones.
    plot.xgrid.grid_line_color = None
    plot.ygrid.grid_line_color = "#999999"
    plot.yaxis.axis_label = y_label
    plot.ygrid.grid_line_alpha = 0.1
    plot.xaxis.axis_label = x_label
    # Rotate the category labels (orientation given in radians).
    plot.xaxis.major_label_orientation = 1
    plot.add_tools(HoverTool(tooltips=[(x_label, '@'+x_name), (y_label, '@'+y_name)]))
    return plot

def getTopNEntries(df, topN, x_name, y_name, final_agg = 'mean'):
    """Keep the first ``topN`` rows of ``df``, optionally folding the rest into 'others'.

    Rows are taken in their current order, so the caller should sort ``df``
    beforehand if "top" is meant to refer to the largest values.

    Args:
        df: pandas DataFrame containing at least the columns ``x_name`` and
            ``y_name``.
        topN: number of leading rows to keep.
        x_name: name of the label column; receives the value 'others' in the
            extra row.
        y_name: name of the value column; the extra row holds the sum of this
            column over all rows after the first ``topN``.
        final_agg: aggregation that produced ``y_name``. For 'mean' the
            remaining rows are simply dropped; for any other value they are
            summed into a trailing 'others' row.

    Returns:
        ``df`` itself when it has at most ``topN`` rows. Otherwise a new
        DataFrame indexed from 0 with ``topN`` rows (``final_agg == 'mean'``)
        or ``topN + 1`` rows (the last one being 'others'). Columns other than
        ``x_name`` and ``y_name`` are NaN in the 'others' row.
    """
    if df.shape[0] <= topN:
        return df
    ranked = df.reset_index(drop=True)
    head = ranked.iloc[:topN].copy()
    if final_agg == 'mean':
        return head
    # Build the 'others' row by column name rather than by position, so the
    # result does not depend on the column order or on there being exactly
    # two columns.
    remainder_total = ranked[y_name].iloc[topN:].sum()
    others = pd.DataFrame([{x_name: 'others', y_name: remainder_total}],
                          index=[topN], columns=ranked.columns)
    return pd.concat([head, others])

def plot_agg_bar(df_set, x_name, y_name, width=1200, height=300,
                 year_range=(1950, 2017), orderBy=False,
                 orderType=True,
                 TopX=None, X_label_name=None, y_label_name=None,
                 agg='mean'):
    """Aggregate ``y_name`` per ``x_name`` group and draw the result as a bar chart.

    Args:
        df_set: pandas DataFrame containing the columns ``x_name`` and ``y_name``.
        x_name: column to group by; one bar is drawn per group.
        y_name: column that is aggregated to give the bar heights.
        width: plot width forwarded to ``plot_bar_chart``.
        height: plot height forwarded to ``plot_bar_chart``.
        year_range: accepted but not used by this function.
        orderBy: when truthy, sort the aggregated rows by ``y_name``.
        orderType: passed as ``ascending`` to the sort (True = ascending).
        TopX: when not None, limit the rows via ``getTopNEntries``.
        X_label_name: x-axis label; defaults to ``x_name``.
        y_label_name: y-axis label; defaults to ``y_name``.
        agg: aggregation applied to ``y_name`` within each group.

    Returns:
        Whatever ``plot_bar_chart`` returns for the aggregated data.
    """
    if X_label_name is None:
        X_label_name = x_name
    if y_label_name is None:
        y_label_name = y_name

    aggregated = df_set.groupby(x_name).agg({y_name: agg}).reset_index()
    if orderBy:
        aggregated = aggregated.sort_values(y_name, ascending=orderType)
    if TopX is not None:
        aggregated = getTopNEntries(aggregated, TopX, x_name, y_name, agg)

    return plot_bar_chart(aggregated, x_name, y_name,
                          y_label_name, X_label_name, width, height)

def plot_bar_stacked_chart(df, x_name, y_label, x_label, width, height, useColsColr = False, createLegend = False):
    """Build a Bokeh stacked vertical bar chart from a pivoted DataFrame.

    Every column of ``df`` becomes one layer of the stack and every row one
    bar. The index is turned into a regular column via ``reset_index`` and is
    then read as ``x_name``, so the index is expected to carry that name.

    Args:
        df: pivoted pandas DataFrame (index = bar categories, columns = layers).
        x_name: name of the index level holding the bar categories.
        y_label: y-axis title.
        x_label: x-axis title.
        width: value passed to ``figure`` as ``plot_width``.
        height: value passed to ``figure`` as ``plot_height``.
        useColsColr: when True, each column name is treated as a hex RGB
            string and used (lower-cased, prefixed with '#') as that layer's
            colour. Otherwise colours come from the d3 'Category20' palette.
        createLegend: when True, add one legend entry per stacked column.

    Returns:
        The configured Bokeh figure; the caller is responsible for showing it.
    """
    stackCol = list(df.columns.values)

    # Concrete lists (not lazy ``map`` objects) so the values can be consumed
    # by Bokeh on both Python 2 and Python 3.
    if useColsColr:
        colors = ['#' + col.lower() for col in stackCol]
    else:
        # Cycle through the palette so there is exactly one colour per stacked
        # column, however many columns there are.
        palette = d3['Category20'][20]
        colors = [palette[i % len(palette)] for i in range(len(stackCol))]
    legends = [value(col) for col in stackCol] if createLegend else None

    df = df.reset_index()
    # One hover entry per column; commas are stripped from the displayed label.
    tooltipVals = [(col.replace(',', ''), '@{' + col + '}') for col in df.columns.values]
    df[x_name] = df[x_name].astype(str)
    source = ColumnDataSource(data=df)
    x_values = [str(v) for v in df[x_name].values]
    plot = figure(x_range = x_values, plot_width=width, plot_height=height, 
                  tools="pan,wheel_zoom,box_zoom,reset",
                  min_border=0, outline_line_color="#666666")

    plot.vbar_stack(stackCol, x=x_name, width=0.5, color=colors, source=source,
                    legend = legends)
    # Extra grid layers driven by the tickers of two stand-alone linear axes.
    xaxis, yaxis = LinearAxis(), LinearAxis()
    plot.add_layout(Grid(dimension=0, ticker=xaxis.ticker))
    plot.add_layout(Grid(dimension=1, ticker=yaxis.ticker))
    # Hide the vertical grid lines; keep faint horizontal ones.
    plot.xgrid.grid_line_color = None
    plot.ygrid.grid_line_color = "#999999"
    plot.yaxis.axis_label = y_label
    plot.ygrid.grid_line_alpha = 0.1
    plot.xaxis.axis_label = x_label
    # Rotate the category labels (orientation given in radians).
    plot.xaxis.major_label_orientation = 1
    plot.add_tools(HoverTool(tooltips=tooltipVals))
    return plot

#### Avg Number of Parts Used By Year

In [5]:
# Mean of 'num_parts' over the sets of each year, drawn as a bar chart.
avgPartsByYear = plot_agg_bar(
    datasets['sets'], 'year', 'num_parts',
    X_label_name="Year",
    y_label_name='Avg number of Parts',
    agg='mean',
)
show(avgPartsByYear)

  warn(message)


#### Set Color Composition By Year

In [9]:
# Stacked bar chart of part quantities per year, split by colour, restricted
# to the TopNColor colours with the largest total quantity.
TopNColor = 50

# Join sets -> inventories -> inventory parts -> colours.
setInventories = pd.merge(datasets['sets'], datasets['inventories'], on='set_num')
setInventories = setInventories.rename(columns={'id': 'inventory_id'})
setInventoryParts = pd.merge(setInventories, datasets['inventory_parts'], on='inventory_id')
yearColordf = pd.merge(setInventoryParts, datasets['colors'],
                       left_on='color_id', right_on='id')

# Total quantity per RGB value, largest first.
colorTotals = yearColordf.groupby('rgb')['quantity'].sum().reset_index()
colorTotals = colorTotals.sort_values(by='quantity', ascending=False).reset_index(drop=True)
topColors = colorTotals[:TopNColor]

# Quantity per (year, rgb) for the selected colours, pivoted to one column per colour.
inTopColors = yearColordf.rgb.isin(topColors.rgb)
yearColorQuantity = yearColordf[inTopColors][['year', 'rgb', 'quantity']]
groupedData = yearColorQuantity.groupby(['year', 'rgb']).quantity.sum().reset_index()
pivot_data = groupedData.pivot(index='year', values='quantity', columns='rgb').fillna(0)

# Column names are RGB hex strings, so each layer is drawn in its own colour.
p = plot_bar_stacked_chart(pivot_data, 'year', 'Colors', 'Year', 1200, 300, useColsColr=True)
show(p)

#### Theme Composition By Part Category

In [7]:

# Stacked bar chart of part quantities per theme, split by part category,
# restricted to the TopNTheme themes and TopNPartType part categories that
# occur in the most joined rows.
TopNTheme = 50
TopNPartType = 20

# Join sets -> inventories -> inventory parts -> parts, keeping only the
# columns needed further down.
setInventories = pd.merge(datasets['sets'][['set_num', 'theme_id']],
                          datasets['inventories'][['set_num', 'id']],
                          on='set_num').rename(columns={'id': 'inventory_id'})
setInventoryParts = pd.merge(setInventories,
                             datasets['inventory_parts'][['inventory_id', 'part_num', 'quantity']],
                             on='inventory_id')
setPartDf = pd.merge(setInventoryParts, datasets['parts'][['part_cat_id', 'part_num']],
                     on='part_num')

# Attach the part category name. The axis is passed by keyword because newer
# pandas releases no longer accept it positionally.
setPartDf = pd.merge(setPartDf, datasets['part_categories'], left_on='part_cat_id', right_on='id')
setPartDf = setPartDf.drop(['id'], axis=1).rename(columns={'name': 'part_cat_name'})

# Attach the theme; from here on 'name' is the theme name.
ThemePartDf = pd.merge(setPartDf, datasets['themes'], left_on='theme_id', right_on='id')

# Rank themes and part categories by their number of joined rows.
themeCounts = ThemePartDf.groupby('name')['id'].count().reset_index().rename(columns={'id': 'count'})
topThemes = themeCounts.sort_values(by='count', ascending=False)[:TopNTheme]
partTypeCounts = ThemePartDf.groupby('part_cat_name')['id'].count().reset_index().rename(columns={'id': 'count'})
topPartType = partTypeCounts.sort_values(by='count', ascending=False)[:TopNPartType]

# Total quantity per (theme, part category), pivoted to one column per category.
isTopTheme = ThemePartDf.name.isin(topThemes.name)
isTopPartType = ThemePartDf.part_cat_name.isin(topPartType.part_cat_name)
ThemePartTypeCount = ThemePartDf[isTopTheme & isTopPartType]
ThemePartTypeCount = ThemePartTypeCount.groupby(['name', 'part_cat_name'])['quantity'].sum().reset_index()
ThemePartTypeCount_pivot = ThemePartTypeCount.pivot(index='name', values='quantity', columns='part_cat_name').fillna(0)

# Axis titles describe what is plotted: themes on x, summed part quantity on y.
p = plot_bar_stacked_chart(ThemePartTypeCount_pivot, 'name', 'Number of parts', 'Theme', 800, 500, createLegend=True)
# Shrink the legend so its entries fit inside the plot.
p.legend.label_text_font_size = "5pt"
p.legend.glyph_height = 5
p.legend.glyph_width = 5
p.legend.label_height = 5
p.legend.label_width = 5
show(p)

#### Avg Number Of Parts Used By Theme

In [10]:

# Mean of 'num_parts' per theme. Both tables have a 'name' column, so after
# the merge the theme name is available as 'name_y'.
plot_df = datasets['sets'].merge(datasets['themes'], left_on='theme_id', right_on='id')
avgPartsByTheme = plot_agg_bar(
    plot_df, 'name_y', 'num_parts',
    width=600, height=300,
    orderBy=True, orderType=False, TopX=50,
    X_label_name="Theme Name",
    y_label_name='Average number of parts',
    agg='mean',
)
show(avgPartsByTheme)

  warn(message)


#### Number Of Parts By Part Type

In [11]:
# Number of distinct part rows per part category. Both tables have a 'name'
# column, so after the merge the category name is available as 'name_y'.
plot_df = pd.merge(datasets['parts'], datasets['part_categories'], left_on = 'part_cat_id', right_on = 'id')
# The labels describe what is actually plotted: a count of parts per category.
show(plot_agg_bar(plot_df, 'name_y', 'part_num',
                  X_label_name = "Part Category",
                  y_label_name = 'Number of parts',
                  agg = 'count', width=600, height=300))

  warn(message)


### Data Modeling

#### Part type prediction by part description

The model being built tries to predict the part category from the part description. As there are 50+ part categories, there are two ways to build a model for this:
1. **Multiclass classifier**: I used a Random Forest classifier for this, as it is a good general-purpose black-box classifier. It also seems plausible that a sufficiently dense forest, with many more trees than there are classes, can predict the class effectively by building a set of trees for each of the classes.

2. **One-vs-Rest classifier**: I used the Naive Bayes SVM model first proposed [here](https://www.aclweb.org/anthology/P12-2018) to build a binary classifier for each part category. The reason for using this model is that it has been shown to provide a significant performance improvement and is very quick as well. As an implementation detail, I used Logistic Regression instead of an SVM, as sklearn internally uses the same liblinear library for both implementations.  

In [12]:
# Attach each part's category. Both tables have a 'name' column, so the merge
# yields 'name_x' (from parts) and 'name_y' (from part_categories); give them
# descriptive names for the modelling step.
ml_df = datasets['parts'].merge(datasets['part_categories'],
                                left_on='part_cat_id', right_on='id')
ml_df = ml_df.rename(columns={'name_x': 'part_desc', 'name_y': 'part_name'})
ml_df.head()

Unnamed: 0,part_num,part_desc,part_cat_id,id,part_name
0,0687b1,Set 0687 Activity Booklet 1,17,17,Non-LEGO
1,10016414,Sticker Sheet #1 for 41055-1,17,17,Non-LEGO
2,10019stk01,Sticker for Set 10019 - (43274/4170393),17,17,Non-LEGO
3,10026stk01,Sticker for Set 10026 - (44942/4184185),17,17,Non-LEGO
4,10029stk01,Sticker for Set 10029 - (4216816),17,17,Non-LEGO


In [13]:
# As all the words are potentially descriptive, we don't do any sort of preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, log_loss, roc_auc_score
from datetime import datetime
from scipy import sparse
from sklearn.linear_model import LogisticRegression

In [14]:
class nbsvm():
    """Binary classifier: logistic regression on Naive-Bayes-scaled features.

    Each feature is multiplied by the log of the ratio between its smoothed
    average in the positive rows and in the negative rows, and a logistic
    regression is then fitted on the scaled features.

    ``fit`` must be called before ``predict_proba``; it stores the fitted
    regression together with the log-ratio vector in ``self.model``.
    """

    def pr(self, y_i, y, data):
        """Return ``(column sums over rows with y == y_i, plus 1) / (row count, plus 1)``.

        Args:
            y_i: label value selecting the rows to use.
            y: array of labels, one per row of ``data``.
            data: feature matrix supporting boolean row selection and
                ``sum(0)`` (called with a sparse matrix by ``fit``).
        """
        selected = (y == y_i)
        return (data[selected].sum(0) + 1) / (selected.sum() + 1)

    def fit(self, data, y):
        """Fit the model on ``data`` and the binary (0/1) labels ``y``.

        Args:
            data: sparse feature matrix, one row per sample.
            y: pandas Series of labels (its ``.values`` are used).

        Returns:
            The tuple ``(fitted LogisticRegression, log-ratio vector)``, which
            is also stored as ``self.model``.
        """
        y = y.values
        r = sparse.csr_matrix(np.log(self.pr(1, y, data) / self.pr(0, y, data)))
        # The dual formulation is only implemented by the liblinear solver, so
        # request it explicitly rather than relying on the library default.
        m = LogisticRegression(C=4, dual=True, solver='liblinear')
        x_nb = data.multiply(r)
        self.model = (m.fit(x_nb, y), r)
        return self.model

    def predict_proba(self, data):
        """Return the fitted regression's class probabilities for ``data``.

        ``data`` is scaled with the log-ratio vector learnt in ``fit`` before
        being passed to the regression.
        """
        m, r = self.model
        return m.predict_proba(data.multiply(r))
    
def evaluate(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: the true labels.
        y_pred: the predicted labels, in the same order as ``y_true``.

    Returns:
        A dict with the single key 'accuracy Score', holding the result of
        ``accuracy_score(y_true, y_pred)``.
    """
    metrics = {}
    metrics['accuracy Score'] = accuracy_score(y_true, y_pred)
    return metrics

In [15]:
# Multi Class classifier
CV = 5
vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                      smooth_idf=1, sublinear_tf=1)
#using 10% of the data for test purpose

models = [('RFC', RandomForestClassifier(n_estimators = 500, max_depth = 30))]
accuracies = []
for mdlName, mdl in models:
    print('Model Name', mdlName)
    for x in xrange(CV):
        train_x, test_x, train_y, test_y = train_test_split(ml_df['part_desc'], ml_df['part_name'], test_size = 0.1)
        trn_term_doc = vec.fit_transform(train_x)
        test_term_doc = vec.transform(test_x)  
        mdl.fit(trn_term_doc, train_y.reset_index(drop=True))
        pred = mdl.predict(test_term_doc)
        model_eval = evaluate(test_y, pred)
        accuracies.append(model_eval['accuracy Score'])
        print 'For Fold: {}, aacuracy achieved{}'.format(x, accuracies[-1])
    print 'Mean Accuracy', np.array(accuracies).mean()

('Model Name', 'RFC')
For Fold: 0, aacuracy achieved0.736153846154
For Fold: 1, aacuracy achieved0.741153846154
For Fold: 2, aacuracy achieved0.747307692308
For Fold: 3, aacuracy achieved0.729230769231
For Fold: 4, aacuracy achieved0.728461538462
Mean Accuracy 0.7364615384615384


In [17]:
#one vs rest modeling
oneVRestAcc = []
labelEnc = pd.get_dummies(ml_df['part_name'])
models = [('nbsvm', nbsvm())]#, ('extraTreeClassifier', ExtraTreesClassifier(n_jobs=-1, random_state=3))]
for mdlName, mdl in models:
    print('Model Name', mdlName)
    for x in xrange(CV):
        train_x, test_x, train_y, test_y = train_test_split(ml_df['part_desc'], labelEnc, test_size = 0.1)
        x_ = test_y.stack()
        test_y_categorical = pd.Series(pd.Categorical(x_[x_!=0].index.get_level_values(1)))
        trn_term_doc = vec.fit_transform(train_x)
        test_term_doc = vec.transform(test_x)
        preds = np.zeros((test_term_doc.shape[0], labelEnc.shape[1]))
        for i, j in enumerate(labelEnc.columns):
            mdl.fit(trn_term_doc, train_y[j].reset_index(drop=True))
            preds[:,i] = mdl.predict_proba(test_term_doc)[:,1]
        pred_y = []
        for i in range(preds.shape[0]):
            pred_y.append(test_y.columns.values[np.argmax(preds[i,:])])
        accuracy = accuracy_score(test_y_categorical, pred_y)
        oneVRestAcc.append(accuracy)
        print 'For Fold: {}, aacuracy achieved: {}'.format(x, accuracy)
    print 'Mean Accuracy', np.array(oneVRestAcc).mean()
    print '*'*53

('Model Name', 'nbsvm')
For Fold: 0, aacuracy achieved0.903846153846
For Fold: 1, aacuracy achieved0.909615384615
For Fold: 2, aacuracy achieved0.904230769231
For Fold: 3, aacuracy achieved0.910769230769
For Fold: 4, aacuracy achieved0.913461538462
Mean Accuracy 0.9083846153846153
*****************************************************


## Conclusion
Building an individual model for each part category in a one-vs-all fashion, and using a model (NBSVM) that is better suited to NLP tasks, led to better results.

### Future Work
No text preprocessing was done here, both because the part descriptions seemed fairly standardized and because common preprocessing steps such as removing special characters might not be the right thing to do: something like a part number can be very indicative of the part type. Still, it would be worth trying some text preprocessing to see whether it improves the model performance.