# SGD Classifier

### Training the model on a small dataset that contains an almost equal number of observations for each category.

In [2]:
# Importing libraries

import numpy as np
import pandas as pd
pd.options.display.max_rows = 500

In [23]:
# Read both CSV files into DataFrames: the first path is the training data,
# the second is the Finery test data.

data, finery = (
    pd.read_csv(csv_path)
    for csv_path in ('1210_6000.csv', '1210test_data.csv')
)

In [4]:
# Peek at the first five rows, flipped so that every column reads as a row
# (easier to scan when the text fields are long).

data.head().T

Unnamed: 0,0,1,2,3,4
brandnamestore,astrthelabel anika top planetblue,bebe grosgrain trim bustier bebe,j.crewfactory drapey tank top with silky hem j...,freepeople jordan burnout tee nordstrom,openingceremony broderie anglaise cotton top t...
categoryId,110,110,110,110,110
itemName,anika top,grosgrain trim bustier,drapey tank top with silky hem,jordan burnout tee,broderie anglaise cotton top
brandnamestore_tokens,"['astrthelabel', 'anika', 'top', 'planetblue']","['bebe', 'grosgrain', 'trim', 'bustier', 'bebe']","['crewfactory', 'drapey', 'tank', 'top', 'silk...","['freepeople', 'jordan', 'burnout', 'tee', 'no...","['openingceremony', 'broderie', 'anglaise', 'c..."
brandnamestore_tokens_clean,astrthelabel anika top planetblue,bebe grosgrain trim bustier bebe,crewfactory drapey tank top silky hem crewfactory,freepeople jordan burnout tee nordstrom,openingceremony broderie anglaise cotton top t...


In [5]:
# Count the missing (NaN) values in every column.
data.isna().sum()

# Result: exactly one missing value, in 'brandnamestore_tokens_clean'.

brandnamestore                 0
categoryId                     0
itemName                       0
brandnamestore_tokens          0
brandnamestore_tokens_clean    1
dtype: int64

In [6]:
# Discard every row that contains a NaN in any column.

data = data.dropna()

In [7]:
# After tokenization one row of the Finery test data has NaN in the cleaned
# token column. Replace it with an empty string so every row handed to the
# text vectorizer is a string. Only the text column used by the model is
# touched: a blanket fillna('') would also write '' into missing cells of
# every other column and turn numeric columns into object dtype.

finery['brandnamestore_tokens_clean'] = finery['brandnamestore_tokens_clean'].fillna('')

In [8]:
# Assign the training data: the cleaned token text is the model input (X),
# the category id is the label to predict (Y).

X, Y = data['brandnamestore_tokens_clean'], data['categoryId']

In [9]:
# train_test_split for train and test data (disabled: the model below is fitted on the entire dataset)

# try:
#     from sklearn.model_selection import train_test_split
# except: 
#     from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=42)

In [10]:
# X_test.head()

In [11]:
# X_train.head()

In [12]:
# y_test.shape

In [13]:
# y_train.shape

In [14]:
# Cleaned product-token text of the test data; this is what gets predicted on.

finery_test = finery['brandnamestore_tokens_clean']

In [15]:
# Custom stop-word list. The word 'top' is deliberately left out because it
# is a women's clothing keyword and has to survive into the features.
# The words are kept in a whitespace-separated string and split into a list;
# order and content are significant only in that the list is passed as-is to
# the vectorizer.

mystopwords = """
all amount am might give here alone nobody formerly indeed
made seeming five with amongst somewhere someone an he hereafter
empty their among out themselves bottom no make behind co
to they anyhow each detail sixty her what would sincere
this for across least too any up becomes myself those
some upon onto therein herein many third along will re
ours already before during twenty whence though last none serious
show less a i enough since first must whenever cannot
nor should thereupon it put whatever until something unless much
therefore fire every still hers latterly whole so bill go
own eight is us beside whereafter yet anyway she perhaps
why both hundred four namely who its over there my
whereas doing say fify except most besides another describe only
these nowhere may everyone ltd been hasnt was towards never
were under we if several that computer side elsewhere un
wherein see well between amoungst else his always throughout be
ever wherever itself did via had whom being back get
your and former six etc everywhere now while same doesn
whither beforehand using anything didn sometime hence done inc other
really part ie hereupon thru about where found them mill
name become thence ten further system next call below on
because as again above otherwise seems three mostly the take
our although once one per also few are yours has
ourselves km how yourself than full meanwhile thus however within
two regarding various moreover himself at anywhere con noone without
cant do mine often herself rather whether toward twelve around
nine thereafter against into eg others of after de front
seem in whereby or used could whoever eleven such whose
thin but off latter even cry anyone afterwards becoming yourselves
quite thick forty couldnt find neither became hereby not which
then him more somehow everything by together interest fifteen keep
kg nevertheless can you have sometimes either does when thereby
don seemed move from almost nothing down very please me
just fill due through whereupon beyond
""".split()

# SGD Classifier

# 1

In [16]:
# Build the classification pipeline with the best parameters found by grid
# search: TF-IDF text features followed by a linear classifier trained
# with stochastic gradient descent.

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

p = Pipeline(steps=[
    # Word 1- to 3-grams; terms that occur in fewer than 5 documents or in
    # more than half of all documents are ignored, as are the custom stop words.
    ('tfidf', TfidfVectorizer(stop_words=mystopwords, use_idf=True, min_df=5,
                              max_df=0.5, ngram_range=(1, 3))),
    ('sgd', SGDClassifier(
        alpha=0.00001, average=False, class_weight='balanced', epsilon=0.1,
        eta0=0.0, fit_intercept=True, l1_ratio=0,
        # loss='log' is the logistic-regression loss; it is what makes
        # predict_proba available on the fitted model.
        # NOTE: scikit-learn >= 1.1 calls this loss 'log_loss' and 1.3 removed
        # the old 'log' spelling - rename it when upgrading.
        learning_rate='optimal', loss='log', max_iter=1000,
        # The former `n_iter=None` argument is intentionally not passed: None
        # was its default, and scikit-learn 0.21 removed the parameter, so
        # passing it raises TypeError on current versions.
        n_jobs=-1, penalty='l2', power_t=0.5,
        # Fixed seed: with shuffle=True the training order is random, so an
        # unseeded run yields different predictions every time.
        random_state=42,
        shuffle=True, tol=0.001, verbose=0, warm_start=False)),
])


In [17]:
# Train the whole pipeline on the entire dataset (no hold-out split).

p.fit(X, y=Y)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=5,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...'l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False))])

In [18]:
#used for train test split

# predictions = p.predict(X_test)

# correct = sum(predictions == y_test)
# incorrect = len(predictions) - correct
# print(('got {} / {} correct [{:.2%}]').format(correct, correct + incorrect, correct / float(correct + incorrect)))

In [19]:
# p.score(X_test, y_test)

In [20]:
# Class probabilities for a single word or product name; only the categories
# whose probability exceeds 0.1 are shown, as (category, probability) pairs.

predictions = p.predict_proba(['boots'])
[
    (category, probability)
    for category, probability in zip(p.classes_, predictions[0])
    if probability > 0.1
]

[(200.0, 0.9443169968441077)]

In [21]:
# Full probability row for the query above: one value per category, in the
# same order as p.classes_.

predictions

array([[1.68913428e-02, 3.25343505e-03, 1.76620489e-03, 3.17376182e-03,
        1.20805766e-03, 2.17336103e-03, 4.02099833e-03, 3.93849667e-03,
        9.44316997e-01, 1.93430478e-03, 2.46969357e-03, 3.44961128e-03,
        1.88583481e-03, 1.22175735e-04, 9.39572474e-03]])

In [22]:
# Predicting the category of every row of the Finery test data

pred = p.predict(finery_test)

In [57]:
# Attach the predicted categories to the Finery dataframe as a new
# 'Predict' column.

finery = finery.assign(Predict=pred)

In [58]:
# Save the test data together with its predictions as a CSV file; the row
# index is not written.

finery.to_csv(path_or_buf='SGD1212w_predictnp.csv', index=False)