In [18]:
import pandas as pd
df = pd.read_csv("train.csv")
df = df.dropna()
df.shape

(2476, 7)

In [19]:
from sklearn.model_selection import train_test_split

X = df[['id', 'description']]
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1980, 2), (496, 2), (1980,), (496,))

In [21]:
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_sm")

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(X_train['description']):
    doc_tokens = []
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
X_train['tokens'] = tokens
X_train['tokens'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2104    [sourced, whiskey,, moved, bourbon, barrels,, ...
222     [bottled, commemorate, 150th, anniversary, can...
281     [thomas, chen, introduced, canadian, rockies, ...
2631    [mix, bourbon, quarter, casks, finished, pedro...
732     [marriage, 13, 16, year, old, bourbons, honori...
Name: tokens, dtype: object

In [28]:
train_docs = list(X_train['tokens'])

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=6000, tokenizer=lambda doc: doc, lowercase=False)

dtm_train = tfidf.fit_transform(train_docs)

# View Feature Matrix as DataFrame
train_df = pd.DataFrame(dtm_train.todense(), columns = tfidf.get_feature_names())
train_df.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,$160.,$60,(375,(400,"(5,000",(50,...,½,ìle,‘house’,‘rothes,‘the,“a,“aged,“ardbeg,“new”,€50
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.linear_model import LogisticRegression 

classifier = LogisticRegression(solver='lbfgs')

classifier.fit(dtm_train.todense(), y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
nlp = spacy.load("en_core_web_sm")

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(X_test['description']):
    doc_tokens = []
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
X_test['tokens'] = tokens
X_test['tokens'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


1000    [sounding, like, character, asterix, whisky,, ...
2458    [pittsburgh, distillery’s, series, “whims”, on...
1061    [youngest, manse, brae, triumvirate,, freshest...
2737    [inchmurrin, enjoyed, higher, profile, early, ...
1182    [2009,, brewers, bob, baxter, alan, hansen, ad...
Name: tokens, dtype: object

In [31]:
test_docs = list(X_test['tokens'])

tfidf = TfidfVectorizer(stop_words='english', max_features=6000, tokenizer=lambda doc: doc, lowercase=False)

dtm_test = tfidf.fit_transform(test_docs)

# View Feature Matrix as DataFrame
test_df = pd.DataFrame(dtm_test.todense(), columns = tfidf.get_feature_names())
test_df.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,"""comfort""","""polished","""sure",#2096,...,“ultra,“weight”,“whims”,“work,“young,Unnamed: 17,"€1,000",€21,€40,€42
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.164746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193724,0.0
3,0.122392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
classifier.score(dtm_test.todense(), y_test)

0.6169354838709677

In [14]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/84/4e2cae6247f397f83d8adc5c2a2a0c5d7d790a14a4c7400ff6574586f589/xgboost-0.90.tar.gz (676kB)
[K    100% |████████████████████████████████| 686kB 7.2MB/s 
Building wheels for collected packages: xgboost
  Building wheel for xgboost (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/ljohnson/Library/Caches/pip/wheels/e9/48/4d/de4187b5270dff71d3697c5a7857a1e2d9a0c63a28b3462eeb
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.90


In [34]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(max_depth=7, n_jobs=-1)
xgb_classifier.fit(dtm_train.todense(), y_train)
xgb_classifier.score(dtm_test.todense(), y_test)

0.5625

In [35]:
# trying it the J.C. way
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

vect = TfidfVectorizer(stop_words='english', max_features=5000, tokenizer=lambda doc: doc, lowercase=False)
sgdc = SGDClassifier()

pipe = Pipeline([('vect', vect), ('clf', sgdc)])

# Fit Pipeline
pipe.fit(train_docs, y_train)
# test pipeline
pipe.score(test_docs, y_test)

  'stop_words.' % sorted(inconsistent))


0.9233870967741935

In [37]:
# with a grid search
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__max_iter':(20, 10, 100)
}

grid_search = GridSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)

grid_search.fit(train_docs, y_train)
grid_search.score(test_docs, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.2s finished
  'stop_words.' % sorted(inconsistent))


0.9254032258064516

In [53]:
# Lets submit a prediction with the test data
test = pd.read_csv("test.csv")

X = test[['id', 'description']]

In [54]:
test.shape, X.shape

((288, 6), (288, 2))

In [55]:
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(X['description']):
    doc_tokens = []
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
X['tokens'] = tokens
X['tokens'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


0    [think, carnival, aromas—the, good, ones,, any...
1    [blend, bourbons,, 6, 12, years, old;, rye, wh...
2    [nose, focused, cereal,, hints, fresh, ripe, c...
3    [swiss-based, chapter, 7, released, 19, year, ...
4    [valkyrie, replaces, current, dark, origins, e...
Name: tokens, dtype: object

In [56]:
docs = list(X['tokens'])

In [65]:
#going to submit this prediction
y_pred = grid_search.predict(docs)

sample_submission = pd.read_csv('sample_submission.csv')
submission = sample_submission.copy()
submission['category'] = y_pred
submission = submission.astype('int64')

submission.to_csv('LJ-first-submission.csv', index=False)
#read it back to make sure we have ints not floats
submission = pd.read_csv("LJ-first-submission.csv")
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [66]:
# download file

from IPython.display import HTML
import base64

def create_download_link( df, title = "Download CSV file", filename = "data.csv"):  
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(submission)

In [67]:
#trying out LSI

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=10)

In [68]:
# LSI

lsi = Pipeline([('vect', vect), ('svd', svd)])

In [73]:
# Pipe

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

params = {
    'lsi__vect__max_df':(0.5, 0.75, 1.0)
}

grid_search = GridSearchCV(pipe, params, cv=5, n_jobs=-1, verbose=1)

grid_search.fit(train_docs, y_train)
grid_search.score(test_docs, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   10.1s finished
  'stop_words.' % sorted(inconsistent))


0.9173387096774194