In [1]:
import pandas as pd
df = pd.read_csv("train.csv")
df = df.dropna()
df.shape

(2476, 7)

In [2]:
from sklearn.model_selection import train_test_split

X = df[['id', 'description']]
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1980, 2), (496, 2), (1980,), (496,))

In [4]:
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_sm")

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(X_train['description']):
    doc_tokens = []
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
X_train['tokens'] = tokens
X_train['tokens'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


830     [compared, 10, year, old,, one’s, richer,, dar...
906     [singleton, bottlings,, dufftown, aimed, europ...
2613    [originally, one-off, bottling, friends, class...
1840    [similar, personality, younger, standard, 1993...
838     [cask, 328, filled, glenturret, december, 16,,...
Name: tokens, dtype: object

In [6]:
train_docs = list(X_train['tokens'])

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=5000, tokenizer=lambda doc: doc, lowercase=False)

dtm_train = tfidf.fit_transform(train_docs)

# View Feature Matrix as DataFrame
train_df = pd.DataFrame(dtm_train.todense(), columns = tfidf.get_feature_names())
train_df.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,$160.,(375,(400,"(5,000",(50,(500,...,ìle,‘bodega’,‘house’,‘rothes,“aged,“single,“small,“young,Unnamed: 20,€50
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.linear_model import LogisticRegression 

classifier = LogisticRegression(solver='lbfgs')

classifier.fit(dtm_train.todense(), y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
nlp = spacy.load("en_core_web_sm")

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(X_test['description']):
    doc_tokens = []
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
X_test['tokens'] = tokens
X_test['tokens'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


608     [style:, highland, single, malt, scotch, color...
1642    [aged, island, dram, mull, distillery’s, lengt...
2382    [aged, bourbon, cask., big,, vibrantly, fruity...
1669    [single, barrel, releases, aren’t, uncommon, c...
423     [deep, amber., generous, sweet, sherried, nose...
Name: tokens, dtype: object

In [10]:
test_docs = list(X_test['tokens'])

tfidf = TfidfVectorizer(stop_words='english', max_features=5000, tokenizer=lambda doc: doc, lowercase=False)

dtm_test = tfidf.fit_transform(test_docs)

# View Feature Matrix as DataFrame
test_df = pd.DataFrame(dtm_test.todense(), columns = tfidf.get_feature_names())
test_df.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,"""whisky",#1,#35,#55-6f,#9315.,$100.,...,“sour,“spirits,“straight,“straight.”,“texas,“traditional,“upgrade”,“whiskey”,“wine,“work
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284077,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
classifier.score(dtm_test.todense(), y_test)

0.6310483870967742

In [14]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/84/4e2cae6247f397f83d8adc5c2a2a0c5d7d790a14a4c7400ff6574586f589/xgboost-0.90.tar.gz (676kB)
[K    100% |████████████████████████████████| 686kB 7.2MB/s 
Building wheels for collected packages: xgboost
  Building wheel for xgboost (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/ljohnson/Library/Caches/pip/wheels/e9/48/4d/de4187b5270dff71d3697c5a7857a1e2d9a0c63a28b3462eeb
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.90


In [17]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(max_depth=14)
xgb_classifier.fit(dtm_train.todense(), y_train)
xgb_classifier.score(dtm_test.todense(), y_test)

0.6048387096774194

In [16]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()
clf.fit(dtm_train.todense(), y_train)
clf.score(dtm_test.todense(), y_test)



0.5967741935483871

In [None]:
y_pred = classifier.predict(test)

sample_submission = pd.read_csv('https://raw.githubusercontent.com/livjab/DS3-Kaggle-Comptetion/master/sample_submission.csv')
submission = sample_submission.copy()
submission['status_group'] = y_pred

from google.colab import files
submission.to_csv('LJ-fourth-submission.csv', index=False)
files.download('LJ-fourth-submission.csv')