In [1]:
# Import Statements
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [2]:
# load competition data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train.shape, test.shape

((2586, 3), (288, 2))

In [4]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [5]:
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [6]:
nlp = spacy.load("en_core_web_lg")

In [7]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [11]:
X = get_word_vectors(train['description'])

In [12]:
rfc = RandomForestClassifier(n_estimators=100)

In [13]:
rfc.fit(X, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
X_test = get_word_vectors(test['description'])

In [16]:
pred = rfc.predict(X_test)

In [17]:
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [18]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,1
2,1390,1
3,1024,1
4,1902,1


In [19]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 2 columns):
id          288 non-null int64
category    288 non-null int64
dtypes: int64(2)
memory usage: 4.6 KB


In [20]:
# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model
submission.to_csv('./data/submission_01.csv', index=False)