In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection, feature_extraction, preprocessing, svm, pipeline, metrics, tree, linear_model
import json

# Raw Data

In [4]:
with open('train_data.json') as fd:
    train_data = pd.DataFrame(json.load(fd))
    
with open('test_data.json') as fd:
    test_data = pd.DataFrame(json.load(fd))

It's good to use 80% and 20% of the dataset for training and testing respectively.

In [5]:
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    train_data.drop(columns=['category']), 
    train_data.drop(columns=['text']), 
    test_size=0.2)

# Experimenting with Different Classifiers

The following experiments are already using pipelining. The pipeline structures the raw data. It also extracts and selects features from the structured data.

## Logistic Regression

In [6]:
clf_pipeline_logistic_reg = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', linear_model.LogisticRegression())])

clf_pipeline_logistic_reg.fit(X_train['text'], y_train);

  y = column_or_1d(y, warn=True)


### Prediction on Validation Set

In [7]:
logistic_reg_predictions = clf_pipeline_logistic_reg.predict(X_validation['text'])

print(metrics.classification_report(y_validation, logistic_reg_predictions))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      2492
           1       0.85      0.86      0.86      2508

   micro avg       0.85      0.85      0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



## Decision Tree

In [8]:
clf_pipeline_tree = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', tree.DecisionTreeClassifier())])

clf_pipeline_tree.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [9]:
tree_predictions = clf_pipeline_tree.predict(X_validation['text'])

print(metrics.classification_report(y_validation, tree_predictions))

              precision    recall  f1-score   support

           0       0.71      0.70      0.71      2492
           1       0.71      0.72      0.71      2508

   micro avg       0.71      0.71      0.71      5000
   macro avg       0.71      0.71      0.71      5000
weighted avg       0.71      0.71      0.71      5000



## SVM

In [10]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm.fit(X_train['text'], y_train);

  y = column_or_1d(y, warn=True)


### Prediction on Validation Set

In [11]:
svm_predictions = clf_pipeline_svm.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_predictions))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88      2492
           1       0.88      0.89      0.89      2508

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



# Experimenting with Different Feature Extraction Pipelines

## Binary Occurences

In [12]:
clf_pipeline_svm_bin = pipeline.Pipeline([
    ('vect', feature_extraction.text.HashingVectorizer(binary=True)),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm_bin.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [13]:
svm_bin_predictions = clf_pipeline_svm_bin.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_bin_predictions))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2492
           1       0.88      0.89      0.88      2508

   micro avg       0.88      0.88      0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



## TF-IDF Weighting

In [14]:
clf_pipeline_svm_tfidf = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm_tfidf.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [15]:
svm_tfidf_predictions = clf_pipeline_svm_tfidf.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_tfidf_predictions))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      2492
           1       0.89      0.90      0.90      2508

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



## Custom Parameters
These parameters will used in GridResearch and RandomResearch to see if they can improve accuracy.<br>

textblob_tokenizer, stemming_tokenizer, token.tokenize, and nltk.word_tokenize are the custom paramerters for the tokenizing process in CountVectorizer.

In [17]:
from textblob import TextBlob
from nltk.stem.porter import PorterStemmer
import re
porter_stemmer = PorterStemmer()

# Try TextBlob
def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

# Try NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [18]:
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

In [20]:
import nltk
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Using TF-IDF and Linear SVM

This is our best classifier so far!

In [21]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

### With Grid Search CV

In [24]:
parameters = {'vect__ngram_range': [(1,1), (1,2), (2,2)],
              'vect__max_features':[None],
              'vect__binary': [True],
              'vect__strip_accents': ['ascii'],
              'vect__analyzer':['word'],
              'vect__max_df' :[1.0],
              'vect__tokenizer':[token.tokenize],
              'vect__strip_accents': ['unicode'],
              'norm__norm': ['l2'],
              'tfidf__norm': ['l1'],
              'tfidf__smooth_idf': [False],
              'tfidf__use_idf': [True],
              'clf__random_state': [42],
              'clf__C':[1, 10],
              'clf__fit_intercept': [True]}

grid_search_cv = model_selection.GridSearchCV(clf_pipeline_svm, parameters, cv=2, n_jobs=6, verbose=10, error_score=0)
grid_search_cv.fit(X_train['text'], y_train)

print('Best Parameters:', grid_search_cv.best_params_);

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   10.8s
[Parallel(n_jobs=6)]: Done   3 out of  12 | elapsed:   22.6s remaining:  1.1min
[Parallel(n_jobs=6)]: Done   5 out of  12 | elapsed:   25.0s remaining:   35.0s
[Parallel(n_jobs=6)]: Done   7 out of  12 | elapsed:   32.7s remaining:   23.3s
[Parallel(n_jobs=6)]: Done   9 out of  12 | elapsed:   51.9s remaining:   17.2s
[Parallel(n_jobs=6)]: Done  12 out of  12 | elapsed:   56.8s finished
  y = column_or_1d(y, warn=True)


Best Parameters: {'clf__C': 10, 'clf__random_state': 551, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


#### Prediction on Validation Set

In [25]:
predictions = grid_search_cv.predict(X_validation['text'])

print(metrics.classification.classification_report(y_validation, predictions))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90      2492
           1       0.90      0.91      0.91      2508

   micro avg       0.91      0.91      0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000



### With Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as randint
from scipy.stats import uniform

seed = 42

random_search = RandomizedSearchCV(clf_pipeline_svm, param_distributions = parameters, cv=2, verbose = 10, random_state = seed, n_iter = 60)
random_search.fit(X_train['text'], y_train)

#### Prediction on Validation Set

In [None]:
y_pred_randcv = random_search.predict(X_validation['text'])
print(metrics.classification_report(y_validation, y_pred_randcv))

# Prediction on Test Set Using Our Best Classifier

In [14]:
test_set_predictions = grid_search_cv.predict(test_data['text'])

data = {'Id' : test_data['id'], 'Category': test_set_predictions}
submission = pd.DataFrame(data=data)
submission = submission.apply(pd.to_numeric).sort_values(by=['Id'])

In [15]:
submission.to_csv('submission.csv', encoding='utf-8', index=False)