In [1]:
import numpy as np
import pandas as pd
import os
from sklearn import model_selection, feature_extraction, preprocessing, svm, pipeline, metrics, tree, linear_model
from project_utilities import import_train_data, import_test_data

# Raw Data

In [2]:
os.chdir(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\train\train')
path = os.getcwd()
import_train_data = import_train_data(path)

os.chdir(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\test\test')
path = os.getcwd()
import_test_data = import_test_data(path)

In [3]:
train_data = pd.DataFrame(import_train_data)
test_data = pd.DataFrame(import_test_data)

It's good to use 80% and 20% of the dataset for training and testing respectively.

In [4]:
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    train_data.drop(columns=['category']), 
    train_data.drop(columns=['text']), 
    test_size=0.2,random_state = 42)

# Experimenting with Different Classifiers

The following experiments are already using pipelining. The pipeline structures the raw data. It also extracts and selects features from the structured data.

## Logistic Regression

In [None]:
clf_pipeline_logistic_reg = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', linear_model.LogisticRegression())])

clf_pipeline_logistic_reg.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [None]:
logistic_reg_predictions = clf_pipeline_logistic_reg.predict(X_validation['text'])

print(metrics.classification_report(y_validation, logistic_reg_predictions))

## Decision Tree

In [None]:
clf_pipeline_tree = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', tree.DecisionTreeClassifier())])

clf_pipeline_tree.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [None]:
tree_predictions = clf_pipeline_tree.predict(X_validation['text'])

print(metrics.classification_report(y_validation, tree_predictions))

## Linear SVM

In [None]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [None]:
svm_predictions = clf_pipeline_svm.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_predictions))

# Experimenting with Different Feature Extraction Pipelines

## Binary Occurences

In [None]:
clf_pipeline_svm_bin = pipeline.Pipeline([
    ('vect', feature_extraction.text.HashingVectorizer(binary=True)),#Convert a collection of text documents to a matrix of token occurrences
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm_bin.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [None]:
svm_bin_predictions = clf_pipeline_svm_bin.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_bin_predictions))

## TF-IDF Weighting

In [None]:
clf_pipeline_svm_tfidf = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer(tokenizer = token.tokenize)), #(nltk.word_tokenize,textblob_tokenizer,token.tokenize is good)
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm_tfidf.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [None]:
svm_tfidf_predictions = clf_pipeline_svm_tfidf.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_tfidf_predictions))

# Some customer parameters
These parameters will used in GridResearch and RandomResearch to see if they can improve the occurency

```textblob_tokenizer```, ```stemming_tokenizer```, ```token.tokenize```, and ```nltk.word_tokenize``` is the customer paramerters for paremeter ```tokenize``` in ```CountVectorizer```, and ```MyAnalyzer``` is the customer parameters for ```analysis``` of ```CountVectorizer```

### textblob_tokenizer and stemming_tokenizer
There two worse the results

In [1]:
from textblob import TextBlob
from nltk.stem.porter import PorterStemmer
import re
porter_stemmer = PorterStemmer()

# Use TextBlob
def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

### token.tokenize
Improve the results which is the best one

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

### nltk.word_tokenize
Improve the results

In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\35904\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### MyAnalyzer
Worse the results

In [None]:
class MyAnalyzer(object):
    
    # load spaCy's english model and define the tokenizer/lemmatizer
    def __init__(self):
        spacy.load('en')
        self.lemmatizer_ = spacy.lang.en.English()
        
    # allow the class instance to be called just like
    # just like a function and applies the preprocessing and
    # tokenize the document
    def __call__(self, doc):
        doc_clean = unescape(doc).lower()
        tokens = self.lemmatizer_(doc_clean)
        return([token.lemma_ for token in tokens])
    
analyzer = MyAnalyzer()


## Using TF-IDF and Linear SVM with GridSearch Cross Validation

This is our best classifier so far!

In [None]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

### NuSVC (optional)

In [7]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),#tokenizer=nltk.word_tokenize
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.NuSVC())])

In [None]:
clf_pipeline_svm.get_params()

In [None]:
random_state = 42

parameters = {'vect__ngram_range': [(1,2)],#(1,1), (2,2),(1,3),(2,3)
              'vect__max_features':[None],  # max (20000, 68354) #,10000,20000, 30000,40000, 50000,60000
              'vect__binary': [True],#  , False
              'vect__strip_accents': ['ascii'], #,'unicode',None
              'vect__analyzer':['word'],# 'char', 'char_wb'
              'vect__max_df' :[1.0], # 'C': [0.1, 1, 10, 100]
              'vect__tokenizer':[token.tokenize], #None, textblob_tokenizer,stemming_tokenizer, nltk.word_tokenize,my_tokenizer,token.tokenize
              'vect__strip_accents': ['unicode'],# None,
              'norm__norm': ['l2'], #,'l1'
              'tfidf__norm': ['l1'], #, 'l2'
              'tfidf__smooth_idf': [False],#True, 
              'tfidf__use_idf': [True],
#               'clf__kernel':['rbf','linear','poly','sigmoid','precomputed'],
#               'clf__gamma':[0.001, 0.01, 0.1, 1],
#               'clf__degree':[1,2,3,4],
#               'clf__nu':[0.2, 0.4, 0.5, 0.6, 0.8],
              'clf__random_state': [42],
              'clf__C':[10],
              'clf__fit_intercept': [True], #, False
             }

grid_search_cv = model_selection.GridSearchCV(clf_pipeline_svm, parameters, cv=2, n_jobs=6, verbose=20)
grid_search_cv.fit(X_train['text'], y_train)

print('Best Parameters:', grid_search_cv.best_params_);

### Prediction on Validation Set

In [None]:
predictions = grid_search_cv.predict(X_validation['text'])

print(metrics.classification.classification_report(y_validation, predictions))

### Linear SVM with GridSearch

In [None]:
import numpy as np
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as randint
from scipy.stats import uniform

seed = 42

parameters = {'vect__ngram_range': [(1,2)],#(1,1), (2,2),(1,3),(2,3)
              'vect__max_features':[None],  # max (20000, 68354) #,10000,20000, 30000,40000, 50000,60000
              'vect__binary': [True],#  , False
              'vect__strip_accents': ['ascii'], #,'unicode',None
              'vect__analyzer':['word'],# 'char', 'char_wb'
              'vect__max_df' :[1.0], # 'C': [0.1, 1, 10, 100]
              'vect__tokenizer':[token.tokenize], #None, textblob_tokenizer,stemming_tokenizer, nltk.word_tokenize,my_tokenizer,token.tokenize
              'vect__strip_accents': ['unicode'],# None,
              'norm__norm': ['l2'], #,'l1'
              'tfidf__norm': ['l1'], #, 'l2'
              'tfidf__smooth_idf': [False],#True, 
              'tfidf__use_idf': [True],
#               'clf__kernel':['rbf','linear','poly','sigmoid'], #,'precomputed'
#               'clf__gamma':[0.001, 0.01, 0.1, 1],
#               'clf__degree':[1,2,3,4],
#               'clf__nu':[0.2, 0.4, 0.5, 0.6, 0.8],
#               'clf__random_state': [42],
              'clf__C':[10],
              'clf__fit_intercept': [True], #, False
              
             }

random_search = RandomizedSearchCV(clf_pipeline_svm, param_distributions = parameters, cv=2, verbose = 10, random_state = seed, n_iter = 60)
random_search.fit(X_train['text'], y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 60 candidates, totalling 120 fits
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.8, clf__kernel=linear, clf__gamma=0.001, clf__degree=3 


  y = column_or_1d(y, warn=True)


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.8, clf__kernel=linear, clf__gamma=0.001, clf__degree=3, score=0.8804119588041196, total=10.0min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.8, clf__kernel=linear, clf__gamma=0.001

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 13.8min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.8, clf__kernel=linear, clf__gamma=0.001, clf__degree=3, score=0.8865886588658866, total= 7.9min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=linear, clf__gamma=1, cl

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 25.0min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=linear, clf__gamma=1, clf__degree=3, score=0.9011098890110989, total=12.1min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=linear, clf__gamma=1, clf__d

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 40.2min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=linear, clf__gamma=1, clf__degree=3, score=0.9041904190419042, total=12.0min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=linear, clf__gamma=0.01, clf

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 55.3min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=linear, clf__gamma=0.01, clf__degree=1, score=0.8982101789821018, total= 8.0min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=linear, clf__gamma=0.01, 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 66.4min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=linear, clf__gamma=0.01, clf__degree=1, score=0.9007900790079008, total= 7.9min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=poly, clf__gamma=0.001, c

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 77.3min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=poly, clf__gamma=0.001, clf__degree=3, score=0.8478152184781522, total= 1.7min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=poly, clf__gamma=0.001, cl

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 79.8min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.2, clf__kernel=poly, clf__gamma=0.001, clf__degree=3, score=0.7038703870387039, total= 1.7min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=sigmoid, clf__gamma=1, clf

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 82.2min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=sigmoid, clf__gamma=1, clf__degree=1, score=0.8988101189881011, total= 7.4min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=sigmoid, clf__gamma=1, clf_

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 92.7min remaining:    0.0s


[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.6, clf__kernel=sigmoid, clf__gamma=1, clf__degree=1, score=0.9006900690069007, total= 7.4min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.8, clf__kernel=linear, clf__gamma=1, clf__

[CV]  vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.4, clf__kernel=rbf, clf__gamma=0.01, clf__degree=2, score=0.9013098690130987, total= 8.8min
[CV] vect__tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>, vect__strip_accents=unicode, vect__ngram_range=(1, 2), vect__max_features=None, vect__max_df=1.0, vect__binary=True, vect__analyzer=word, tfidf__use_idf=True, tfidf__smooth_idf=False, tfidf__norm=l1, norm__norm=l2, clf__random_state=42, clf__nu=0.4, clf__kernel=rbf, clf__gamma=0.01, clf__d



TypeError: Sparse precomputed kernels are not supported.

In [None]:
report(random_search.cv_results_)
y_pred = random_search.predict(X_validation['text'])
print(metrics.classification_report(y_validation, y_pred))

# Prediction on Test Set Using Our Best Classifier

In [None]:
test_set_predictions = grid_search_cv.predict(test_data['text'])
data = {'Id' : test_data['id'], 'Category': test_set_predictions}
submission = pd.DataFrame(data=data)
submission = submission.apply(pd.to_numeric).sort_values(by=['Id'])

In [None]:
submission.to_csv('submission6.csv', encoding='utf-8', index=False)

In [None]:
# # submission 6 parameters

# Best Parameters: {'clf__C': 10, 'clf__fit_intercept': True, 'clf__random_state': 42, 'norm__norm': 'l2', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': False, 'tfidf__use_idf': True, 'vect__analyzer': 'word', 'vect__binary': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__ngram_range': (1, 2), 'vect__strip_accents': 'unicode', 'vect__tokenizer': <bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>}