In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, feature_extraction, preprocessing, svm, pipeline, metrics
from project_utilities import import_train_data, import_test_data

# Raw Data

In [2]:
train_data = pd.DataFrame(import_train_data())
test_data = pd.DataFrame(import_test_data())

It's good to use 80% and 20% of the dataset for training and testing respectively.

In [3]:
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    train_data.drop(columns=['category']), 
    train_data.drop(columns=['text']), 
    test_size=0.2)

# Pipeline

The pipeline structures the raw data. It also extracts and selects features from the structured data.

## Using TF-IDF and Linear SVM with GridSearch Cross Validation

In [4]:
classifier_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

In [5]:
random_state = 551
parameters = {'vect__ngram_range': [(1,1), (1,2), (2,2)],
             'tfidf__use_idf': [True, False],
             'clf__C': [1, 10],
             'clf__random_state': [551]}

grid_search_cv = model_selection.GridSearchCV(classifier_pipeline, parameters, cv=2, n_jobs=6, verbose=10)
grid_search_cv.fit(X_train['text'], y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   11.2s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   31.1s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   54.7s
[Parallel(n_jobs=6)]: Done  16 out of  24 | elapsed:  1.2min remaining:   35.3s
[Parallel(n_jobs=6)]: Done  19 out of  24 | elapsed:  1.5min remaining:   23.7s
[Parallel(n_jobs=6)]: Done  22 out of  24 | elapsed:  1.7min remaining:    9.3s
[Parallel(n_jobs=6)]: Done  24 out of  24 | elapsed:  1.8min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid='warn', n_jobs=6,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2), (2, 2)], 'tfidf__use_idf': [True, False], 'clf__C': [1, 10], 'clf__random_state': [551]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [6]:
print('Best Parameters:', grid_search_cv.best_params_)

Best Parameters: {'clf__C': 10, 'clf__random_state': 551, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


# Prediction on Validation Set

In [7]:
predictions = grid_search_cv.predict(X_validation['text'])

print(metrics.classification.classification_report(y_validation, predictions))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2507
           1       0.89      0.91      0.90      2493

   micro avg       0.90      0.90      0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



# Prediction on Test Set

In [16]:
predictions_test_set = grid_search_cv.predict(test_data['text'])

d = {'Id' : test_data['id'], 'Category': predictions_test_set}
submission = pd.DataFrame(data=d).sort_values(by=['Id'])

In [25]:
submission.to_csv('submission.csv', encoding='utf-8', index=False)