In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, feature_extraction, preprocessing, svm, pipeline, metrics, tree, linear_model
from project_utilities import import_train_data, import_test_data

# Raw Data

In [2]:
train_data = pd.DataFrame(import_train_data())
test_data = pd.DataFrame(import_test_data())

It's good to use 80% and 20% of the dataset for training and testing respectively.

In [3]:
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    train_data.drop(columns=['category']), 
    train_data.drop(columns=['text']), 
    test_size=0.2)

# Experimenting with Different Classifiers

The following experiments are already using pipelining. The pipeline structures the raw data. It also extracts and selects features from the structured data.

## Logistic Regression

In [4]:
clf_pipeline_logistic_reg = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', linear_model.LogisticRegression())])

clf_pipeline_logistic_reg.fit(X_train['text'], y_train);

  y = column_or_1d(y, warn=True)


### Prediction on Validation Set

In [5]:
logistic_reg_predictions = clf_pipeline_logistic_reg.predict(X_validation['text'])

print(metrics.classification_report(y_validation, logistic_reg_predictions))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      2452
           1       0.86      0.87      0.86      2548

   micro avg       0.86      0.86      0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



## Decision Tree

In [6]:
clf_pipeline_tree = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', tree.DecisionTreeClassifier())])

clf_pipeline_tree.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [7]:
tree_predictions = clf_pipeline_tree.predict(X_validation['text'])

print(metrics.classification_report(y_validation, tree_predictions))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71      2452
           1       0.72      0.72      0.72      2548

   micro avg       0.72      0.72      0.72      5000
   macro avg       0.72      0.72      0.72      5000
weighted avg       0.72      0.72      0.72      5000



## Linear SVM

In [8]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm.fit(X_train['text'], y_train);

  y = column_or_1d(y, warn=True)


### Prediction on Validation Set

In [9]:
svm_predictions = clf_pipeline_svm.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_predictions))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2452
           1       0.89      0.91      0.90      2548

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



# Experimenting with Different Feature Extraction Pipelines

## Binary Occurences

In [10]:
clf_pipeline_svm_bin = pipeline.Pipeline([
    ('vect', feature_extraction.text.HashingVectorizer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm_bin.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [11]:
svm_bin_predictions = clf_pipeline_svm_bin.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_bin_predictions))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2452
           1       0.89      0.90      0.90      2548

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



## TF-IDF Weighting

In [12]:
clf_pipeline_svm_tfidf = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

clf_pipeline_svm_tfidf.fit(X_train['text'], y_train);

### Prediction on Validation Set

In [13]:
svm_tfidf_predictions = clf_pipeline_svm_tfidf.predict(X_validation['text'])

print(metrics.classification_report(y_validation, svm_tfidf_predictions))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2452
           1       0.90      0.91      0.90      2548

   micro avg       0.90      0.90      0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



## Using TF-IDF and Linear SVM with GridSearch Cross Validation

This is our best classifier so far!

In [14]:
clf_pipeline_svm = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('norm', preprocessing.Normalizer()),
    ('clf', svm.LinearSVC())])

In [15]:
random_state = 551
parameters = {'vect__ngram_range': [(1,1), (1,2), (2,2)],
              'tfidf__use_idf': [True],
              'clf__C': [1, 10],
              'clf__random_state': [551]}

grid_search_cv = model_selection.GridSearchCV(clf_pipeline_svm, parameters, cv=2, n_jobs=6, verbose=10)
grid_search_cv.fit(X_train['text'], y_train)

print('Best Parameters:', grid_search_cv.best_params_);

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   13.4s
[Parallel(n_jobs=6)]: Done   3 out of  12 | elapsed:   26.0s remaining:  1.3min
[Parallel(n_jobs=6)]: Done   5 out of  12 | elapsed:   27.8s remaining:   39.0s
[Parallel(n_jobs=6)]: Done   7 out of  12 | elapsed:   36.1s remaining:   25.8s
[Parallel(n_jobs=6)]: Done   9 out of  12 | elapsed:   53.9s remaining:   17.9s
[Parallel(n_jobs=6)]: Done  12 out of  12 | elapsed:   58.8s finished
  y = column_or_1d(y, warn=True)


Best Parameters: {'clf__C': 10, 'clf__random_state': 551, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


### Prediction on Validation Set

In [16]:
predictions = grid_search_cv.predict(X_validation['text'])

print(metrics.classification.classification_report(y_validation, predictions))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2452
           1       0.91      0.93      0.92      2548

   micro avg       0.91      0.91      0.91      5000
   macro avg       0.92      0.91      0.91      5000
weighted avg       0.92      0.91      0.91      5000



# Prediction on Test Set

In [16]:
predictions_test_set = grid_search_cv.predict(test_data['text'])

d = {'Id' : test_data['id'], 'Category': predictions_test_set}
submission = pd.DataFrame(data=d).sort_values(by=['Id'])

In [25]:
submission.to_csv('submission.csv', encoding='utf-8', index=False)