Transforming training set words to features with TF-IDF values

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import numpy as np

# Load the training articles (one row per article) from disk
df = pd.read_csv('training.csv')

# Learn the vocabulary and IDF weights from the training articles and
# encode every article as a row of TF-IDF values in a single pass
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(df['article_words'])

# Shape is (number of articles, number of distinct terms)
print(train_features.shape)

# The topic column holds the class label for each training article
y_train = df['topic']

print(y_train)


(9500, 35822)
0       FOREX MARKETS
1       MONEY MARKETS
2              SPORTS
3       FOREX MARKETS
4          IRRELEVANT
            ...      
9495          DEFENCE
9496       IRRELEVANT
9497    FOREX MARKETS
9498       IRRELEVANT
9499    FOREX MARKETS
Name: topic, Length: 9500, dtype: object


Transforming test set words to features with TF-IDF values

In [2]:
# Load the test articles from disk
df_test = pd.read_csv('test.csv')

# Re-use the vectorizer fitted on the training set (transform only, no refit)
# so the test matrix shares the training vocabulary and IDF weights
test_features = vectorizer.transform(df_test['article_words'])

# Shape is (number of test articles, number of training-vocabulary terms)
print(test_features.shape)

# The topic column holds the class label for each test article
y_test = df_test['topic']

print(y_test)

(500, 35822)
0          IRRELEVANT
1          IRRELEVANT
2       FOREX MARKETS
3          IRRELEVANT
4          IRRELEVANT
            ...      
495        IRRELEVANT
496            SPORTS
497     MONEY MARKETS
498    SHARE LISTINGS
499        IRRELEVANT
Name: topic, Length: 500, dtype: object


Hyper-parameter tuning of the Stochastic Gradient Descent classifier to find the best parameters

In [3]:
import numpy as np
from sklearn import linear_model
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid to search: stopping tolerance, learning-rate schedule
# and initial learning rate (0.05 to 0.15 in steps of 0.01)
parameters = dict(
    tol=(1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8),
    learning_rate=('constant', 'optimal'),
    eta0=[step * 0.01 for step in range(5, 16)],
)

# Shuffling is switched off so every fit visits the samples in the same order
sgd = linear_model.SGDClassifier(shuffle=False)

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation, fitted on the TF-IDF training features
SGD_clf = GridSearchCV(estimator=sgd, param_grid=parameters)
SGD_clf.fit(train_features, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=False,
                                     tol=0.001, validation_fraction=0.1,
                                     verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'eta0': [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12,
                                  0.13, 0.14, 0.15],
                         'learning_rate': ('constant', 'optimal'),
    

GridSearchCV(cv=None, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=False,
                                     tol=0.001, validation_fraction=0.1,
                                     verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'eta0': [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12,
                                  0.13, 0.14, 0.15],
                         'learning_rate': ('constant', 'optimal'),
                         'tol': (0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07,
                                 1e-08)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [4]:
# Displaying the results of the cross-validation hyper-tuning:
# one row per parameter combination, with fit/score timings, the score on
# each CV split, the mean/std test score and the resulting rank
df_SGD = pd.DataFrame(SGD_clf.cv_results_)
df_SGD

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_eta0,param_learning_rate,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.294770,0.041806,0.015891,0.003538,0.05,constant,0.1,"{'eta0': 0.05, 'learning_rate': 'constant', 't...",0.755263,0.764737,0.762105,0.756316,0.751053,0.757895,0.004915,176
1,0.306445,0.022210,0.016666,0.001715,0.05,constant,0.01,"{'eta0': 0.05, 'learning_rate': 'constant', 't...",0.765263,0.768947,0.763684,0.767368,0.758947,0.764842,0.003450,173
2,0.682532,0.021429,0.013806,0.002863,0.05,constant,0.001,"{'eta0': 0.05, 'learning_rate': 'constant', 't...",0.777895,0.775263,0.775263,0.770000,0.770000,0.773684,0.003158,126
3,2.135914,0.022121,0.014401,0.003458,0.05,constant,0.0001,"{'eta0': 0.05, 'learning_rate': 'constant', 't...",0.774211,0.780000,0.782105,0.771579,0.774211,0.776421,0.003958,62
4,3.715690,0.093014,0.014670,0.003411,0.05,constant,1e-05,"{'eta0': 0.05, 'learning_rate': 'constant', 't...",0.774737,0.779474,0.782632,0.772632,0.774211,0.776737,0.003728,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,1.429490,0.249383,0.017670,0.007107,0.15,optimal,0.0001,"{'eta0': 0.15, 'learning_rate': 'optimal', 'to...",0.776316,0.780000,0.784737,0.771053,0.780000,0.778421,0.004552,45
172,3.517115,0.690001,0.017817,0.004448,0.15,optimal,1e-05,"{'eta0': 0.15, 'learning_rate': 'optimal', 'to...",0.776842,0.778947,0.787368,0.772105,0.781053,0.779263,0.005022,1
173,5.059753,0.504376,0.015058,0.001513,0.15,optimal,1e-06,"{'eta0': 0.15, 'learning_rate': 'optimal', 'to...",0.776842,0.778947,0.787368,0.772105,0.780526,0.779158,0.004989,34
174,7.516248,1.387870,0.024220,0.009982,0.15,optimal,1e-07,"{'eta0': 0.15, 'learning_rate': 'optimal', 'to...",0.776842,0.779474,0.787368,0.772105,0.780526,0.779263,0.004989,1


In [7]:
# Filtered table showing only the tuned parameters together with their
# mean cross-validated accuracy and rank
df_SGD[['param_eta0', 'param_learning_rate', 'param_tol', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_eta0,param_learning_rate,param_tol,mean_test_score,rank_test_score
0,0.05,constant,0.1,0.757895,176
1,0.05,constant,0.01,0.764842,173
2,0.05,constant,0.001,0.773684,126
3,0.05,constant,0.0001,0.776421,62
4,0.05,constant,1e-05,0.776737,59
...,...,...,...,...,...
171,0.15,optimal,0.0001,0.778421,45
172,0.15,optimal,1e-05,0.779263,1
173,0.15,optimal,1e-06,0.779158,34
174,0.15,optimal,1e-07,0.779263,1


In [9]:
# Mean cross-validated accuracy of the top-ranked parameter combination
SGD_clf.best_score_

0.7792631578947369

SGD_clf.best_score_ = 0.7792631578947369

In [10]:
# Parameter combination selected by the grid search.
# NOTE(review): rows with learning_rate='optimal' and tol=1e-05 tie at rank 1
# for different eta0 values in the results table, so eta0 presumably has no
# effect under the 'optimal' schedule and the eta0 reported here is just the
# first of the tied candidates -- confirm against the SGDClassifier docs.
SGD_clf.best_params_

{'eta0': 0.05, 'learning_rate': 'optimal', 'tol': 1e-05}

SGD_clf.best_params_ = 
{'eta0': 0.05, 'learning_rate': 'optimal', 'tol': 1e-05}

In [11]:
# Estimator configured with the best parameters found by the search
# (the search ran with refit=True, as shown in its printed repr)
SGD_clf.best_estimator_

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.05, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=False, tol=1e-05,
              validation_fraction=0.1, verbose=0, warm_start=False)

SGD_clf.best_estimator_ = SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.05, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=False, tol=1e-05,
              validation_fraction=0.1, verbose=0, warm_start=False)

Assigning the best estimator as the new SGD Classifier with the best estimated parameters

In [12]:
from sklearn.linear_model import SGDClassifier

# Build a fresh, unfitted SGD classifier carrying the hyper-parameters of the
# grid search's best estimator. Reading them from SGD_clf (rather than pasting
# the printed repr by hand) keeps this cell in sync whenever the search is
# re-run or its grid changes.
SGD_best_estimator = SGDClassifier(**SGD_clf.best_estimator_.get_params())


Pipelining to find the best classifier between the Stochastic Gradient Descent and Multinomial Naive Bayes classifiers

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# One single-step pipeline per candidate classifier
pipeline_SGD = Pipeline(steps=[('sgd_clf', SGD_best_estimator)])

# Naive Bayes is configured with fit_prior=False
pipeline_MNB = Pipeline(steps=[('mnb_clf', MultinomialNB(fit_prior=False))])

In [14]:
# Candidate pipelines, in the order their names appear in the lookup dictionary
pipelines = [pipeline_SGD, pipeline_MNB]

# "Best so far" bookkeeping, updated when the pipelines are compared
best_accuracy, best_classifier, best_pipeline = 0.0, 0, ""

In [15]:
# Human-readable classifier names keyed by their position in `pipelines`
pipe_dict = dict(enumerate(['Stochastic Gradient Descent', 'Multinomial Naive Bayes']))

# Train every pipeline on the TF-IDF training features and labels
for pipe in pipelines:
    pipe.fit(train_features, y_train)

In [16]:
# Report each classifier's accuracy on the test set
for i, model in enumerate(pipelines):
    test_accuracy = model.score(test_features, y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[i], test_accuracy))

Stochastic Gradient Descent Test Accuracy: 0.758
Multinomial Naive Bayes Test Accuracy: 0.724


In [17]:
# Pick the pipeline with the highest accuracy on the test set.
# Each pipeline is scored exactly once per iteration and the value is reused
# for both the comparison and the bookkeeping; scoring re-runs prediction over
# the whole test set, so calling it twice only duplicated work.
# The strict ">" means the earliest pipeline wins on a tie.
# NOTE(review): the winner is chosen on the same test set that is later used
# to report accuracy, so that figure is optimistically biased; consider
# selecting on a validation split or the cross-validation scores instead.
for i, model in enumerate(pipelines):
    accuracy = model.score(test_features, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy: {}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy: Stochastic Gradient Descent


Assigning the best classifier with the best accuracy from the pipeline to calculate the top 10 articles

In [34]:
# Pull the classifier object out of the winning single-step pipeline
best_clf = pipelines[best_classifier].steps[0][1]

# Accuracy on the test set: fraction of predictions equal to the true topic
best_clf_prediction = best_clf.predict(test_features)
test_hits = best_clf_prediction == y_test
print('test data accuracy: {}'.format(np.mean(test_hits)))

# Same measurement on the training set, for comparison with the test figure
best_clf_prediction_train = best_clf.predict(train_features)
train_hits = best_clf_prediction_train == y_train
print('train data accuracy: {}'.format(np.mean(train_hits)))

test data accuracy: 0.758
train data accuracy: 0.9022105263157895


In [35]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set.
# `target_names` is intentionally not passed: the topic labels are already
# strings, so the report uses them directly as row names. No visible cell
# defines `target_names`, so the previous call depended on leftover kernel
# state and would raise NameError under Restart & Run All.
print(classification_report(y_test, best_clf_prediction))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.33      0.67      0.44         3
BIOGRAPHIES PERSONALITIES PEOPLE       1.00      0.20      0.33        15
                         DEFENCE       0.88      0.54      0.67        13
                DOMESTIC MARKETS       0.50      0.50      0.50         2
                   FOREX MARKETS       0.41      0.27      0.33        48
                          HEALTH       0.69      0.64      0.67        14
                      IRRELEVANT       0.86      0.89      0.87       266
                   MONEY MARKETS       0.52      0.68      0.59        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.40      0.29      0.33         7
                          SPORTS       0.95      0.97      0.96        60

                        accuracy                           0.76       500
                       macro avg    