Transforming training set words to features with TF-IDF values

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import csv
import numpy as np
from sklearn.model_selection import GridSearchCV

df=pd.read_csv('training.csv')

# cleanup_nums = {"topic": {"IRRELEVANT": 0, "ARTS CULTURE ENTERTAINMENT": 1, "BIOGRAPHIES PERSONALITIES PEOPLE": 2, "DEFENCE": 3, "DOMESTIC MARKETS": 4,
#                                   "FOREX MARKETS": 5, "HEALTH": 6, "MONEY MARKETS": 7, "SCIENCE AND TECHNOLOGY": 8, "SHARE LISTINGS": 9, "SPORTS":10 }}


# df.replace(cleanup_nums, inplace=True)

vectorizer = TfidfVectorizer()

#give each word a tf-idf value
train_features = vectorizer.fit_transform(df['article_words'])

print(train_features.shape)

y_train = df['topic']

print(y_train)


(9500, 35822)
0       FOREX MARKETS
1       MONEY MARKETS
2              SPORTS
3       FOREX MARKETS
4          IRRELEVANT
            ...      
9495          DEFENCE
9496       IRRELEVANT
9497    FOREX MARKETS
9498       IRRELEVANT
9499    FOREX MARKETS
Name: topic, Length: 9500, dtype: object


Transforming test set words to features with TF-IDF values

In [2]:
df_test=pd.read_csv('test.csv')

# cleanup_nums = {"topic": {"IRRELEVANT": 0, "ARTS CULTURE ENTERTAINMENT": 1, "BIOGRAPHIES PERSONALITIES PEOPLE": 2, "DEFENCE": 3, "DOMESTIC MARKETS": 4,
#                                   "FOREX MARKETS": 5, "HEALTH": 6, "MONEY MARKETS": 7, "SCIENCE AND TECHNOLOGY": 8, "SHARE LISTINGS": 9, "SPORTS":10 }}


# df_test.replace(cleanup_nums, inplace=True)

#give each word a tf-idf value
test_features = vectorizer.transform(df_test['article_words'])

print(test_features.shape)

y_test = df_test['topic']

print(y_test)

(500, 35822)
0          IRRELEVANT
1          IRRELEVANT
2       FOREX MARKETS
3          IRRELEVANT
4          IRRELEVANT
            ...      
495        IRRELEVANT
496            SPORTS
497     MONEY MARKETS
498    SHARE LISTINGS
499        IRRELEVANT
Name: topic, Length: 500, dtype: object


Run ML algos

Naive Bayes Rule

In [8]:
#naive bayes rule

from sklearn.naive_bayes import MultinomialNB
NBclf = MultinomialNB()

NBclf.fit(train_features, y_train)

#gets the NB topic predicitions of the test data
NBpredictions = NBclf.predict(test_features)

np.mean(NBpredictions == y_test)

#tuning

# from sklearn.model_selection import GridSearchCV
# parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}
# gs_clf = GridSearchCV(NBclf, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(train_features, y_train)

# gs_clf.best_score_




0.68

Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

RFclf = RandomForestClassifier()

param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3, 'auto'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

grid_search = GridSearchCV(estimator = RFclf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)


RFclf.fit(train_features, y_train)

grid_search.fit(train_features, y_train)
grid_search.best_params_

grid_predictions = grid_search.predict(test_features)


np.mean(grid_predictions == y_test)

# # Use the forest's predict method on the test data
# RFpredictions = RFclf.predict(test_features)

# np.mean(RFpredictions == y_test)






Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 74.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 107.6min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 134.2min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 181.3min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 224.5min finished


0.726

SVM

In [3]:
from sklearn.svm import SVC


SVM_clf = SVC()
SVM_clf.fit(train_features, y_train)
# SVMpredictions = SVM_clf.predict(test_features)


# print("hello")
# np.mean(SVMpredictions == y_test)

param_grid = {'C': [0.1,1, 10, 100], 'gamma': ['scale', 'auto'] ,'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVM_clf,param_grid,refit=True,verbose=2)
grid.fit(train_features, y_train)

# print(grid.best_estimator_)


grid_predictions = grid.predict(test_features)


np.mean(grid_predictions == y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] C=0.1, gamma=scale, kernel=linear ...............................
[CV] ................ C=0.1, gamma=scale, kernel=linear, total=  52.1s
[CV] C=0.1, gamma=scale, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.0s remaining:    0.0s


[CV] ................ C=0.1, gamma=scale, kernel=linear, total=  50.3s
[CV] C=0.1, gamma=scale, kernel=linear ...............................
[CV] ................ C=0.1, gamma=scale, kernel=linear, total=  56.1s
[CV] C=0.1, gamma=scale, kernel=rbf ..................................
[CV] ................... C=0.1, gamma=scale, kernel=rbf, total= 1.2min
[CV] C=0.1, gamma=scale, kernel=rbf ..................................
[CV] ................... C=0.1, gamma=scale, kernel=rbf, total= 1.4min
[CV] C=0.1, gamma=scale, kernel=rbf ..................................
[CV] ................... C=0.1, gamma=scale, kernel=rbf, total= 1.6min
[CV] C=0.1, gamma=scale, kernel=poly .................................
[CV] .................. C=0.1, gamma=scale, kernel=poly, total= 2.6min
[CV] C=0.1, gamma=scale, kernel=poly .................................
[CV] .................. C=0.1, gamma=scale, kernel=poly, total= 2.6min
[CV] C=0.1, gamma=scale, kernel=poly .................................
[CV] .

[CV] ................ C=10, gamma=scale, kernel=sigmoid, total=  36.1s
[CV] C=10, gamma=auto, kernel=linear .................................
[CV] .................. C=10, gamma=auto, kernel=linear, total=  43.3s
[CV] C=10, gamma=auto, kernel=linear .................................
[CV] .................. C=10, gamma=auto, kernel=linear, total=  42.7s
[CV] C=10, gamma=auto, kernel=linear .................................
[CV] .................. C=10, gamma=auto, kernel=linear, total=  46.7s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ..................... C=10, gamma=auto, kernel=rbf, total=  43.5s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ..................... C=10, gamma=auto, kernel=rbf, total=  43.7s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ..................... C=10, gamma=auto, kernel=rbf, total=  43.6s
[CV] C=10, gamma=auto, kernel=poly ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 100.2min finished


0.768

In [5]:
from sklearn.linear_model import SGDClassifier

SVM_clf2 = SGDClassifier(loss='hinge')
SVM_clf2.fit(train_features, y_train)
SVM2predictions = SVM_clf2.predict(test_features)

np.mean(SVM2predictions == y_test)

0.756

Classification report

In [None]:
from sklearn.metrics import classification_report

target_names = ["IRRELEVANT", "ARTS CULTURE ENTERTAINMENT", "BIOGRAPHIES PERSONALITIES PEOPLE", "DEFENCE", "DOMESTIC MARKETS", "FOREX MARKETS", "HEALTH", "MONEY MARKETS", "SCIENCE AND TECHNOLOGY", "SHARE LISTINGS", "SPORTS"] 

print(classification_report(y_test, RFpredictions, target_names=target_names))