Transforming training set words to features with TF-IDF values

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import numpy as np

df=pd.read_csv('training.csv')

# cleanup_nums = {"topic": {"IRRELEVANT": 0, "ARTS CULTURE ENTERTAINMENT": 1, "BIOGRAPHIES PERSONALITIES PEOPLE": 2, "DEFENCE": 3, "DOMESTIC MARKETS": 4,
#                                   "FOREX MARKETS": 5, "HEALTH": 6, "MONEY MARKETS": 7, "SCIENCE AND TECHNOLOGY": 8, "SHARE LISTINGS": 9, "SPORTS":10 }}


# df.replace(cleanup_nums, inplace=True)

vectorizer = TfidfVectorizer()

#give each word a tf-idf value
train_features = vectorizer.fit_transform(df['article_words'])

print(train_features.shape)

y_train = df['topic']

print(y_train)


(9500, 35822)
0       FOREX MARKETS
1       MONEY MARKETS
2              SPORTS
3       FOREX MARKETS
4          IRRELEVANT
            ...      
9495          DEFENCE
9496       IRRELEVANT
9497    FOREX MARKETS
9498       IRRELEVANT
9499    FOREX MARKETS
Name: topic, Length: 9500, dtype: object


Transforming test set words to features with TF-IDF values

In [2]:
df_test=pd.read_csv('test.csv')

# cleanup_nums = {"topic": {"IRRELEVANT": 0, "ARTS CULTURE ENTERTAINMENT": 1, "BIOGRAPHIES PERSONALITIES PEOPLE": 2, "DEFENCE": 3, "DOMESTIC MARKETS": 4,
#                                   "FOREX MARKETS": 5, "HEALTH": 6, "MONEY MARKETS": 7, "SCIENCE AND TECHNOLOGY": 8, "SHARE LISTINGS": 9, "SPORTS":10 }}


# df_test.replace(cleanup_nums, inplace=True)

#give each word a tf-idf value
test_features = vectorizer.transform(df_test['article_words'])

print(test_features.shape)

y_test = df_test['topic']

print(y_test)

(500, 35822)
0          IRRELEVANT
1          IRRELEVANT
2       FOREX MARKETS
3          IRRELEVANT
4          IRRELEVANT
            ...      
495        IRRELEVANT
496            SPORTS
497     MONEY MARKETS
498    SHARE LISTINGS
499        IRRELEVANT
Name: topic, Length: 500, dtype: object


Run ML algos

Naive Bayes Rule

In [3]:
#naive bayes rule

from sklearn.naive_bayes import MultinomialNB
NBclf = MultinomialNB()

NBclf.fit(train_features, y_train)

#gets the NB topic predicitions of the test data
NBpredictions = NBclf.predict(test_features)

np.mean(NBpredictions == y_test)

#tuning

# from sklearn.model_selection import GridSearchCV
# gs_clf = GridSearchCV(NBclf, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)



0.68

Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

RFclf = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt')


RFclf.fit(train_features, y_train)

# Use the forest's predict method on the test data
RFpredictions = RFclf.predict(test_features)

np.mean(RFpredictions == y_test)


['IRRELEVANT' 'IRRELEVANT' 'MONEY MARKETS' 'IRRELEVANT' 'IRRELEVANT'
 'MONEY MARKETS' 'IRRELEVANT' 'SPORTS' 'MONEY MARKETS' 'IRRELEVANT'
 'IRRELEVANT' 'IRRELEVANT' 'SPORTS' 'SPORTS' 'IRRELEVANT' 'MONEY MARKETS'
 'IRRELEVANT' 'IRRELEVANT' 'IRRELEVANT' 'SPORTS' 'IRRELEVANT' 'IRRELEVANT'
 'IRRELEVANT' 'IRRELEVANT' 'FOREX MARKETS' 'IRRELEVANT' 'IRRELEVANT'
 'IRRELEVANT' 'MONEY MARKETS' 'MONEY MARKETS' 'MONEY MARKETS' 'IRRELEVANT'
 'IRRELEVANT' 'MONEY MARKETS' 'IRRELEVANT' 'SPORTS' 'IRRELEVANT'
 'IRRELEVANT' 'MONEY MARKETS' 'IRRELEVANT' 'SPORTS' 'IRRELEVANT'
 'IRRELEVANT' 'IRRELEVANT' 'IRRELEVANT' 'IRRELEVANT' 'MONEY MARKETS'
 'FOREX MARKETS' 'IRRELEVANT' 'FOREX MARKETS' 'MONEY MARKETS' 'IRRELEVANT'
 'MONEY MARKETS' 'IRRELEVANT' 'MONEY MARKETS' 'IRRELEVANT' 'IRRELEVANT'
 'IRRELEVANT' 'DEFENCE' 'MONEY MARKETS' 'IRRELEVANT' 'IRRELEVANT'
 'IRRELEVANT' 'IRRELEVANT' 'FOREX MARKETS' 'IRRELEVANT' 'IRRELEVANT'
 'SPORTS' 'SPORTS' 'MONEY MARKETS' 'IRRELEVANT' 'IRRELEVANT' 'SPORTS'
 'SPORTS' 'IRRELEVA

0.734

SVM

In [6]:
from sklearn.svm import SVC

SVM_clf = SVC(kernel='linear')
SVM_clf.fit(train_features, y_train)
SVMpredictions = SVM_clf.predict(test_features)

np.mean(SVMpredictions == y_test)


0.768

Classification report

In [5]:
from sklearn.metrics import classification_report

target_names = ["IRRELEVANT", "ARTS CULTURE ENTERTAINMENT", "BIOGRAPHIES PERSONALITIES PEOPLE", "DEFENCE", "DOMESTIC MARKETS", "FOREX MARKETS", "HEALTH", "MONEY MARKETS", "SCIENCE AND TECHNOLOGY", "SHARE LISTINGS", "SPORTS"] 

print(classification_report(y_test, RFpredictions, target_names=target_names))

                                  precision    recall  f1-score   support

                      IRRELEVANT       1.00      0.33      0.50         3
      ARTS CULTURE ENTERTAINMENT       0.00      0.00      0.00        15
BIOGRAPHIES PERSONALITIES PEOPLE       1.00      0.15      0.27        13
                         DEFENCE       0.00      0.00      0.00         2
                DOMESTIC MARKETS       0.47      0.17      0.25        48
                   FOREX MARKETS       0.50      0.07      0.12        14
                          HEALTH       0.76      0.95      0.85       266
                   MONEY MARKETS       0.55      0.70      0.61        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.00      0.00      0.00         7
                          SPORTS       0.93      0.90      0.92        60

                        accuracy                           0.73       500
                       macro avg    

  'precision', 'predicted', average, warn_for)
