Transforming training set words to features with TF-IDF values

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import csv

# Load the labelled training articles.
df = pd.read_csv('training.csv')

# Integer code assigned to each topic label. The codes are arbitrary
# category ids; their numeric order carries no meaning.
cleanup_nums = {"topic": {"IRRELEVANT": 0, "ARTS CULTURE ENTERTAINMENT": 1, "BIOGRAPHIES PERSONALITIES PEOPLE": 2, "DEFENCE": 3, "DOMESTIC MARKETS": 4,
                                  "FOREX MARKETS": 5, "HEALTH": 6, "MONEY MARKETS": 7, "SCIENCE AND TECHNOLOGY": 8, "SHARE LISTINGS": 9, "SPORTS":10 }}

# Fail early on any label that has no code. A blanket replace would leave such
# labels in the column as strings and produce a mixed-type target.
unknown_topics = set(df['topic'].unique()) - set(cleanup_nums['topic'])
if unknown_topics:
    raise ValueError(
        f"Unmapped topic labels in training.csv: {sorted(unknown_topics, key=str)}"
    )

# Encode only the topic column, building a new frame instead of mutating in
# place so that re-running this cell always starts from the freshly read CSV.
df = df.assign(topic=df['topic'].map(cleanup_nums['topic']))

vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training articles and give each word a tf-idf value.
train_features = vectorizer.fit_transform(df['article_words'])

print(train_features.shape)

# Integer-encoded topic labels used as the training target.
y_train = df['topic']

print(y_train)


(9500, 35822)
0        5
1        7
2       10
3        5
4        0
        ..
9495     3
9496     0
9497     5
9498     0
9499     5
Name: topic, Length: 9500, dtype: int64


Transforming test set words to features with TF-IDF values

In [2]:
# Load the labelled test articles.
df_test = pd.read_csv('test.csv')

# Integer code assigned to each topic label (same codes as the training cell).
cleanup_nums = {"topic": {"IRRELEVANT": 0, "ARTS CULTURE ENTERTAINMENT": 1, "BIOGRAPHIES PERSONALITIES PEOPLE": 2, "DEFENCE": 3, "DOMESTIC MARKETS": 4,
                                  "FOREX MARKETS": 5, "HEALTH": 6, "MONEY MARKETS": 7, "SCIENCE AND TECHNOLOGY": 8, "SHARE LISTINGS": 9, "SPORTS":10 }}

# Fail early on any label that has no code. A blanket replace would leave such
# labels in the column as strings and produce a mixed-type target.
unknown_topics = set(df_test['topic'].unique()) - set(cleanup_nums['topic'])
if unknown_topics:
    raise ValueError(
        f"Unmapped topic labels in test.csv: {sorted(unknown_topics, key=str)}"
    )

# Encode only the topic column, building a new frame instead of mutating in place.
df_test = df_test.assign(topic=df_test['topic'].map(cleanup_nums['topic']))

# Give each word a tf-idf value. This uses transform (not fit_transform) on the
# vectorizer fitted in the training cell, so the test matrix has the same
# feature columns as the training matrix.
test_features = vectorizer.transform(df_test['article_words'])

print(test_features.shape)

# Integer-encoded topic labels used as the ground truth for the test set.
y_test = df_test['topic']

print(y_test)

(500, 35822)
0       0
1       0
2       5
3       0
4       0
       ..
495     0
496    10
497     7
498     9
499     0
Name: topic, Length: 500, dtype: int64


Train classifiers (Naive Bayes and Random Forest) and predict test-set topics

In [9]:
# Train a Naive Bayes and a random forest classifier on the TF-IDF features,
# predict the topic of every test article, and report each model's accuracy.

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the random forest gives the same predictions on every run.
RF_RANDOM_STATE = 42


# naive bayes

NBclf = MultinomialNB()

NBclf.fit(train_features, y_train)

NBpredictions = NBclf.predict(test_features)

# prints the NB topic predictions of the test data
print(NBpredictions)


# random forest

RFclf = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=RF_RANDOM_STATE,
)

RFclf.fit(train_features, y_train)

# Use the forest's predict method on the test data
RFpredictions = RFclf.predict(test_features)

# prints the RF topic predictions of the test data
print(RFpredictions)


# Evaluate both models against the true test labels. The topic codes are
# arbitrary category ids, so an absolute difference between predicted and true
# code is not meaningful; report the fraction of exactly matching labels.
true_labels = y_test.to_numpy()
for model_name, model_predictions in (
    ('Naive Bayes', NBpredictions),
    ('Random forest', RFpredictions),
):
    accuracy = (model_predictions == true_labels).mean()
    print(f'{model_name} accuracy: {accuracy:.3f}')


[ 0  0  0  0  0  0  0 10  0  0  0  0 10 10  0  7  0  0  0 10  0  0  0  0
  7  0  0  7  7  7  0  0  0  7  0 10  0  0  0  0  0  0  0  0  0  0  0  0
  0  7  7  0  0  0  7  0  0  0  0  7  0  0  0  0  7  0  0 10 10  7  0  0
 10 10  0  0  0  0  0  0  0  0  0  0  0  7  0  7  0  0  0  0  0  0  0 10
 10  0  0  0  0  0  0  0  0  0  0 10  0 10  0  0  0  0  0  0  0  7  0 10
  0  0  0  0  0  0  0  0  0 10  0  0  0  7  0  0  0  7  0  0  7  0  0  0
  0  0  0  0  0  7  0  0 10  0  0  0  0  0  0  0  0 10 10  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  7  0  0  0  7  0  0  0  0  0  0  0  0  0  0
  0 10 10  0  0  0  0  0  0  7  0  0  0  0  7  7  0  0  0  0  0  0  0  0
  0  7  0  0  0  0  0  0  7  0  0  0  0  0  0  0  7  0  0  0  7 10  0  7
  0  0  0  0  0  0  0  7  0  0  0 10  0 10  7  0  0  0  0 10  7  0 10  0
  7  7  0  0  7  0  0  7  0 10  0  7  0  0  0  0  0 10  0  0  0  0 10  0
  0  0 10  0  0  0  0  0  0  7  0 10  0  0  0  0  0  0  0  0  0  0  0  0
 10  0  0  0  0 10  0  7 10  7  0  7  0  7  0  7  0