In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

from sklearn.metrics import accuracy_score

In [2]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [3]:
nltk.download('stopwords')
english_stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/mark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def remove_stopwords(text):
    output = ' '.join([word for word in text.split(
        ) if word not in english_stop_words])
    return output

In [5]:
import re

REGEX_DICT = [
    [re.compile("\d"), " "], #numbers
]
# REGEX_DICT['special_characters_regex'] = re.compile(r"[\[\]^&?.*\"\',:\/\(\)]")

def regex_normalize(text):
    output = text
    for each_regex in REGEX_DICT:
        compiled_regex = each_regex[0]
        substitution = each_regex[1]
        output = compiled_regex.sub(substitution, output)
        
    return output

In [6]:
class_names = train_df['Category'].unique()
print(class_names)

['business' 'tech' 'politics' 'sport' 'entertainment']


In [7]:
print(f"Train Length: {len(train_df)}")

Train Length: 1490


In [8]:
train_df['Text'][0]



In [9]:
train_df['Text'] = train_df['Text'].apply(remove_stopwords)
train_df['Text'] = train_df['Text'].apply(regex_normalize)
train_df['Text'] = train_df['Text'].apply(lambda x: x.replace("said", ""))

In [10]:
train_df['Text'][0]



In [11]:
print(f"Test Length: {len(test_df)}")

Test Length: 735


In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_df['Category'].to_list())

LabelEncoder()

In [13]:
train_df['Category'] = le.transform(train_df['Category'])
test_df['Category'] = le.transform(test_df['Category'])

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_df['Text'])
X_train_counts.shape

(1490, 23491)

In [15]:
X_train_counts[1]

<1x23491 sparse matrix of type '<class 'numpy.int64'>'
	with 150 stored elements in Compressed Sparse Row format>

In [16]:
count_vect.vocabulary_.get(u'phones')

15480

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [18]:
X_train_tfidf

<1490x23491 sparse matrix of type '<class 'numpy.float64'>'
	with 226264 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB().fit(X_train_tfidf, train_df['Category'])

In [20]:
# docs_new = ['the stocks are going up', 'mobile phones are growing fast']
docs_new = test_df['Text'].to_list()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

mnb_predicted = mnb_clf.predict(X_new_tfidf)

In [21]:
from sklearn.metrics import classification_report
print(accuracy_score(mnb_predicted, test_df['Category'].to_list()))

0.19047619047619047


In [22]:
from sklearn import svm

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# clf = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
SVM.fit(X_train_tfidf,train_df['Category'])
svm_predicted = SVM.predict(X_new_tfidf)
print(accuracy_score(test_df['Category'].to_list(), svm_predicted))

0.19183673469387755


In [23]:
# clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train_tfidf, train_df['Category'])
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100,warm_start=True, oob_score=True).fit(X_train_tfidf, train_df['Category'])

In [24]:
predicted = clf.predict(X_new_tfidf)
print(classification_report(test_df['Category'].to_list(), predicted))

              precision    recall  f1-score   support

           0       0.21      0.24      0.22       147
           1       0.22      0.17      0.19       147
           2       0.15      0.14      0.15       147
           3       0.21      0.24      0.22       147
           4       0.19      0.17      0.18       147

    accuracy                           0.19       735
   macro avg       0.19      0.19      0.19       735
weighted avg       0.19      0.19      0.19       735

