In [21]:
import nltk
from sklearn.datasets import load_files
from nltk.corpus import stopwords
import os
import nltk
import os
import itertools
from nltk import word_tokenize
import string
import pandas as pd
default_stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [22]:
all_data = pd.read_csv('Huff_news.csv', parse_dates = True, index_col = 0)

all_data.dropna(inplace = True)

all_data["text_all"] = all_data["headline"] +" "+ all_data["short_description"]

text_all = all_data["text_all"]

In [25]:
def clean_text(doc,
            rm_punctuation=True,
            rm_digits=True,
            lemmatize=False,
            norm_case=True,
            stem=False,
            rm_stopwords=True):
            """
            Creates a cleaned list of words with the given options
            Python 3 friendly

            in:
                a single text string (eg representing an article)
            out:
                same text but cleaned according to given options

            """
            # Doc overall operations
            if(rm_digits==True):
                table = str.maketrans({key: None for key in string.digits})
                doc = str(doc).translate(table)
            if(norm_case==True):
                doc = doc.lower()
            if(rm_punctuation==True):
                table = str.maketrans({key: None for key in string.punctuation})
                doc = doc.translate(table)
            if(rm_stopwords==True):
                words = " ".join([i for i in doc.split() if i not in default_stopwords])
            else:
                words = " ".join([i for i in doc.split()])
            if(lemmatize==True):
                lemma = WordNetLemmatizer()
                words = " ".join(lemma.lemmatize(word) for word in words.split())
            if(stem==True):
                words = " ".join(porter_stemmer.stem(word) for word in words.split())
            return words

In [26]:
documents = [clean_text(x,stem=False,lemmatize=True) for x in all_data.text_all]

In [27]:
X, y =documents, all_data.category

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=3800, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

In [29]:
#tf idf
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [32]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [33]:
#Logit model
logistic = LogisticRegression()
logistic.fit(X_train, y_train) 

# test - prediction

logistic_prediction = logistic.predict(X_test)


#accuracy score
print(accuracy_score(logistic_prediction, y_test))


#confusion matrix
logit_confusionmatrix=confusion_matrix(logistic_prediction, y_test)



#classification report
print(classification_report(logistic_prediction, y_test))

0.4847699287103046
                precision    recall  f1-score   support

          ARTS       0.49      0.67      0.57        84
  BLACK VOICES       0.10      0.45      0.16        11
      BUSINESS       0.61      0.52      0.56       250
       COLLEGE       0.23      0.53      0.32        30
        COMEDY       0.00      0.00      0.00         0
         CRIME       0.00      0.00      0.00         0
     EDUCATION       0.49      0.66      0.56        53
 ENTERTAINMENT       0.55      0.57      0.56       137
         FIFTY       0.20      0.44      0.28        63
     GOOD NEWS       0.00      0.00      0.00         0
         GREEN       0.54      0.75      0.63        87
HEALTHY LIVING       0.73      0.37      0.49       635
        IMPACT       0.28      0.33      0.31       165
 LATINO VOICES       0.05      1.00      0.10         1
         MEDIA       0.06      1.00      0.11         2
       PARENTS       0.61      0.48      0.54       236
      POLITICS       0.77   

In [34]:
from sklearn.metrics import precision_recall_fscore_support as score
p,r,f1,s=score(y_test,logistic_prediction )
a=pd.Series(f1).sort_values(ascending=False).head(3).index.to_list()
all_labels=y.unique()
all_labels.sort()
list(all_labels[a])
# Top 3 categories predicted from logit model with respect to the f1 score

['TASTE', 'GREEN', 'TRAVEL']

In [36]:
 # Naive Bayes Classifier

# Naive Bayes 


nb =  GaussianNB()
nb.fit(X_train, y_train) 

nb_prediction = nb.predict(X_test)



print(accuracy_score(nb_prediction, y_test))


nb_confusionmatrix=confusion_matrix(nb_prediction, y_test)

#classification report
print(classification_report(nb_prediction, y_test))


0.24951393389500973
                precision    recall  f1-score   support

          ARTS       0.22      0.25      0.23       100
  BLACK VOICES       0.00      0.00      0.00         5
      BUSINESS       0.28      0.23      0.25       253
       COLLEGE       0.06      0.15      0.08        27
        COMEDY       0.07      0.18      0.10        17
         CRIME       0.00      0.00      0.00         1
     EDUCATION       0.04      0.12      0.06        25
 ENTERTAINMENT       0.24      0.26      0.25       133
         FIFTY       0.15      0.17      0.16       127
     GOOD NEWS       0.00      0.00      0.00         1
         GREEN       0.13      0.25      0.17        65
HEALTHY LIVING       0.32      0.27      0.29       386
        IMPACT       0.22      0.16      0.18       264
 LATINO VOICES       0.00      0.00      0.00         0
         MEDIA       0.03      0.11      0.04         9
       PARENTS       0.18      0.21      0.20       162
      POLITICS       0.61  

In [37]:
p_nb,r_nb,f1_nb,s_nb=score(y_test,nb_prediction )
b=pd.Series(f1_nb).sort_values(ascending=False).head(3).index.to_list()
all_labels=y.unique()
all_labels.sort()
list(all_labels[b])
# Top 3 predictions from NB classifier

['TASTE', 'POLITICS', 'HEALTHY LIVING']

In [38]:
from sklearn.svm import SVC

SVC_model = SVC()

SVC_model.fit(X_train, y_train)

SVC_prediction = SVC_model.predict(X_test)


print(accuracy_score(SVC_prediction, y_test))


svc_confusionmatix=confusion_matrix(SVC_prediction, y_test)


print(classification_report(SVC_prediction, y_test))

0.4594944912508101
                precision    recall  f1-score   support

          ARTS       0.44      0.72      0.55        69
  BLACK VOICES       0.12      0.75      0.20         8
      BUSINESS       0.54      0.55      0.55       209
       COLLEGE       0.26      0.55      0.35        33
        COMEDY       0.02      1.00      0.04         1
         CRIME       0.00      0.00      0.00         0
     EDUCATION       0.51      0.66      0.58        56
 ENTERTAINMENT       0.51      0.56      0.53       129
         FIFTY       0.15      0.51      0.24        41
     GOOD NEWS       0.00      0.00      0.00         0
         GREEN       0.46      0.80      0.58        69
HEALTHY LIVING       0.79      0.32      0.45       809
        IMPACT       0.26      0.41      0.32       122
 LATINO VOICES       0.00      0.00      0.00         0
         MEDIA       0.03      1.00      0.05         1
       PARENTS       0.58      0.52      0.55       209
      POLITICS       0.81   

In [None]:
p_svc,r_svc,f1_svc,s_svc=score(y_test,SVC_prediction )
c=pd.Series(f1_svc).sort_values(ascending=False).head(3).index.to_list()
all_labels=y.unique()
all_labels.sort()
list(all_labels[c])
# Top 3 predictions from SVC classifier