In [0]:
  # Install the google-drive-ocamlfuse FUSE client from the alessandro-strada PPA.
  # With "2>&1 > /dev/null" only stdout is discarded; stderr still reaches the cell output.
  !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
  !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
  !apt-get update -qq 2>&1 > /dev/null
  !apt-get -y install -qq google-drive-ocamlfuse fuse
  # Authenticate this Colab session and fetch the application-default credentials.
  from google.colab import auth
  auth.authenticate_user()
  from oauth2client.client import GoogleCredentials
  creds = GoogleCredentials.get_application_default()
  import getpass
  # First pass: run the client headless with no stdin and keep only the output line containing "URL".
  !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
  # Read the verification code without echoing it in the notebook.
  vcode = getpass.getpass()
  # Second pass: feed the verification code to the client on stdin.
  # NOTE(review): {vcode} and the client secret are interpolated unquoted into a shell command
  # line; shell metacharacters in the pasted code would be interpreted by the shell.
  !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# Create the mount point (-p: no error if it already exists) and mount Google Drive on it.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
# Make the mounted Drive folder the notebook's working directory.
%cd drive


In [0]:
# Enter the project folder; later cells open the dataset and stopwords.txt by relative path.
%cd News categorization

In [0]:
# Plotting setup and dataset load
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
import warnings
# NOTE(review): this silences every warning for the rest of the session, including deprecation notices.
warnings.filterwarnings('ignore')
import pandas as pd
# lines=True: the file is parsed as one JSON object per line.
data=pd.read_json("News_Category_Dataset.json",lines=True)
data.head(2)

In [0]:
# Work on a fixed-seed random subset of 10,000 rows, renumbered from 0.
data = data.sample(10000, random_state=42).reset_index(drop=True)
data.head(2)

In [0]:
# Share of each category in percent. The denominator is the actual row count rather than a
# hard-coded 10000, so the figures stay correct if the sample size above is changed.
data.groupby(by='category').size() / len(data) * 100

In [0]:
# Bar chart of the number of rows per category. The axes created here are passed to seaborn
# explicitly so the plot does not depend on matplotlib's implicit "current axes".
fig, ax = plt.subplots(1, 1, figsize=(35,7))
sns.countplot(x = 'category', data = data, ax = ax)

In [0]:
# Pie chart of the category distribution, drawn on the axes created here (passed explicitly).
fig, ax = plt.subplots(1, 1, figsize=(15,15))
data['category'].value_counts().plot.pie(autopct = '%1.1f%%', ax = ax)

In [0]:
# Merge short_description and headline into one text feature, `imp`.
# A space separates the two parts; without it the last word of the description and the first
# word of the headline are fused into a single, meaningless token whenever the description
# does not end in punctuation.

data['imp'] = data['short_description'].astype(str) + ' ' + data['headline']

In [0]:
# Imports and pandas display options used by the text-processing cells

import pandas as pd
import numpy as np
# Show up to 200 columns when a frame is displayed, and turn off the chained-assignment warning.
pd.options.display.max_columns = 200
pd.options.mode.chained_assignment = None

from nltk.tokenize import word_tokenize, sent_tokenize
# NLTK's built-in English stop-word list is left disabled; a custom list is read from
# stopwords.txt in a later cell instead.
#from nltk.corpus import stopwords
#stop = set(stopwords.words('english'))
from string import punctuation

from collections import Counter
import re
import numpy as np

from tqdm import tqdm_notebook
# Calling .pandas() registers the progress_* methods (progress_map is used when tokenizing).
tqdm_notebook().pandas()

In [0]:
# (rows, columns) of the sampled dataset, before duplicates and nulls are removed

data.shape

In [0]:
# Keep one row per distinct `imp` text and discard rows whose `imp` is missing

data = data.drop_duplicates('imp').dropna(subset=['imp'])
print(data.shape)

In [0]:
# Custom stop-word list: one word per line in stopwords.txt (a local file), used instead of
# NLTK's predefined English list, which is much shorter (roughly 150 words).
# The file is opened in a `with` block so the handle is always closed. Each line is stripped
# of surrounding whitespace and blank lines are skipped, so no empty-string or
# trailing-space entries end up in the list.

with open('stopwords.txt', 'r') as f:
    stop_words = [line.strip() for line in f if line.strip()]
additional_stop_words = ['t', 'will']
stop_words += additional_stop_words

print(len(stop_words))

In [0]:
# Show the stop words and punctuation characters that are treated as noise in the corpus

print(list(stop_words))
print(list(punctuation))

In [0]:
# Clean the text of the imp column: lower-case it, expand common contractions, and strip non-ASCII and non-letter characters

from functools import reduce
def _removeNonAscii(s): 
    return "".join(i for i in s if ord(i)<128)


def clean_text(text):
    """Normalise a raw text into lower-case ASCII letters separated by single spaces.

    Steps, in order: lower-case; expand common English contractions ("what's",
    "'ve", "can't", "n't", "i'm", "'re", "'d", "'ll", and any remaining "'s" as
    " is "); drop the "(ap)" marker; turn every run of non-word characters into a
    space; delete everything that is not an ASCII letter or a space; collapse
    whitespace and trim.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Cleaned text containing only the characters a-z and single spaces
        (possibly empty).
    """
    text = text.lower()
    # Typographic apostrophes would otherwise bypass every contraction rule below
    # (e.g. "don\u2019t" would end up as the two tokens "don" and "t").
    text = text.replace("\u2019", "'")
    text = re.sub(r"what's", "what is ", text)
    text = text.replace('(ap)', '')
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" must be handled before the generic "n't" rule, which would yield "ca not".
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Every run of non-word characters (punctuation, quotes, backslashes, ...) becomes a space.
    text = re.sub(r'\W+', ' ', text)
    # Drop what \W leaves behind that is not an ASCII letter: digits, underscores and
    # non-ASCII letters. After this step the text is pure ASCII, so no separate
    # non-ASCII filter is needed.
    text = re.sub('[^a-zA-Z ]+', '', text)
    # Collapse whitespace last: removing a digit-only word leaves two adjacent spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [0]:
# Split each cleaned text into word tokens and drop stop words

def tokenizer(text):
    """Clean `text` and return its word tokens with stop words removed.

    The text is normalised with `clean_text`, split into sentences and then into
    words with NLTK, and every token found in the module-level `stop_words`
    list (or in `string.punctuation`) is dropped.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Remaining tokens, in their original order.
    """
    text = clean_text(text)
    # Build the exclusion set once per call: set membership is O(1), whereas the
    # original rebuilt and scanned a list for every token. Punctuation is included
    # because the original's (discarded) final filter shows it was meant to be
    # dropped; clean_text already strips it, so this is a safeguard only.
    excluded = set(stop_words) | set(punctuation)
    return [
        token
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
        if token not in excluded
    ]


In [0]:
import nltk
# Tokenizer models needed by sent_tokenize / word_tokenize.
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from the separate 'punkt_tab' package and
# raise LookupError if only 'punkt' is present, so fetch both.
nltk.download('punkt_tab')

In [0]:
# Tokenize every `imp` text (cleaned, stop words removed), showing a progress bar

data['tokens'] = data['imp'].progress_map(tokenizer)

In [0]:
# Show the first five rows: the raw `imp` text, its tokens, and its category

sample = data.head(5)
for description, tokens, category in zip(sample['imp'], sample['tokens'], sample['category']):
    print('\nimportant feature: ', description)
    print('\ntokens: ', tokens)
    print('\ncategory: ',category)
    

In [0]:
# TF-IDF vectorization of the tokenized texts: unigrams and bigrams that occur in at least
# 5 documents

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    min_df=5,
    analyzer='word',
    ngram_range=(1,2),
    stop_words=stop_words,
)
# The vectorizer expects strings, so each token list is joined back into one document.
documents = data['tokens'].map(' '.join).tolist()
vx = vectorizer.fit_transform(documents)


In [0]:
# Shape of the TF-IDF sparse matrix: (documents, vocabulary terms)

vx.shape

In [0]:
# Convert the sparse TF-IDF matrix into a dense array.
# NOTE(review): the dense copy stores every zero explicitly, so memory grows with
# documents x vocabulary size.

dense = vx.toarray()


In [0]:
# Shape of the dense array produced from the TF-IDF matrix
dense.shape

In [0]:
# Handle class imbalance (politics and entertainment alone make up almost 38% of the data)
# by oversampling the minority classes with SMOTE.
# NOTE(review): oversampling is done before the train/test split further down, so synthetic
# points interpolated from rows that end up in the test set can land in the training set and
# inflate the reported accuracy. Consider splitting first and resampling only the training part.
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
# fit_resample is the current API name; fit_sample was deprecated and later removed from
# imbalanced-learn. The labels are passed as a plain array so `y` also comes back as an array,
# which the later cells rely on (column 0 of DataFrame(y), positional indexing of cat_te).
x,y=smote.fit_resample(dense,data['category'].values)

In [0]:
# Shape of the oversampled feature matrix. In the author's run, oversampling increased the
# number of rows from 9996 to 82398.
x.shape

In [0]:
# Class shares (in percent) after oversampling; in the author's run each class held about
# 3.22% of the rows.
# np.asarray guarantees the single column is labelled 0 even when `y` is a named pandas
# Series, and len(y) replaces division by the one-element shape tuple.
d=pd.DataFrame(np.asarray(y))
d.groupby(by=0).size() / len(y) * 100

In [0]:
# Pie chart of the class distribution after oversampling, drawn on the axes created here.
fig, ax = plt.subplots(1, 1, figsize=(15,15))
d[0].value_counts().plot.pie(autopct = '%1.1f%%', ax = ax)

In [0]:
# Split the oversampled data into training (80%) and test (20%) sets.
# random_state pins the shuffle so the accuracies reported below are reproducible across
# runs; 42 matches the seed used for sampling and for SMOTE.

from sklearn.model_selection import train_test_split
des_tr, des_te, cat_tr, cat_te = train_test_split(x,y,test_size=0.2,random_state=42)

In [0]:
# Fit a linear support vector classifier (default settings) on the training split

from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(des_tr,cat_tr)

In [0]:
# Predict on the test split and report accuracy as a percentage.
# NOTE(review): the test split was drawn from the SMOTE-oversampled data, so it contains
# synthetic rows; this score is not an estimate on untouched real data.

y_pred = model.predict(des_te)
acc = accuracy_score(cat_te, y_pred)
print("Accuracy {:.2f}".format(acc*100))

In [0]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
# Per-class precision, recall and F1 for the current y_pred against the test labels.
print(classification_report(cat_te, y_pred))

In [0]:
# Print 50 test samples (positions 5000-5049) with actual and predicted category; change the
# range to inspect other samples.
# Author's observation: wrong predictions tend to be categories similar to the actual one.
# The labels are read positionally from the array `p`, so the lookup lines up with y_pred
# even if cat_te is a pandas Series with a shuffled index. The loop variable is `i` rather
# than `x` so the feature matrix `x` is not overwritten, and the upper bound is clamped to
# the number of test samples.

p=np.asarray(cat_te)
for i in range(5000, min(5050, len(p))):
    print("actual:" ,p[i] , "           predicted:" ,y_pred[i])
   

In [0]:
# Train a logistic regression classifier (default settings) on the same training split;
# `model` is rebound, replacing the previous classifier.

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(des_tr,cat_tr)

In [0]:
# Test-split accuracy (in percent) of whichever classifier `model` currently refers to
y_pred = model.predict(des_te)
acc = accuracy_score(cat_te, y_pred)
print("Accuracy {:.2f}".format(acc*100))

In [0]:
# Train a multinomial naive Bayes classifier (default settings) on the same training split;
# `model` is rebound again.

from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(des_tr,cat_tr)

In [0]:
# Test-split accuracy (in percent) of whichever classifier `model` currently refers to
y_pred = model.predict(des_te)
acc = accuracy_score(cat_te, y_pred)
print("Accuracy {:.2f}".format(acc*100))