In [None]:
#import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#sklearn package 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn. preprocessing import LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB,MultinomialNB,GaussianNB 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import GradientBoostingClassifier

#model evaluation
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix,recall_score,precision_score,f1_score


In [None]:
# Load the news-category dataset. The file is JSON Lines (one record per
# line), hence `lines=True`; the original note says this avoids the
# "trailing data" parse error.
data = pd.read_json(
    '../input/news-category-dataset/News_Category_Dataset_v2.json',
    lines=True,
)
# Keep the column index in `column` and show it as the cell output.
column = data.columns
column

In [None]:
# Remove the `link` and `date` columns in place, then preview the first rows.
data.drop(columns=['link', 'date'], inplace=True)
data.head(4)

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory footprint of `data`.
data.info()

In [None]:
# Structural overview: dimensions, per-column missing counts, column names.
column = data.columns
print("The shape of the dataset-------->", data.shape)
print("The number of null values ------>")
print(data.isna().sum())
print("The column present there-------->", column)

# Category walkthrough

In [None]:
# Number of articles per category (value_counts sorts by descending frequency).
category = data['category'].value_counts()
print("The total number category present here------------->", data['category'].nunique())
print(category)

In [None]:
# Vertical bar chart: one bar per category, bar height = number of articles.
plt.figure(figsize=(25, 8))
ax = sns.barplot(x=category.index, y=category.values)
ax.set_title("The distribution of categories")
ax.set_xlabel("Category")
ax.set_ylabel("The number of samples")

# Rotate and enlarge the category tick labels.
plt.setp(ax.get_xticklabels(), rotation=60, fontsize=14)
plt.show()

In [None]:
# Pie chart of each category's share of the dataset.
plt.figure(figsize=(20,20))
plt.pie(category.values, autopct="%1.1f%%", labels=category.index)
# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() acts on a fresh, empty current figure and writes a blank image.
plt.savefig(r"./category_pie.png")
plt.show()

In [None]:
# Distinct category labels, in value_counts order (most frequent first).
categories = data['category'].value_counts().index

def groupper(grouplist,name):
    """Relabel every row whose category is in ``grouplist`` as ``name``.

    Operates in place on the module-level ``data`` DataFrame.

    Parameters
    ----------
    grouplist : list of str
        Category labels to fold together.
    name : str
        Label written to the ``category`` column of all matching rows.

    Returns
    -------
    None
    """
    # One vectorized membership mask replaces a Python loop that scanned the
    # whole column once per known category, and removes the dependency on a
    # separately computed (and possibly stale) `categories` snapshot.
    data.loc[data['category'].isin(grouplist), 'category'] = name

In [None]:
# Fold the fine-grained categories into broader groups.
# Mapping: new group label -> original labels merged into it. Dicts keep
# insertion order, so the merges run in the order listed here.
category_groups = {
    'SPORTS AND ENTERTAINMENT': ['SPORTS', 'ENTERTAINMENT', 'COMEDY', 'WEIRD NEWS', 'ARTS'],
    'TRAVEL-TOURISM & ART-CULTURE': ['TRAVEL', 'ARTS & CULTURE', 'CULTURE & ARTS', 'FOOD & DRINK', 'TASTE'],
    'EMPOWERED VOICES': ['WOMEN', 'QUEER VOICES', 'LATINO VOICES', 'BLACK VOICES'],
    'BUSINESS-MONEY': ['BUSINESS', 'MONEY'],
    'WORLDNEWS': ['THE WORLDPOST', 'WORLDPOST', 'WORLD NEWS'],
    'ENVIRONMENT': ['ENVIRONMENT', 'GREEN'],
    'SCIENCE AND TECH': ['TECH', 'SCIENCE'],
    'GENERAL': ['FIFTY', 'IMPACT', 'GOOD NEWS', 'CRIME'],
    'MISC': ['WEDDINGS', 'DIVORCE', 'RELIGION', 'MEDIA'],
}

for group_name, members in category_groups.items():
    groupper(grouplist=members, name=group_name)

In [None]:
# Recompute the per-category counts now that the labels have been merged.
category = data['category'].value_counts()
n_categories = data['category'].nunique()
print("We have a total of {} categories now".format(n_categories))

In [None]:
# Pie chart of the regrouped categories, each wedge labelled with its share.
plt.figure(figsize=(15, 15))
plt.pie(
    category.values,
    labels=category.index,
    autopct="%1.1f%%",
)
plt.show()

In [None]:
# Horizontal bar chart of the regrouped categories:
# categories on the y axis, article counts on the x axis.
plt.figure(figsize=(25,13))
sns.barplot(y=category.index,x=category.values)
plt.title("The distribution of categories")
# The bars are horizontal, so the count belongs on x and the category on y
# (the two labels were previously swapped).
plt.xlabel("The number of samples")
plt.ylabel("Category")

plt.yticks(rotation=0,fontsize = 16)
# Save BEFORE plt.show(); saving afterwards writes a blank figure.
plt.savefig(r"./category_bar.png")
plt.show()

There is an unequal number of samples in each category, so we can drop some categories to make the dataset more balanced.

# Handling duplicate and null values

In [None]:
# Count fully duplicated rows (identical in every column).
data.duplicated().sum()  # number of rows flagged as a repeat of an earlier row

In [None]:
# Drop fully duplicated rows in place, keeping the last occurrence of each.
data.drop_duplicates(keep='last',inplace=True)

In [None]:
# Duplicates may differ only in the author field, so also count rows that
# repeat the same (headline, short_description) pair.
data.duplicated(subset=['headline', 'short_description']).sum()

In [None]:
# Drop rows repeating the same (headline, short_description) pair, keeping the last one.
data.drop_duplicates(subset=['headline', 'short_description'],inplace=True,keep='last')

In [None]:
# Report the DataFrame shape (rows, columns) after de-duplication.
print("THe length of the datset after dublicate deletion------>",data.shape)

# Handling null values

In [None]:
# Per-column count of missing (NaN/None) values.
data.isnull().sum()

In [None]:
# Per the check above there are no nulls; missing text is stored as empty
# strings instead, so show the rows whose headline is blank.
data[data['headline'] == '']

In [None]:
# Remove rows whose headline is an empty string.
headline_blank = data['headline'].eq('')
data = data.loc[~headline_blank]
print("THe length of the datset ------>", data.shape)

In [None]:
# Remove rows whose short description is an empty string, reporting how many
# rows are affected and the resulting shape.
description_blank = data['short_description'].eq('')
print("the lenth of the blank description samples----->", int(description_blank.sum()))
data = data.loc[~description_blank]
print("THe length of the datset ---------------------->", data.shape)

In [None]:
# Remove rows whose author field is an empty string, reporting how many rows
# are affected and the resulting shape.
author_blank = data['authors'].eq('')
print("the lenth of the blank auhtor samples---------->", int(author_blank.sum()))
data = data.loc[~author_blank]
print("THe length of the datset ---------------------->", data.shape)

In [None]:
# Preview the first five rows after the blank-value filtering.
data.head(5)

In [None]:
# Author distribution: number of articles per author string.
auhtor_count = data['authors'].value_counts()
# value_counts() sorts by descending frequency, so these are the 25 most
# prolific authors.
top_authors = auhtor_count.head(25)

plt.figure(figsize=(25,18))
sns.barplot(y=top_authors.index,x=top_authors.values)
plt.title("The distribution of authors")
# The bars are horizontal, so the count belongs on x and the author on y
# (the two labels were previously swapped).
plt.xlabel("The number of samples")
plt.ylabel("Author Name")

plt.yticks(rotation=0,fontsize = 18)
# Save BEFORE plt.show(); saving afterwards writes a blank figure.
plt.savefig(r"./author_bar.png")
plt.show()

# Balance the category data

In [None]:
# Per-category article counts after cleaning; displayed as the cell output.
category = data['category'].value_counts()
category


We can drop the Style, Education, College and Environment categories: they have very few samples, which may lead to lower accuracy and F1 score.

In [None]:
# Boolean mask of the rows belonging to the categories being discarded,
# then keep everything else.
cateo_drop = data['category'].isin(['ENVIRONMENT', 'STYLE', 'EDUCATION', 'COLLEGE'])
data = data.loc[~cateo_drop]

In [None]:
# Per-category counts after removing the small categories.
data['category'].value_counts()

In [None]:
# Cap every category at 3000 rows. `groupby(...).head(n)` keeps the first n
# rows of each group in their current order (it does not sample randomly).
data = data.copy().groupby('category').head(3000)
data.shape

In [None]:
# Horizontal bar chart of the per-category counts after capping:
# categories on the y axis, article counts on the x axis.
category = data['category'].value_counts()
plt.figure(figsize=(25,13))
sns.barplot(y=category.index,x=category.values)
plt.title("The distribution of categories")
# The bars are horizontal, so the count belongs on x and the category on y
# (the two labels were previously swapped).
plt.xlabel("The number of samples")
plt.ylabel("Category")

plt.yticks(rotation=0,fontsize = 16)
# Save BEFORE plt.show(); saving afterwards writes a blank figure.
# NOTE(review): this reuses the path written by the earlier category bar
# chart, so that file is overwritten — confirm this is intended.
plt.savefig(r"./category_bar.png")
plt.show()

# Column combining

In [None]:
# Build the model input: headline and short description joined by a single
# '-' (no surrounding spaces).
data['text'] = data['headline']+'-'+data['short_description']

In [None]:
# Preview the first rows, including the new `text` column.
data.head(4)

In [None]:
# The combined `text` column replaces its source columns; remove them (and
# the author field) in place.
data.drop(columns=['authors', 'headline', 'short_description'], inplace=True)

In [None]:
# Report the shape (rows, columns) after dropping the source columns, then preview.
print("The lenth of the datset-------------------->",data.shape)
data.head(4)

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows so categories are interleaved. The seed is fixed: an
# unseeded shuffle changes the row order on every run, which makes the seeded
# train/test split further down select different rows each time.
data = shuffle(data, random_state=42).reset_index(drop=True)
data.head(4)

# Text cleaning

In [None]:
# Disabled text-cleaning experiment: the whole cell is one bare triple-quoted
# string, so none of it executes (the notebook only echoes the string).
"""corpus=[]
for i in range(100000):
    text = data.iloc[i,1]
    
    text = text.lower()
    text = re.sub('[^a-z0-9]',' ',text)
    text = text.split()
    
    s = PorterStemmer()
    text = [s.stem(word) for word in text if not word in set(stopwords.words('english')) ]
    text = ' '.join(text)
    corpus.append(text)
    
    if i%1000==0:
        print(i,end='->')"""


In [None]:
# Disabled: loading a precomputed corpus. The cell is a bare string literal,
# so nothing here is executed.
"""corpus = pd.read_csv('../input/corpus/corpus.csv')
corpus"""


# Tokenization and TF-IDF vectorization

In [None]:
# TF-IDF features over unigrams and bigrams, limited to the 5000 top terms;
# terms in more than 99% or fewer than 0.1% of documents are ignored.
vecto = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=0.001,
    ngram_range=(1, 2),
    lowercase=True,
    max_features=5000,
)
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split further down — confirm this is acceptable.
# `.toarray()` turns the sparse result into a dense array.
X = vecto.fit_transform(data['text']).toarray()
X.shape

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on old releases.
# `.tolist()` converts the returned ndarray to a plain list of strings so the
# printed output keeps the original list form.
if hasattr(vecto, 'get_feature_names_out'):
    feature_names = vecto.get_feature_names_out().tolist()
else:
    feature_names = vecto.get_feature_names()
print(feature_names)

In [None]:
# Wrap the dense TF-IDF matrix in a DataFrame with one column per term.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on old releases.
tfidf_columns = (
    vecto.get_feature_names_out()
    if hasattr(vecto, 'get_feature_names_out')
    else vecto.get_feature_names()
)
tfidf_df = pd.DataFrame(X, columns=tfidf_columns)
tfidf_df.head(4)

In [None]:
# Encode the category strings as integer class ids; `label` keeps the fitted
# encoder for later use.
label = LabelEncoder().fit(data['category'])
y = label.transform(data['category'])

In [None]:
# Hold out 10% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=2,
)

In [None]:
# Report the shapes of the four split arrays.
for caption, split_array in (
    ("The X_train shape----->", X_train),
    ('The X_text shape------>', X_test),
    ("THe y_train shape----->", y_train),
    ("The y_test shape------>", y_test),
):
    print(caption, split_array.shape)

# Model training

In [None]:

def classifier_scores(y_train,y_test, pred_train, pred_test):
    """Print accuracy and macro-averaged recall, precision and F1 for both splits.

    Parameters
    ----------
    y_train, y_test
        True labels of the training and test sets.
    pred_train, pred_test
        Labels predicted for the training and test sets.

    Returns
    -------
    None
        The scores are printed; nothing is returned.
    """
    # (true labels, predicted labels) for the train split, then the test split.
    splits = ((y_train, pred_train), (y_test, pred_test))

    print()
    accuracy_captions = ("Train data accuracy score: ", "Test data accuracy score: ")
    for caption, (truth, guess) in zip(accuracy_captions, splits):
        print(caption, accuracy_score(truth, guess))

    # Each remaining metric is macro-averaged and reported train first, then
    # test, with a blank line before each metric.
    macro_reports = (
        (recall_score, "Recall score on train data: ", "Recall score on test data: "),
        (precision_score, "Precision score on train data: ", "Precision score on test data: "),
        (f1_score, "F1 score on train data: ", "F1 score on test data: "),
    )
    for metric, *captions in macro_reports:
        print()
        for caption, (truth, guess) in zip(captions, splits):
            print(caption, metric(truth, guess, average='macro'))

## Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF features.
print("Multinamial NB----------------------------------->")
multinb = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator

y_train_pred, y_test_pred = multinb.predict(X_train), multinb.predict(X_test)
classifier_scores(
    y_train=y_train,
    y_test=y_test,
    pred_train=y_train_pred,
    pred_test=y_test_pred,
)

## Complement Naive Bayes

In [None]:
# Complement Naive Bayes with smoothing alpha=1.0.
print("Compiment NB----------------------------------->")
compnb = ComplementNB(alpha=1.0).fit(X_train, y_train)  # fit() returns the estimator

y_train_pred, y_test_pred = compnb.predict(X_train), compnb.predict(X_test)
classifier_scores(
    y_train=y_train,
    y_test=y_test,
    pred_train=y_train_pred,
    pred_test=y_test_pred,
)

## Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes on the dense TF-IDF matrix.
gaussion_NB = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator

y_train_pred, y_test_pred = gaussion_NB.predict(X_train), gaussion_NB.predict(X_test)
classifier_scores(
    y_train=y_train,
    y_test=y_test,
    pred_train=y_train_pred,
    pred_test=y_test_pred,
)

## Logistic Regression

In [None]:
# Baseline logistic regression with scikit-learn's default settings.
log_reg = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator

y_train_pred, y_test_pred = log_reg.predict(X_train), log_reg.predict(X_test)
classifier_scores(
    y_train=y_train,
    y_test=y_test,
    pred_train=y_train_pred,
    pred_test=y_test_pred,
)

In [None]:
# Logistic regression with the liblinear solver and an L2 penalty.
# `n_jobs` was dropped: scikit-learn ignores it for solver='liblinear' and
# only emits a warning, so the fitted model is unchanged.
log_reg_hyper = LogisticRegression(solver='liblinear', penalty='l2')
log_reg_hyper.fit(X_train , y_train)

y_train_pred = log_reg_hyper.predict(X_train)
y_test_pred = log_reg_hyper.predict(X_test)
classifier_scores(y_train,y_test,y_train_pred,y_test_pred)

## Gradient Boosting Classifier

In [None]:
# Gradient boosting: sweep the learning rate with small, shallow ensembles
# (20 trees, depth 2, 2 candidate features per split, fixed seed).
lr_list = [0.05, 0.075, 0.1, 0.25]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    ).fit(X_train, y_train)  # fit() returns the estimator

    print("Learning rate: ", learning_rate)

    y_train_pred, y_test_pred = gb_clf.predict(X_train), gb_clf.predict(X_test)
    classifier_scores(y_train, y_test, y_train_pred, y_test_pred)
    print('-' * 56)

### Model training with SVD

In [None]:
# Candidate numbers of TruncatedSVD components to evaluate.
n_com = [500,700,1000,1500]
def models_prepare(components=None):
    """Build one SVD + logistic-regression pipeline per component count.

    Parameters
    ----------
    components : iterable of int, optional
        Numbers of TruncatedSVD components to build pipelines for. When
        omitted, the module-level ``n_com`` list is used, so existing
        zero-argument calls behave as before.

    Returns
    -------
    dict
        Maps ``str(n)`` to an unfitted ``Pipeline`` with the steps ``'svd'``
        (``TruncatedSVD(n_components=n)``) and ``'logistric'``
        (``LogisticRegression()``), in the order the counts were given.
    """
    if components is None:
        # Backward-compatible default: fall back to the notebook-level list.
        components = n_com
    return {
        str(n): Pipeline(steps=[
            ('svd', TruncatedSVD(n_components=n)),
            ('logistric', LogisticRegression()),
        ])
        for n in components
    }
# Build the pipelines (one per entry in `n_com`) and display the resulting dict.
models = models_prepare()
models

In [None]:

# Fit and score every SVD + logistic-regression pipeline; `name` is the
# component count used as the dict key.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)

    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    print("The Logistric Regression Trained with svd n_components {} ".format(name))

    classifier_scores(
        y_train=y_train,
        y_test=y_test,
        pred_train=y_pred_train,
        pred_test=y_pred_test,
    )

## Logistic Regression: hyperparameter grid search

In [None]:
# Grid search over solver and regularisation strength C for logistic
# regression, scored by accuracy with 5-fold cross-validation.
log = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]


# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
# error_score=0 records a score of 0 for a failing fit instead of raising.
grid_search = GridSearchCV(estimator=log, param_grid=grid, n_jobs=-1, cv=5, scoring='accuracy',error_score=0)


# Runs on the full X / y; cross-validation supplies the held-out folds.
grid_result = grid_search.fit(X, y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
# `stds` was never defined, so the loop below raised NameError.
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    # Mean CV accuracy, its standard deviation across folds, and the setting.
    print("%f (%f) with: %r" % (mean, stdev, param))