In [5]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix

from wordcloud import WordCloud
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords

# from lime import lime_text
import gensim
import gensim.downloader as gensim_api

import warnings
warnings.filterwarnings('ignore')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option.max_rows = None

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
cat_df = pd.read_csv("categories.csv")
cat_df

Unnamed: 0,id,category
0,1,Layoff
1,2,Mergers and Acquisitions
2,3,Mass Hiring
3,4,Executive Movement
4,5,Centre Setup and Expansion
5,6,Deals
6,7,Partnerships


In [8]:
map_df = pd.read_excel("category_mapping.xlsx",sheet_name="Sheet1")
print(map_df.head())
map_df.duplicated().value_counts()

                           news_id  category_id
0  5cb377684f646938b14a96f1_google            4
1  5cb426d44f646938b10bf6b6_google            4
2  5cb429984f646938b110f937_google            2
3  5cb426b34f646938b10bbeca_google            5
4  5cb426f74f646938b10c3857_google            7


False    2636
True      908
dtype: int64

In [9]:
#There were some duplicates identified in map_df. hence removing them via drop_duplicates command
map_df = map_df.drop_duplicates()
print("total number of unique articles: ",map_df.news_id.nunique())
print("total articles: ",map_df.news_id.count())

total number of unique articles:  2635
total articles:  2636


In [10]:
news_df = pd.read_excel("news_details.xlsx",sheet_name = 'Sheet1')
print(news_df.duplicated().value_counts())
news_df.head()

False    2720
True      824
dtype: int64


Unnamed: 0,news_id,snippet,title,news_description
0,5cb377684f646938b14a96f1_google,"New Delhi: Bank of Baroda, which has become th...",Bank of Baroda to hire consultancy firm to eva...,
1,5cb426d44f646938b10bf6b6_google,He said his plans would be discussed with the ...,Foxconn chairman Gou says he aims to step down...,TAIPEI (Reuters) - The chairman of Taiwan's Fo...
2,5cb429984f646938b110f937_google,ZF Friedrichshafen AG has announced that it ha...,ZF Finalizes USD 7 Billion Deal to Acquire WABCO,Share 0 Share 0\n\nZF Friedrichshafen AG has a...
3,5cb426b34f646938b10bbeca_google,Zhejiang Geely Holding (ZGH) has begun constru...,Geely Begins Work On New Lotus Plant In China,Zhejiang Geely Holding (ZGH) has begun constru...
4,5cb426f74f646938b10c3857_google,KFH has partnered with IDEMIA to launch Kuwait...,KFH partners with IDEMIA's to launch metal pay...,


In [11]:
news_df = news_df.drop_duplicates()
print("Total Number of articles in the news_details data: " , news_df.news_id.nunique())

#Checking whether there is a mapping for every article in the category mapping table. 
pd.Series(news_df.news_id.unique()).isin(map_df.news_id.unique()).value_counts()

Total Number of articles in the news_details data:  2635


True    2635
dtype: int64

In [12]:
#Combining news data with the associated categories and creating the training dataset
cat_map_df = cat_df.merge(map_df,left_on='id',right_on='category_id')[['news_id','category']]
# cat_map_df = pd.crosstab(cat_map_df['news_id'],cat_map_df['category']).reset_index(level='news_id')
cat_map_df

Unnamed: 0,news_id,category
0,5cb516014f646938b1bda75b_google,Layoff
1,5cb42d064f646938b1178e03_google,Layoff
2,5cb419f44f646938b1f851de_google,Layoff
3,5cb6bbcc4f646938b10fe4b8_google,Layoff
4,5cb6c3234f646938b1199b88_google,Layoff
...,...,...
2631,5cd5468c29458140edccef58_google,Partnerships
2632,5cd5abab29458140edab30e8_google,Partnerships
2633,5cd54df629458140edd686f1_google,Partnerships
2634,5cd3cb1d29458140edbab1e6_google,Partnerships


In [13]:
df = news_df.merge(cat_map_df,on='news_id')
df['text'] = (df['snippet']+" "+ df['title']+" "+df['news_description'].fillna("")).astype(str)
print("the dimensions of the new dataframe are:" ,df.shape)
df.head()

the dimensions of the new dataframe are: (2721, 6)


Unnamed: 0,news_id,snippet,title,news_description,category,text
0,5cb377684f646938b14a96f1_google,"New Delhi: Bank of Baroda, which has become th...",Bank of Baroda to hire consultancy firm to eva...,,Executive Movement,"New Delhi: Bank of Baroda, which has become th..."
1,5cb426d44f646938b10bf6b6_google,He said his plans would be discussed with the ...,Foxconn chairman Gou says he aims to step down...,TAIPEI (Reuters) - The chairman of Taiwan's Fo...,Executive Movement,He said his plans would be discussed with the ...
2,5cb429984f646938b110f937_google,ZF Friedrichshafen AG has announced that it ha...,ZF Finalizes USD 7 Billion Deal to Acquire WABCO,Share 0 Share 0\n\nZF Friedrichshafen AG has a...,Mergers and Acquisitions,ZF Friedrichshafen AG has announced that it ha...
3,5cb426b34f646938b10bbeca_google,Zhejiang Geely Holding (ZGH) has begun constru...,Geely Begins Work On New Lotus Plant In China,Zhejiang Geely Holding (ZGH) has begun constru...,Centre Setup and Expansion,Zhejiang Geely Holding (ZGH) has begun constru...
4,5cb426f74f646938b10c3857_google,KFH has partnered with IDEMIA to launch Kuwait...,KFH partners with IDEMIA's to launch metal pay...,,Partnerships,KFH has partnered with IDEMIA to launch Kuwait...


In [14]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = text.split()
    text = [word for word in text if word not in STOPWORDS]
#     Stemming and Lemmatization are not being here as they were resulting into decrease in accuracy of logistic and xgboost model    
#     ps = nltk.stem.porter.PorterStemmer()
#     text = [ps.stem(word) for word in text]
#     lem = nltk.stem.wordnet.WordNetLemmatizer()
#     text = [lem.lemmatize(word) for word in text]
    text = " ".join(text)
    return text
    
df['text'] = df['text'].apply(clean_text)

In [15]:
import ktrain
from ktrain import text

In [17]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(df.category.values)

X_train, X_test, Y_train, Y_test = train_test_split(df.text.values.astype('U'), y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)
(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=X_train, y_train=Y_train,
                                                                       x_test=X_test, y_test=Y_test,
                                                                       class_names=Y_train,
                                                                       preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000)

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [18]:
# you can disregard the deprecation warnings arising from using Keras 2.2.4 with TensorFlow 1.14.
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)

Is Multi-Label? False
maxlen is 350
done.


In [30]:
learner.fit_onecycle(2e-5, 5)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe40448d3c8>

In [31]:
learner.validate(val_data=(x_test, y_test))

              precision    recall  f1-score   support

           0       0.94      0.87      0.91        94
           1       0.82      0.92      0.86        73
           2       0.96      0.96      0.96       143
           3       0.80      0.94      0.86        17
           4       0.88      0.88      0.88        17
           5       0.89      0.94      0.91        77
           6       0.94      0.88      0.91       124

    accuracy                           0.91       545
   macro avg       0.89      0.91      0.90       545
weighted avg       0.92      0.91      0.91       545



array([[ 82,   4,   0,   0,   1,   4,   3],
       [  0,  67,   0,   0,   0,   3,   3],
       [  0,   1, 137,   2,   0,   2,   1],
       [  1,   0,   0,  16,   0,   0,   0],
       [  0,   0,   0,   2,  15,   0,   0],
       [  0,   4,   1,   0,   0,  72,   0],
       [  4,   6,   4,   0,   1,   0, 109]])

> It can be seen above that BERT resulted in 91% accuracy on test data(unseen data)

In [21]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [23]:
predictions = predictor.predict(X_test)
predictions_train = predictor.predict(X_train)
