In [132]:
import re
import pickle
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


import warnings

warnings.filterwarnings("ignore")

In [53]:
stop_words = set(stopwords.words('english'))

In [15]:
# book_df = pd.read_csv("data/goodreads_books.csv")
# book_df.head(2)
# book_df.shape
# book_df.columns

### Data Collection

In [6]:
# https://www.kaggle.com/yufengdev/bbc-fulltext-and-category?select=bbc-text.csv
bbc_df = pd.read_csv("data/bbc-text.csv")
bbc_df.head(2)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...


### Data Exploration

In [20]:
bbc_df.shape

(2225, 2)

In [46]:
# No of records for all type of category
bbc_df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [23]:
bbc_df.isna().sum()

category    0
text        0
dtype: int64

In [47]:
bbc_df.duplicated().sum()

99

In [32]:
bbc_df[bbc_df.duplicated()].sort_values('text')

Unnamed: 0,category,text
85,politics,hague given up his pm ambition former conser...
301,politics,fox attacks blair s tory lies tony blair lie...
496,tech,microsoft gets the blogging bug software giant...
543,business,economy strong in election year uk businesse...
582,entertainment,ray dvd beats box office takings oscar-nominat...
...,...,...
2206,politics,kennedy questions trust of blair lib dem leade...
2207,tech,california sets fines for spyware the makers o...
2213,tech,progress on new internet domains by early 2005...
2215,tech,junk e-mails on relentless rise spam traffic i...


In [38]:
bbc_df.loc[85]['text']

'hague  given up  his pm ambition former conservative leader william hague says he will not stand for the leadership again  having given up his ambition to be prime minister.  mr hague  43  told the daily telegraph he would now find a life dominated by politics too  boring  and unfulfilling. mr hague  who stepped down after his party s 2001 election defeat  does not rule out a return to the front bench. he also told the paper he hopes to remain mp for richmond  north yorks  and start a family with wife ffion. mr hague  who recently had published the biography of william pitt the younger  also said he wanted to continue writing books and speech-writing.  he told the newspaper:  i don t know whether i will ever go back on to the front  but don t rush me.  asked if he would stand for the leadership again  mr hague replied:  no. definitely not.  his determination to stay away from a central role will disappoint some senior conservative members  who say the party needs him. tim collins  the

In [41]:
for idx in range(len(bbc_df)):
    if "hague  given up  his pm ambition former conservative leader william hague says" in bbc_df.loc[idx]['text']:
#         print(bbc_df.loc[idx]['text'])
        print(idx)

13
85


In [45]:
print(bbc_df.loc[13]['text'])
print("***********************")
print(bbc_df.loc[85]['text'])

hague  given up  his pm ambition former conservative leader william hague says he will not stand for the leadership again  having given up his ambition to be prime minister.  mr hague  43  told the daily telegraph he would now find a life dominated by politics too  boring  and unfulfilling. mr hague  who stepped down after his party s 2001 election defeat  does not rule out a return to the front bench. he also told the paper he hopes to remain mp for richmond  north yorks  and start a family with wife ffion. mr hague  who recently had published the biography of william pitt the younger  also said he wanted to continue writing books and speech-writing.  he told the newspaper:  i don t know whether i will ever go back on to the front  but don t rush me.  asked if he would stand for the leadership again  mr hague replied:  no. definitely not.  his determination to stay away from a central role will disappoint some senior conservative members  who say the party needs him. tim collins  the 

### Data Engineering

In [48]:
bbc_unique_df = bbc_df.drop_duplicates()
print(bbc_unique_df.shape)
bbc_unique_df.head(2)

(2126, 2)


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...


In [83]:
bbc_unique_df.reset_index(inplace=True, drop=True)

In [50]:
bbc_unique_df.value_counts("category")

category
sport            504
business         503
politics         403
entertainment    369
tech             347
dtype: int64

In [51]:
bbc_unique_df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [72]:
# Punctuation removal Using regex
def punctuation_removal(test_str):
    clean_text = re.sub(r'[^\w\s]', ' ', test_str)
    clean_text = [i for i in clean_text.split() if len(i) > 0]
    clean_text = " ".join(clean_text).strip()
    
    return  clean_text


# Remove stopwords
def stopwords_removal(sent):
    word_tokens = word_tokenize(sent)  
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = " ".join(filtered_sentence)
    
    return filtered_sentence


In [77]:
sentence = "The process of converting data to something a computer @@#1 can understand is referred to as pre-processing."

clean_text = punctuation_removal(sentence)
print(clean_text)
clean_text = stopwords_removal(sentence)
print(clean_text)

The process of converting data to something a computer 1 can understand is referred to as pre processing
The process converting data something computer @ @ # 1 understand referred pre-processing .


In [84]:
all_clean_text = []

for idx in range(len(bbc_unique_df)):
    text = bbc_unique_df.loc[idx]['text']
    clean_text = punctuation_removal(text)
    clean_text = stopwords_removal(clean_text)
    
    all_clean_text.append(clean_text)
    


In [89]:
bbc_unique_df['text'] = all_clean_text


### Data Preparation

In [128]:
X = list(bbc_unique_df['text'])
y = list(bbc_unique_df['category'])


In [129]:
#Split data in train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


### Model Training and Evaluation

In [96]:
#Use pipeline to carry out steps in sequence with a single object
#SVM's rbf kernel gives highest accuracy in this classification problem.
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(kernel='rbf'))])


In [97]:
#train model
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC())])

In [100]:
# save the model to disk
filename = 'model/svm_model.sav'
pickle.dump(text_clf, open(filename, 'wb'))


In [101]:
# load the model from disk
filename = 'model/svm_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))


In [134]:
# Confusion Matrix
y_pred = text_clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[124,   0,   2,   0,   0],
       [  1,  92,   0,   0,   0],
       [  2,   0,  99,   0,   0],
       [  0,   0,   0, 128,   0],
       [  0,   0,   0,   1,  83]], dtype=int64)

In [137]:
# Classification Matrix
print(f1_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='macro'))

0.9890301388022038
0.9887333275066503
0.9890301388022038


In [139]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.98      0.98      0.98       126
entertainment       1.00      0.99      0.99        93
     politics       0.98      0.98      0.98       101
        sport       0.99      1.00      1.00       128
         tech       1.00      0.99      0.99        84

     accuracy                           0.99       532
    macro avg       0.99      0.99      0.99       532
 weighted avg       0.99      0.99      0.99       532



In [140]:
accuracy_score(y_test, y_pred)

0.9887218045112782

### Model Hyperparameter Tuning

In [112]:
cv_tfidf = Pipeline([('count', CountVectorizer()), ('tfid', TfidfTransformer())])


In [115]:
X_tfidf = cv_tfidf.fit_transform(all_clean_text).toarray()
X_tfidf.shape

(2126, 29279)

In [121]:
y = np.array(bbc_unique_df['category'])
y.shape

(2126,)

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25)

In [124]:
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['rbf', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.241, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.238, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.9min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.238, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.238, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.239, total= 1.5min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.765, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.768, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.715, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.771, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .

[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.236, total= 1.4min
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.966, total= 1.3min
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.981, total= 1.3min
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] .

[CV] .... C=10, gamma=0.001, kernel=linear, score=0.981, total= 1.3min
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.965, total= 1.3min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.966, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.978, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.966, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.981, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.959, total= 1.5min
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 219.7min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'linear']},
             verbose=3)

In [125]:
# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')


In [142]:
final_text_clf = Pipeline([('vect', CountVectorizer()), 
                           ('tfidf', TfidfTransformer()), 
                           ('clf', SVC(kernel='linear', C=1, gamma=1))])

final_text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC(C=1, gamma=1, kernel='linear'))])

In [143]:
# Confusion Matrix
y_pred = final_text_clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[120,   0,   4,   0,   2],
       [  1,  91,   1,   0,   0],
       [  2,   0,  99,   0,   0],
       [  0,   0,   0, 128,   0],
       [  1,   0,   0,   1,  82]], dtype=int64)

In [144]:
# Classification Matrix
print(f1_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='macro'))

0.9774567037852024
0.9774479238200555
0.9774567037852024


In [145]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.97      0.95      0.96       126
entertainment       1.00      0.98      0.99        93
     politics       0.95      0.98      0.97       101
        sport       0.99      1.00      1.00       128
         tech       0.98      0.98      0.98        84

     accuracy                           0.98       532
    macro avg       0.98      0.98      0.98       532
 weighted avg       0.98      0.98      0.98       532



In [146]:
accuracy_score(y_test, y_pred)

0.9774436090225563

In [147]:
# Final save the model to disk
filename = 'model/fina_svm_model.sav'
pickle.dump(text_clf, open(filename, 'wb'))


#### Final model accuracy is 97.74%