# Introduction
In this notebook, I build text classification models: the text is vectorized with TF-IDF and then classified with Random Forest, Naive Bayes, and SVM.

# Feature engineering

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import numpy as np

# Print the installed spaCy version so the environment of this run is recorded in the output.
print(f"Spacy version: {spacy.__version__}")

# Load the 'en_core_web_sm' spaCy pipeline; text_preprocessing() below runs every text through it.
nlp_spacy = spacy.load('en_core_web_sm')

def read_volcabulary(filename):
    """Load the vocabulary words from a ``word,count`` text file.

    Each non-blank line must end with ``,<count>``; only the word part is
    kept. The line is split on its LAST comma so that words which themselves
    contain a comma (e.g. ``10,000``) are read correctly, and blank lines
    (such as a trailing empty line at the end of the file) are skipped.

    Args:
        filename: Path of a UTF-8 encoded vocabulary file.

    Returns:
        numpy.ndarray: The words in file order, stripped of surrounding
        whitespace.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    volcabulary = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Tolerate empty lines instead of failing on the unpacking below.
                continue
            # Split once, from the right: everything before the last comma is the word.
            word, _count = entry.rsplit(',', 1)
            volcabulary.append(word.strip())

    return np.array(volcabulary)
            
VOCABULARY = read_volcabulary('corpus/vocabulary.txt')

# Hash-based copy of VOCABULARY used for membership tests. `word in <numpy array>`
# compares the word against every element of the array for each token, whereas a
# frozenset lookup does not depend on the vocabulary size.
VOCABULARY_SET = frozenset(VOCABULARY.tolist())

# Placeholder emitted instead of the token when its spaCy entity type is one of these.
ENTITY_PLACEHOLDERS = {
    'ORDINAL': '_ordinal_',
    'TIME': '_time_',
    'QUANTITY': '_quantity_',
}

def text_preprocessing(text):
    """Turn raw text into a list of cleaned tokens.

    The text is parsed with ``nlp_spacy`` and each token is handled as follows:

    * URLs, punctuation and whitespace tokens are dropped.
    * Currency tokens become ``'_currency_'``.
    * Tokens whose entity type is ORDINAL, TIME or QUANTITY become
      ``'_ordinal_'``, ``'_time_'`` or ``'_quantity_'`` respectively.
    * Any other token is replaced by its lemma if that lemma is in
      ``VOCABULARY``; otherwise it is dropped.

    Args:
        text: The raw text to clean.

    Returns:
        list of str: The cleaned tokens, in document order.
    """
    #text = correct_spelling(text)

    doc = nlp_spacy(text)

    clean_bag_words = []
    for token in doc:
        if token.like_url or token.is_punct or token.is_space:
            continue

        # The currency check comes first, exactly as in the original if/elif chain.
        if token.is_currency:
            clean_bag_words.append('_currency_')
        elif token.ent_type_ in ENTITY_PLACEHOLDERS:
            clean_bag_words.append(ENTITY_PLACEHOLDERS[token.ent_type_])
        elif token.lemma_ in VOCABULARY_SET:
            clean_bag_words.append(token.lemma_)

    return clean_bag_words
    

Spacy version: 2.1.4


In [2]:
print(VOCABULARY[:20])

['year' '_currency_' 'new' 'people' 'time' 'win' 'good' 'game' '_time_'
 'film' 'world' 'government' 'play' 'go' 'come' 'work' '_ordinal_'
 'company' 'take' 'firm']


In [3]:
# Read the training set and the test set.
import pandas as pd

# The training CSV already carries the preprocessed CleanText column (see the preview output).
trainning_set = pd.read_csv('corpus/clean_training_set.csv')
# Raw test articles (ArticleId and Text only); test_set is re-assigned from the cleaned CSV in a later cell.
test_set = pd.read_csv('dataset/BBC News Test.csv')

trainning_set.head()

Unnamed: 0,ArticleId,Text,Category,CleanText
0,1833,worldcom ex-boss launches defence lawyers defe...,business,"['worldcom', 'boss', 'launch', 'defence', 'law..."
1,154,german business confidence slides german busin...,business,"['german', 'business', 'confidence', 'slide', ..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,"['bbc', 'poll', 'indicate', 'economic', 'gloom..."
3,1976,lifestyle governs mobile choice faster bett...,tech,"['lifestyle', 'govern', 'mobile', 'choice', 'f..."
4,917,enron bosses in $168m payout eighteen former e...,business,"['enron', 'boss', '_currency_', 'payout', 'eig..."


In [4]:
test_set.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [5]:
# Preprocess the Text feature of the test set.
# NOTE(review): the two commented-out lines below look like the way CleanText was originally
# computed (text_preprocessing applied to every article through tqdm's thread_map); the
# notebook now loads a previously saved result instead of recomputing it — confirm.
#from tqdm.contrib.concurrent import thread_map
#test_set['CleanText'] = thread_map(text_preprocessing, test_set['Text'])
test_set = pd.read_csv('corpus/clean_test_set.csv')

In [6]:
#test_set['CleanText'] = test_set.CleanText.astype(str)

In [7]:
#Saving test_set
#test_set.to_csv('corpus/clean_test_set.csv', index=False)

In [8]:
test_set.head()

Unnamed: 0,ArticleId,Text,CleanText
0,1018,qpr keeper day heads for preston queens park r...,"['qpr', 'keeper', 'day', 'head', 'preston', 'q..."
1,1319,software watching while you work software that...,"['software', 'watch', '-PRON-', 'work', 'softw..."
2,1138,d arcy injury adds to ireland woe gordon d arc...,"['arcy', 'injury', 'add', 'ireland', 'woe', 'g..."
3,459,india s reliance family feud heats up the ongo...,"['india', 'reliance', 'family', 'feud', 'heat'..."
4,1020,boro suffer morrison injury blow middlesbrough...,"['boro', 'suffer', 'morrison', 'injury', 'blow..."


In [9]:
test_set.tail()

Unnamed: 0,ArticleId,Text,CleanText
730,1923,eu to probe alitalia state aid the european ...,"['probe', 'state', 'aid', 'european', 'commiss..."
731,373,u2 to play at grammy awards show irish rock ba...,"['play', 'grammy', 'award', 'show', 'irish', '..."
732,1704,sport betting rules in spotlight a group of mp...,"['sport', 'bet', 'rule', 'in', 'spotlight', 'g..."
733,206,alfa romeos to get gm engines fiat is to sto...,"['alfa', 'get', 'engine', 'fiat', 'stop', 'mak..."
734,471,citizenship event for 18s touted citizenship c...,"['citizenship', 'event', 'tout', 'citizenship'..."


# Modeling Using TF-IDF

In [23]:
# Unigram TF-IDF features: the vectorizer is fitted on the training articles only
# and then re-used, unchanged, to encode the test articles.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.1)

# Both matrices are converted to dense NumPy arrays with .toarray().
trainning_vector = vectorizer.fit_transform(trainning_set['CleanText']).toarray()
test_vector = vectorizer.transform(test_set['CleanText'].values).toarray()

# Sanity check: (rows, features) of each matrix, then the container type.
print(trainning_vector.shape, test_vector.shape, type(trainning_vector), sep='\n')

(1490, 208)
(735, 208)
<class 'numpy.ndarray'>


In [11]:
# Initialise a label encoder for the news Category column.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Encode the target with the label encoder and keep the result in the variable y.
y = encoder.fit_transform(trainning_set['Category'].values)
print(y.shape)

(1490,)


In [12]:
#Model
from sklearn.svm import SVC
from sklearn.naive_bayes import  GaussianNB
from sklearn.ensemble import RandomForestClassifier

#Evalution Model
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, ShuffleSplit

# Cross-validation scheme: 5 random shuffles, each training on 70% of the rows and
# validating on 20%; the remaining 10% is intentionally left out of every split.
cv_split = ShuffleSplit(n_splits = 5, test_size = .2,
                        train_size = .7, random_state = 0)

# Hyper-parameter grids, listed in the same order as the estimators in MLA
# (the two lists are paired with zip() by the grid-search cells).
param_grids = [
    #Random Forest
    {'n_estimators': [100, 150, 250],
     'criterion': ['gini', 'entropy']},

    #GaussianNB (no hyper-parameters are searched)
    {},

    #SVC
    {'C': [1, 10, 100],
     'gamma': [0.01, 0.1, 0.001]},

]

# Candidate estimators, in the same order as param_grids.
MLA = [
    # Seeded so that the forest, and therefore the reported scores and the
    # selected best parameters, are reproducible from run to run.
    RandomForestClassifier(random_state=0),
    GaussianNB(),
    SVC(),
]

In [13]:
%%time
import multiprocessing

# Leave two cores free for the rest of the system, but never go below one worker:
# on a machine with one or two cores `cpu_count() - 2` would be -1 (which joblib
# interprets as "use every core") or 0 (which joblib rejects).
MAX_WORKER = max(1, multiprocessing.cpu_count() - 2)

# One row per estimator is written into this frame by the loop below.
report = pd.DataFrame()
scoring = {'f1': 'f1_macro', 'precision': 'precision_macro', 'recall':'recall_macro'}

# `row` is the index of the next report row; later cells keep appending from it.
row = 0
for mla, param in zip(MLA, param_grids):
    # Grid-search each estimator over its own grid; the best candidate is chosen by macro F1.
    gscv = GridSearchCV(mla, param, cv = cv_split, return_train_score=True, n_jobs=MAX_WORKER,
                        scoring=scoring, refit='f1', error_score='raise')
    gscv.fit(trainning_vector,y)

    # Position of the best parameter combination inside cv_results_.
    best_index = gscv.best_index_

    report.loc[row, 'algorithm'] = gscv.best_estimator_.__class__.__name__ + "_TF_IDF_ONE_GRAM"
    report.loc[row, 'best_params'] = str(gscv.best_params_)
    report.loc[row, 'f1_train'] = gscv.cv_results_['mean_train_f1'][best_index]
    report.loc[row, 'f1_test'] = gscv.cv_results_['mean_test_f1'][best_index]
    report.loc[row, 'recall_train'] = gscv.cv_results_['mean_train_recall'][best_index]
    report.loc[row, 'recall_test'] = gscv.cv_results_['mean_test_recall'][best_index]
    report.loc[row, 'precision_train'] = gscv.cv_results_['mean_train_precision'][best_index]
    report.loc[row, 'precision_test'] = gscv.cv_results_['mean_test_precision'][best_index]
    report.loc[row, 'fit_time'] = gscv.cv_results_['mean_fit_time'][best_index]

    row+=1


Wall time: 51.2 s


In [14]:
report

Unnamed: 0,algorithm,best_params,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test,fit_time
0,RandomForestClassifier_TF_IDF_ONE_GRAM,"{'criterion': 'gini', 'n_estimators': 250}",1.0,0.911843,1.0,0.910633,1.0,0.914503,1.66159
1,GaussianNB_TF_IDF_ONE_GRAM,{},0.931458,0.858537,0.931203,0.859601,0.933053,0.861169,0.013115
2,SVC_TF_IDF_ONE_GRAM,"{'C': 10, 'gamma': 0.1}",0.986628,0.900158,0.986545,0.900327,0.98674,0.902152,0.262039


- There are signs of over-fitting with Random Forest and SVC: their training scores are well above their cross-validation scores.

- In general, all three models achieve high scores.



# Modeling using N-Gram model + TF-IDF

## Uni-Gram + 2-Gram

In [24]:
# Uni-gram + 2-gram TF-IDF features: the vectorizer is fitted on the training
# articles only and then re-used, unchanged, to encode the test articles.
# NOTE(review): min_df is 0.01 here while the unigram vectorizer earlier in the
# notebook uses 0.1, so the two experiments differ in more than ngram_range —
# confirm this is intended.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.95, min_df=0.01)

# Both matrices are converted to dense NumPy arrays with .toarray().
trainning_vector = vectorizer.fit_transform(trainning_set['CleanText']).toarray()
test_vector = vectorizer.transform(test_set['CleanText'].values).toarray()

# Sanity check: (rows, features) of each matrix, then the container type.
print(trainning_vector.shape, test_vector.shape, type(trainning_vector), sep='\n')

(1490, 2615)
(735, 2615)
<class 'numpy.ndarray'>


In [20]:
%%time
scoring = {'f1': 'f1_macro', 'precision': 'precision_macro', 'recall':'recall_macro'}

# Suffix that tags the rows written to `report` by this experiment.
ngram_tag = "_TF_IDF_ONE_TWO_GRAM"

# Make the cell safe to re-run: drop any rows left by a previous run of this
# experiment and continue numbering from the rows that remain, instead of relying
# on the value of `row` left behind by whichever cell ran last.
report = report[~report['algorithm'].str.endswith(ngram_tag, na=False)].reset_index(drop=True)
row = len(report)

for mla, param in zip(MLA, param_grids):
    # Grid-search each estimator over its own grid; the best candidate is chosen by macro F1.
    gscv = GridSearchCV(mla, param, cv = cv_split, return_train_score=True, n_jobs=MAX_WORKER,
                        scoring=scoring, refit='f1', error_score='raise')
    gscv.fit(trainning_vector,y)

    # Position of the best parameter combination inside cv_results_.
    best_index = gscv.best_index_

    report.loc[row, 'algorithm'] = gscv.best_estimator_.__class__.__name__ + ngram_tag
    report.loc[row, 'best_params'] = str(gscv.best_params_)
    report.loc[row, 'f1_train'] = gscv.cv_results_['mean_train_f1'][best_index]
    report.loc[row, 'f1_test'] = gscv.cv_results_['mean_test_f1'][best_index]
    report.loc[row, 'recall_train'] = gscv.cv_results_['mean_train_recall'][best_index]
    report.loc[row, 'recall_test'] = gscv.cv_results_['mean_test_recall'][best_index]
    report.loc[row, 'precision_train'] = gscv.cv_results_['mean_train_precision'][best_index]
    report.loc[row, 'precision_test'] = gscv.cv_results_['mean_test_precision'][best_index]
    report.loc[row, 'fit_time'] = gscv.cv_results_['mean_fit_time'][best_index]

    row+=1

report

Wall time: 2min 50s


Unnamed: 0,algorithm,best_params,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test,fit_time
0,RandomForestClassifier_TF_IDF_ONE_GRAM,"{'criterion': 'gini', 'n_estimators': 250}",1.0,0.911843,1.0,0.910633,1.0,0.914503,1.66159
1,GaussianNB_TF_IDF_ONE_GRAM,{},0.931458,0.858537,0.931203,0.859601,0.933053,0.861169,0.013115
2,SVC_TF_IDF_ONE_GRAM,"{'C': 10, 'gamma': 0.1}",0.986628,0.900158,0.986545,0.900327,0.98674,0.902152,0.262039
3,RandomForestClassifier_TF_IDF_ONE_TWO_GRAM,"{'criterion': 'entropy', 'n_estimators': 150}",1.0,0.947797,1.0,0.946281,1.0,0.950623,2.689887
4,GaussianNB_TF_IDF_ONE_TWO_GRAM,{},1.0,0.901938,1.0,0.905434,1.0,0.903421,0.193932
5,SVC_TF_IDF_ONE_TWO_GRAM,"{'C': 10, 'gamma': 0.1}",1.0,0.970375,1.0,0.970559,1.0,0.9706,6.497143


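- Overall, the cross-validation F1 score increased by about 5 percentage points on average.
- SVC has the best performance with the uni-gram + 2-gram model.
- The fit time rose dramatically.
- Caveat: `min_df` was also lowered from 0.1 to 0.01 for this experiment (208 → 2615 features), so the improvement cannot be attributed to the 2-grams alone.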



In [22]:
# Save the report (one row per estimator and feature set) to a CSV file, without the row index.
report.to_csv('report_training.csv', index=False)

# Conclusion
- Best-performing model: SVC_TF_IDF_ONE_TWO_GRAM (Support Vector Machine + uni-gram/2-gram model + TF-IDF).
- Only simple machine-learning algorithms were used (no BERT, Word2Vec, or neural networks), yet every model except the uni-gram GaussianNB (about 86%) scores above 90% on cross-validation F1, and the best F1 score is 97%.
- Next step: try a Word2Vec model to see whether it performs better.
