# Web-classification with SVM and Naive Bayes
Input:
```
    URL
```
Output:
```
    (BBC) Sport-site or Non-Sport site.
```

Files used:
```
link.txt
false_links.txt
```

### Main libraries

In [1]:
from bs4 import BeautifulSoup # work with html
import requests 

import re #regular expression

import string #to remove punctuation
from nltk.corpus import stopwords #get stopwords
from nltk.stem import WordNetLemmatizer #lemmatise words

#tf-idf libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### Data Preprocess

The 3 following functions are to trim the scraped texts. 

In [2]:
def tag_to_string(url):
    '''
        Return the Tag-type data found to String-type. 
    '''
    page = requests.get(url)
    parsing = BeautifulSoup(page.text, "html.parser")
    texts_tag = parsing.findAll('title') + parsing.findAll('p')
    texts = str()
    for text in texts_tag:
        texts += str(text) + ' '
    return texts

def tag_removal(texts):
    '''
        Find all the tag of <.*>
    '''
    pattern = r"<.*?>"
    findings = re.findall(pattern, texts)
    return findings

def get_texts(texts):
    '''
        Replace all the tag of <.*> to get texts
    '''
    findings = tag_removal(texts)
    for finding in findings:
        texts = texts.replace(finding, "")
    return texts       

This function is to preprocess the data

In [3]:
import string #to remove punctuation
from nltk.corpus import stopwords #get stopwords
from nltk.stem import WordNetLemmatizer #lemmatise words

def preprocess(text_data):
    '''
        Remove unwanted and preprocess data
    '''
    #lower text
    text_data = text_data.lower()
    
    #remove punctuation
    text_data = text_data.translate(str.maketrans('', '', string.punctuation))
    
    #remove stopwords
    stop_words = stopwords.words('english')
    for stopword in stop_words:
        pattern = ' ' + stopword + ' '
        try:
            text_data = text_data.replace(pattern, " ")
        except:
            None
    
    
    #remove numbers
    text_data = re.sub(r"[0-9]", "", text_data)
    
    #remove big whitespace
    text_data = re.sub(r"\s{2,}", " ", text_data)
    
    #remove non-latin character
    text_data = re.sub(r"[^a-zA-Z\s]", "", text_data)
    
    #lemmatize
    text_data = text_data.split(' ')
    lmt = WordNetLemmatizer() 
    text_data = list(map(lmt.lemmatize, text_data))
    
    
    return ' '.join(text_data)

In [4]:
def corpus(urls):
    '''
        Create a corpus
    '''
    
    docs = list()
    
    for url in urls:
        texts = tag_to_string(url)
        findings = tag_removal(texts)
        texts = get_texts(texts)
        docs.append(preprocess(texts))
    
    return docs

Open `links.txt` file that contains links from BBC Sports

In [5]:
with open("links.txt", "r") as f:
    links = f.readlines()

docs = corpus(links)

Open `false_links.txt` that contains links from other sections

In [6]:
with open("false_links.txt", "r") as f:
    links = f.readlines()

false_docs = corpus(links)

Create a data frame 

In [7]:
import pandas as pd

Create a tuple with format:
```
    correct link: (data, 1)
    wrong link: (data, 0)
```

In [8]:
list_tuples = list()

for doc in docs:
    list_tuples.append((doc, 1))
for f_doc in false_docs:
    list_tuples.append((f_doc, 0))

Push to dataframe

In [9]:
df = pd.DataFrame({
    'data': [data[0] for data in list_tuples],
    'target': [data[1] for data in list_tuples]
})

Check data schema of the Data Frame

In [10]:
df.dtypes

data      object
target     int64
dtype: object

## Build model

Compute Tf-Idf value

In [11]:
X, y = df.data, df.target

#remove common words
cv = CountVectorizer(max_df=0.5)
X = cv.fit_transform(df.data).toarray()

#Compute the tf-idf value
X = TfidfTransformer().fit_transform(X)

Perform train-test splitting

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

Import ML libraries and Metric Scoring libraries

In [13]:
# ML libraries
from sklearn.naive_bayes import MultinomialNB 
from sklearn import svm #support vector machine

# Metric scoring
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Perform and observe the results from SVM - Support Vector Machine

In [14]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("CONFUSION MATRIXx:\n", confusion_matrix(y_test,y_pred))
print("\nSTATS REPORT:\n", classification_report(y_test,y_pred))
print("\nACCURACY SCORE:\n", accuracy_score(y_test, y_pred))
print("\nPREDICTION RESULTS:\n", y_pred)
print("\nDATA TESTED:\n", y_test)

CONFUSION MATRIXx:
 [[0 2]
 [0 6]]

STATS REPORT:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      1.00      0.86         6

   micro avg       0.75      0.75      0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8


ACCURACY SCORE:
 0.75

PREDICTION RESULTS:
 [1 1 1 1 1 1 1 1]

DATA TESTED:
 15    1
2     1
14    1
23    0
8     1
4     1
16    1
20    0
Name: target, dtype: int64


  'precision', 'predicted', average, warn_for)


Perform and observe the results from Multinomial Naive Bayes

In [15]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("CONFUSION MATRIXx:\n", confusion_matrix(y_test,y_pred))
print("\nSTATS REPORT:\n", classification_report(y_test,y_pred))
print("\nACCURACY SCORE:\n", accuracy_score(y_test, y_pred))
print("\nPREDICTION RESULTS:\n", y_pred)
print("\nDATA TESTED:\n", y_test)

CONFUSION MATRIXx:
 [[1 1]
 [0 6]]

STATS REPORT:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.86      1.00      0.92         6

   micro avg       0.88      0.88      0.88         8
   macro avg       0.93      0.75      0.79         8
weighted avg       0.89      0.88      0.86         8


ACCURACY SCORE:
 0.875

PREDICTION RESULTS:
 [1 1 1 1 1 1 1 0]

DATA TESTED:
 15    1
2     1
14    1
23    0
8     1
4     1
16    1
20    0
Name: target, dtype: int64


As observed, Multinomial NB outperforms the SVM. Therefore, NB will be picked to perform classification.

Below is the probability when predicting a random value in accordance with its class:```[prob to data, prob to target]```

In [16]:
# clf carries NB algorithm
clf.predict_proba(X_test)

array([[0.21686332, 0.78313668],
       [0.16339472, 0.83660528],
       [0.20974684, 0.79025316],
       [0.31208506, 0.68791494],
       [0.24157298, 0.75842702],
       [0.2061244 , 0.7938756 ],
       [0.15039716, 0.84960284],
       [0.5676975 , 0.4323025 ]])

### Classification 

Import random function to random sample the dataset

In [17]:
import random

Prediction function

In [18]:
def prediction(url):

    
    # define target
    X, y = df.data, df.target
    
    #split train-test data
    count_vect = CountVectorizer()
    X = count_vect.fit_transform(df.data).toarray()
    X = TfidfTransformer().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random.randint(10, 40))
    
    #select NB algorithm
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    
    #prediction probabilities
    probs = clf.predict_proba(X_test)
    
    #calculate the average of probabilities
    mean_prob = 0
    for i in probs:
        mean_prob += i[1]
    mean_prob = mean_prob/len(probs)
    

    if mean_prob >= 0.85:
        return ('Sport site.'), mean_prob #Should be Sport-site in this test
    else:
        return ('Non-sport site.'), mean_prob

In [19]:
url2 = "https://www.bbc.com/news/world-us-canada-53788018"
prediction(url2)

('Non-sport site.', 0.7634120338602803)