### Agenda
* Step1: Testing with open source (bbc)
* Step2: Applied Tf-idf to figure out the weightage of important keywords
* Step3: Compare the metrics of two Classification Algorithms: Multinomial Naive Bayes and Support Vector Machine (SVM)
* Step4: Calculate the prediction probability by taking the mean. 
* Step5: Run the real gambling links and check

In [1]:
import pandas as pd
import requests, io, re
from bs4 import BeautifulSoup  
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords, wordnet
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.stem import WordNetLemmatizer
import random
#import numpy as np

In [2]:
rite_links = ['https://www.bbc.com/sport/football/51685405',
             'https://www.bbc.com/sport/rugby-union/51784446',
             'https://www.bbc.com/sport/football/51784350',
             'https://www.bbc.com/sport/football/51685402',
             'https://www.bbc.com/sport/football/51685406',
             'https://www.bbc.com/sport/football/51785880',
             'https://www.bbc.com/sport/cricket/51784104',
             'https://www.bbc.com/sport/football/51693762',
             'https://www.bbc.com/sport/rugby-union/51745091',
             'https://www.bbc.com/sport/football/51787338',
             'https://www.bbc.com/sport/football/51793861',
             'https://www.bbc.com/sport/football/51693759',
             'https://www.bbc.com/sport/football/51693745',
             'https://www.bbc.com/sport/football/51793861',
             'https://www.bbc.com/sport/football/51787031',
             'https://www.bbc.com/sport/football/51786497',
             'https://www.bbc.com/sport/football/51163821',
             'https://www.bbc.com/sport/football/51786554']

wrong_links = ['https://www.bbc.com/news/uk-51800196',
              'https://www.bbc.com/news/business-51796806',
              'https://www.bbc.com/news/world-europe-51799956',
              'http://www.bbc.com/travel/story/20200308-japans-ancient-way-to-save-the-planet',
              'https://www.bbc.com/future/article/20200306-how-to-live-without-time']

def preprocess(txt):
    pattern1 = re.compile(r'<.*?>')   
    translator = str.maketrans('', '', string.punctuation)
    lmt = WordNetLemmatizer()


    for i in range(len(txt)):  
        txt[i] = str(txt[i]).lower()
        for p1 in pattern1.findall(txt[i]):
            txt[i] = txt[i].replace(p1, '').strip()
            txt[i] = txt[i].translate(translator)
            txt[i] = ' '.join([lmt.lemmatize(i) for i in txt[i].split()])

    return ' '.join([sentence for sentence in txt])

def test_doc(link):
    page = requests.get(link)
    text = BeautifulSoup(page.text, 'html.parser').findAll('p')
    return preprocess(text)

def based_docs(links):
    docs = list()
    for link in links:
        page = requests.get(link)
        text = BeautifulSoup(page.text, 'html.parser').findAll('p')
        cleaned_text = preprocess(text)
        docs.append(cleaned_text)
    return docs # ==> return [a, b, c]

In [3]:
rite_docs = based_docs(rite_links)
for i, v in enumerate(rite_docs):
    rite_docs[i] = (rite_links[i], v, 1)

wrong_docs = based_docs(wrong_links)
for i, v in enumerate(wrong_docs):
    wrong_docs[i] = (wrong_links[i], v, 0)
    
docs = rite_docs + wrong_docs

In [4]:
df = pd.DataFrame({'links': [docs[i][0] for i in range(len(docs))],
                   'data': [docs[i][1] for i in range(len(docs))],
                'target': [docs[i][2] for i in range(len(docs))]})
df.head(n=5)

Unnamed: 0,links,data,target
0,https://www.bbc.com/sport/football/51685405,img alt height1 srchttpsa1apibbccoukhitxitiamp...,1
1,https://www.bbc.com/sport/rugby-union/51784446,img alt height1 srchttpsa1apibbccoukhitxitiamp...,1
2,https://www.bbc.com/sport/football/51784350,img alt height1 srchttpsa1apibbccoukhitxitiamp...,1
3,https://www.bbc.com/sport/football/51685402,img alt height1 srchttpsa1apibbccoukhitxitiamp...,1
4,https://www.bbc.com/sport/football/51685406,img alt height1 srchttpsa1apibbccoukhitxitiamp...,1


Train-Test dataset splitting + Applying Tf-Idf

In [5]:
X, y = df.data, df.target

count_vect = CountVectorizer()
X = count_vect.fit_transform(df.data).toarray()
X = TfidfTransformer().fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

Result from Support Vector Machine algorithm 

In [6]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(y_pred)
print(y_test)

[[0 1]
 [0 6]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.86      1.00      0.92         6

   micro avg       0.86      0.86      0.86         7
   macro avg       0.43      0.50      0.46         7
weighted avg       0.73      0.86      0.79         7

0.8571428571428571
[1 1 1 1 1 1 1]
22    0
17    1
11    1
15    1
4     1
13    1
8     1
Name: target, dtype: int64


  'precision', 'predicted', average, warn_for)


Result from Multinomial Naive Bayes algorithm

In [7]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(y_pred)
print(y_test)

[[0 1]
 [0 6]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.86      1.00      0.92         6

   micro avg       0.86      0.86      0.86         7
   macro avg       0.43      0.50      0.46         7
weighted avg       0.73      0.86      0.79         7

0.8571428571428571
[1 1 1 1 1 1 1]
22    0
17    1
11    1
15    1
4     1
13    1
8     1
Name: target, dtype: int64


  'precision', 'predicted', average, warn_for)


The accuracy of two methods are the same. Hence, it makes no difference to choose one over another. 

Probability when predicting a random value in accordance with its class [[prob to classify text, prob to classify target]]

In [8]:
clf.predict_proba(X_test)

array([[0.14550839, 0.85449161],
       [0.17164962, 0.82835038],
       [0.05666256, 0.94333744],
       [0.14296042, 0.85703958],
       [0.0747194 , 0.9252806 ],
       [0.07284682, 0.92715318],
       [0.10888412, 0.89111588]])

In [9]:
def prediction(url):
    X, y = df.data, df.target

    count_vect = CountVectorizer()
    X = count_vect.fit_transform(df.data).toarray()
    X = TfidfTransformer().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random.randint(10, 51))
    

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    
    probs = clf.predict_proba(X_test)
    mean_prob = 0
    for i in probs:
        mean_prob += i[1]
    mean_prob = mean_prob/len(probs)
    
    
    #if url not in df['links']:
    # depending on the prob => df.append(pd.DataFrame({'links':[], 'data':[], 'target': []}))
    
    if mean_prob >= 0.85:
        return ('Gambling site.'), mean_prob #Should be Sport-site in this test
    else:
        return ('Non-gambling site.'), mean_prob



In [10]:
prediction('https://www.bbc.com/sport/football/51800667')

('Gambling site.', 0.9253904470489545)

#### Classifying Gamble Site

In [None]:
gamble_links = list()
nongamble_links = list()

with open('dir/links_gamble.txt', 'r') as f:
    for link in f:
        gamble_links.append(link)
    f.close()

with open('dir/links_nongamble.txt', 'r') as f:
    for link in f:
        nongamble_links.append(link)
    f.close()

gamble_docs = based_docs(gamble_links)
for i, text in enumerate(gamble_docs):
    gameble_docs[i] = (gamble_links[i], text, 1)

nongameble_docs = based_docs(nongamble_links)
for i, text in enumerat(nongamble_docs):
    nongamble_docs[i] = (nongamble_links[i], text, 0)

docs = gamble_docs + nongamble_docs
df = pd.DataFram({'links': [docs[i][0] for i in range(len(docs))],
                'data': [docs[i][1] for i in range(len(docs))],
                'target': [docs[i][2] for i in range(len(docs))]})

In [11]:
#prediction(your url)