#### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

#### Importing the Dataset
#### Source :

In [None]:
url = 'Dataset/data_url.csv'
url_csv = pd.read_csv(url)

#converting the data from csv to dataframe for easy handling
url_df = pd.DataFrame(url_csv)

#to convert into array 
url_df = np.array(url_df)  
random.shuffle(url_df)

In [3]:
url_df[0:10] 

array([['diaryofagameaddict.com', 'bad'],
       ['diaryofagameaddict.com', 'bad'],
       ['iamagameaddict.com', 'bad'],
       ['kalantzis.net', 'bad'],
       ['iamagameaddict.com', 'bad'],
       ['espdesign.com.au', 'bad'],
       ['diaryofagameaddict.com', 'bad'],
       ['iamagameaddict.com', 'bad'],
       ['kalantzis.net', 'bad'],
       ['tubemoviez.com', 'bad']], dtype=object)

#### Seperating the data according to it's characteristics

In [4]:
y = [d[1] for d in url_df]                 # 'y' holds the category whether url is good or bad.
urls = [d[0] for d in url_df]              # 'urls' holds the actual url.

#### Since the urls are different from normal text, we need to use a sanitization method to get the relevant data from raw urls.

In [5]:
def sanitization(web):
    web = web.lower()
    token = []
    dot_token_slash = []
    raw_slash = str(web).split('/') # Removes '/' in the url wherever encountered and stores the rest into a list. 
    for i in raw_slash:
        # removing slash to get token
        raw1 = str(i).split('-')                   # Removing '-' in the remaining url and stores the rest into a list.
        slash_token = []
        for j in range(0,len(raw1)):
            # removing dot to get the tokens
            raw2 = str(raw1[j]).split('.')         # Removing '.' in the remaining url and stores the rest into a list.
            slash_token = slash_token + raw2
            dot_token_slash = dot_token_slash + raw1 + slash_token       # Stores the final string after removing all the 
    # to remove same words
    token = list(set(dot_token_slash))  
    if 'com' in token:
        #remove com
        token.remove('com')
    return token

#### We will have to pass the data to our custom vectorizer function using Tf-idf approach 

In [6]:
# term-frequency and inverse-document-frequency
# TF gives us information on how often a term appears in a document and IDF gives us information about the relative rarity of a term in the collection of documents.
vectorizer = TfidfVectorizer(tokenizer=sanitization)

#### Splitting the test set and train set

In [7]:
x = vectorizer.fit_transform(urls)       # will standardize the urls by converting them to numbers.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#### Training

In [8]:
lgr = LogisticRegression(solver='lbfgs', max_iter=1000)                  # Logistic regression (Binary)
lgr.fit(x_train, y_train)
score = lgr.score(x_test, y_test)
vectorizer_save = vectorizer
print("score: {0:.2f} %".format(100 * score)) # Got a score of 98.53 % ; a very good score which terms our model as realistic.

score: 98.53 %


#### Saving the model and vectors

In [9]:
file = "pickel_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgr, f)
f.close()

file2 = "pickel_vector.pkl"
with open(file2,'wb') as f2:
    pickle.dump(vectorizer_save, f2)
f2.close()