## Malicious URLs Classification

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
# Install a global "ignore" filter: every warning category is silenced for
# the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used further
# down — confirm that is intended.
warnings.filterwarnings('ignore')

#### Import Dataset

In [3]:
# Load the raw dataset from 'malicious_urls.csv' (relative to the working
# directory) and preview its first three rows.
URLs_df = pd.read_csv('malicious_urls.csv')
URLs_df.head(3)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign


## Dataset Properties

#### Shape

In [4]:
# (number of rows, number of columns) of the raw dataset.
URLs_df.shape

(651191, 2)

#### Columns

In [5]:
# Column labels of the raw dataset.
URLs_df.columns

Index(['url', 'type'], dtype='object')

#### Categorical Columns

In [6]:
# Labels of the columns stored with the object dtype.
URLs_df.select_dtypes(include='object').columns

Index(['url', 'type'], dtype='object')

#### Dataframe Info

In [7]:
# Print per-column non-null counts and dtypes plus the frame's memory usage.
URLs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  object
dtypes: object(2)
memory usage: 9.9+ MB


## Data Cleaning

#### Remove Duplicates

In [8]:
# Number of rows that repeat an earlier row exactly.
URLs_df.duplicated().sum()

10066

In [9]:
# Keep the first occurrence of each row and renumber the index from 0.
URLs_df = URLs_df.drop_duplicates(ignore_index=True)

In [10]:
# Shape after duplicate rows were removed.
URLs_df.shape

(641125, 2)

#### Nulls

In [11]:
# Missing-value count per column.
URLs_df.isnull().sum()

url     0
type    0
dtype: int64

#### Uniques

In [12]:
# Distinct class labels present in the 'type' column.
URLs_df['type'].unique()

array(['phishing', 'benign', 'defacement', 'malware'], dtype=object)

#### Value Counts

In [13]:
# Row count per class label, largest first; shows how imbalanced the raw
# dataset is.
URLs_df['type'].value_counts()

type
benign        428080
defacement     95308
phishing       94092
malware        23645
Name: count, dtype: int64

#### Under Sampling

In [14]:
from imblearn.under_sampling import RandomUnderSampler

# Seed the sampler so the balanced subset (and every result derived from it)
# is the same on each run; 42 matches the seed passed to train_test_split
# later in this notebook.
sampler = RandomUnderSampler(random_state=42)

# Randomly discard rows from the larger classes so all classes end up with
# the same number of rows. The feature is passed as a one-column DataFrame
# because the sampler expects 2-D input.
X, Y = sampler.fit_resample(URLs_df[['url']], URLs_df['type'])

In [15]:
# Put the resampled feature and label side by side. ignore_index=True along
# the column axis replaces the column labels with 0 and 1.
urls_df = pd.concat(
    objs=[X, Y],
    axis='columns',
    ignore_index=True,
)
urls_df.shape

(94580, 2)

In [16]:
# Renumber the rows from 0; the previous index is kept as a new 'index' column.
urls_df = urls_df.reset_index()

In [17]:
# Display the balanced frame.
urls_df

Unnamed: 0,index,0,1
0,14554,mediaonline.net/en/televisions/kststvtelemundo...,benign
1,44379,overtheair.saveandreplay.com/,benign
2,274620,ascsports.org/oldsite2/Soccer/mississippi.htm,benign
3,196789,tvstations.usradiostations.info/tvstation/7061...,benign
4,388296,tvduck.com/Deliver-Us-from-Eva.htm,benign
...,...,...,...
94575,611981,tools.ietf.org/html/rfc851,phishing
94576,10534,kefthymioudevelopment.com,phishing
94577,637795,www.angelfire.com/sd/isengard/,phishing
94578,349523,usaa.com-inet-truememberent-iscaddetour-start-...,phishing


#### Drop Index column

In [18]:
# Discard the 'index' column left behind by reset_index.
urls_df = urls_df.drop(columns='index')

In [19]:
# Preview the first three rows after the 'index' column was dropped.
urls_df.head(3)

Unnamed: 0,0,1
0,mediaonline.net/en/televisions/kststvtelemundo...,benign
1,overtheair.saveandreplay.com/,benign
2,ascsports.org/oldsite2/Soccer/mississippi.htm,benign


#### Rename Columns

In [20]:
# Restore meaningful labels for the positional column names 0 and 1.
urls_df = urls_df.rename(columns={0: 'url', 1: 'type'})

#### Value Counts

In [21]:
# Row count per class label in the balanced frame.
urls_df['type'].value_counts()

type
benign        23645
defacement    23645
malware       23645
phishing      23645
Name: count, dtype: int64

#### Save Dataset

In [23]:
# Persist the balanced dataset. index=False keeps the positional row index
# out of the file, so reading it back yields only the 'url' and 'type'
# columns rather than an extra unnamed index column.
urls_df.to_csv('urls.csv', index=False)

#### Column Standardization: url

In [None]:
import re
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
common_url_words = ['com', 'in', 'http', 'https', 'html', 'www', 'php', 'org', 'css', 'js', 'htm', 'xml', 'py', 'java', 'rb', 'cs', 'json', 'sql']

# Lookup structures built once instead of on every call: a set gives
# constant-time membership tests, and the translation table replaces all
# punctuation in a single pass over the text.
_IGNORED_WORDS = frozenset(stop_words).union(common_url_words)
_PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))

def Refiner(text):
    """Return *text* reduced to its informative words, joined by single spaces.

    Every character in ``string.punctuation`` is replaced with a space, the
    result is tokenized with ``word_tokenize``, and tokens that are English
    stop words or common URL boilerplate (``common_url_words``) are dropped.

    The filter compares tokens case-insensitively, so 'WWW' or 'The' are
    removed just like 'www' or 'the'. Kept tokens retain their original case.
    """
    words = word_tokenize(text.translate(_PUNCT_TO_SPACE))
    return ' '.join(word for word in words if word.lower() not in _IGNORED_WORDS)

In [None]:
# Replace every URL with its cleaned, space-separated word form.
urls_df['url'] = urls_df['url'].map(Refiner)

In [None]:
# Append one trailing space to every cleaned URL.
urls_df['url'] = urls_df['url'] + ' '

In [None]:
# Display the frame after the 'url' column was cleaned.
urls_df

## EDA

In [None]:
from wordcloud import WordCloud

# Word-cloud generator producing a 400x400 image, configured with
# include_numbers=False.
wc = WordCloud(width=400, height=400, include_numbers=False)

In [None]:
# Concatenate all cleaned URLs into one string for the word cloud. str.join
# builds the result in a single pass (Series.sum() concatenates pairwise) and
# yields '' rather than 0 when the column is empty. Each entry already ends
# with a space, so words from neighbouring rows do not run together.
text = ''.join(urls_df['url'])

In [None]:
# Render the word cloud built from the concatenated URL text and show it
# without axis ticks.
word_cloud_image = wc.generate(text)
plt.imshow(word_cloud_image)
plt.title('Word Cloud of URLs')
plt.xticks(ticks=[])
plt.yticks(ticks=[])
plt.show()

In [None]:
# Bar chart of the number of rows per URL type.
ax = sns.countplot(data=urls_df, x='type')
ax.set_title('Counts of URLs type')
plt.show()

In [None]:
# Map each URL type to its row count.
col_dict = dict(urls_df['type'].value_counts())

# Dict views are not sequences, and matplotlib converts `x` to a float array;
# materialize both views as lists so the conversion succeeds.
plt.pie(
    x=list(col_dict.values()),
    labels=list(col_dict.keys()),
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Pie Chart of URL type')
plt.show()

## Data Splitting

In [None]:
# Feature (cleaned URL text) and target (URL type) for modelling.
X, Y = urls_df['url'], urls_df['type']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

## Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# One row per candidate model: the per-fold scores and their mean.
cross_val_df = pd.DataFrame(columns=['CV_scores', 'Mean_Accuracy'])

# Bag-of-words features feeding a logistic-regression classifier.
log_reg = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('log_reg', LogisticRegression()),
    ]
)

# Score the pipeline on each of 5 cross-validation folds.
# NOTE(review): this runs on the full X, Y, which includes the rows held out
# as the test split — consider cross-validating on X_train, y_train instead.
scores = cross_val_score(estimator=log_reg, X=X, y=Y, cv=5)

cross_val_df.loc['log_reg', 'CV_scores'] = scores
cross_val_df.loc['log_reg', 'Mean_Accuracy'] = np.mean(scores)

In [None]:
# Bag-of-words features feeding a k-nearest-neighbours classifier.
knn = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('knn', KNeighborsClassifier()),
    ]
)

# Score the pipeline on each of 5 cross-validation folds.
scores = cross_val_score(estimator=knn, X=X, y=Y, cv=5)

cross_val_df.loc['knn', 'CV_scores'] = scores
cross_val_df.loc['knn', 'Mean_Accuracy'] = np.mean(scores)

In [None]:
# Bag-of-words features feeding a decision-tree classifier.
dec_tree = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('dec_tree', DecisionTreeClassifier()),
    ]
)

# Score the pipeline on each of 5 cross-validation folds.
scores = cross_val_score(estimator=dec_tree, X=X, y=Y, cv=5)

cross_val_df.loc['dec_tree', 'CV_scores'] = scores
cross_val_df.loc['dec_tree', 'Mean_Accuracy'] = np.mean(scores)

In [None]:
# Bag-of-words features feeding a multinomial naive Bayes classifier.
mnb = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('mnb', MultinomialNB()),
    ]
)

# Score the pipeline on each of 5 cross-validation folds.
scores = cross_val_score(estimator=mnb, X=X, y=Y, cv=5)

cross_val_df.loc['mnb', 'CV_scores'] = scores
cross_val_df.loc['mnb', 'Mean_Accuracy'] = np.mean(scores)

In [None]:
# Rank the candidate models by mean cross-validation score, ascending, so the
# best model is the last row.
cross_val_df.sort_values(by = 'Mean_Accuracy')

## Model Building

In [None]:
# Final model: bag-of-words features followed by logistic regression,
# fitted on the training split only.
pipeline_steps = [
    ('cv', CountVectorizer()),
    ('log_reg', LogisticRegression()),
]
model = Pipeline(steps=pipeline_steps)
model.fit(X_train, y_train)

## Model Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score

# Predict the held-out rows once and reuse the predictions for both metrics.
y_pred = model.predict(X_test)

# With one label per sample, micro-averaged precision and recall both equal
# accuracy, so they would just repeat the first line. Macro averaging takes
# the unweighted mean of the per-class scores and therefore shows whether any
# class is handled worse than the others.
print(f'Accuracy Score: {model.score(X_test, y_test)}')
print(f'Precision Score: {precision_score(y_test, y_pred, average="macro")}')
print(f'Recall Score: {recall_score(y_test, y_pred, average="macro")}')

## Predictions

In [None]:
# Draw five random rows from the raw (uncleaned) dataset.
URLs_df.sample(5)

In [None]:
# Clean the raw URL with the same Refiner that was applied to the training
# data, so the vectorizer sees tokens split the same way as during fitting
# (e.g. words joined by '_' are separated).
model.predict([Refiner('ottawadancehall.com/montrealdancehall/newsback...	')])

In [None]:
# Clean the raw URL with the same Refiner that was applied to the training
# data, so the vectorizer sees tokens split the same way as during fitting.
model.predict([Refiner('http://155.138.234.103/mips')])

In [None]:
# Clean the raw URL with the same Refiner that was applied to the training
# data, so the vectorizer sees tokens split the same way as during fitting.
model.predict([Refiner('fanbase.com/Eric-Fernsten')])