# Website Scam Prevention Using AI

## Import Necessary Libraries

In [20]:
import pandas as pd
import re
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer

## Read and Display Data

In [25]:
data = pd.read_csv(r"C:\Users\natie\OneDrive\Documents\GitHub\Website-Scam-Prevention-AI\tech_support_scams_dataset.csv")

data.head()

Unnamed: 0,UID,Domain,Url,Number,Host,Country,City,Ip,ASN,Latitude,Longitude,Hash,Date,Mail
0,5ec26c3f7e42e,europian-hot-videos.ml,https://europian-hot-videos.ml/sol/ed/index.html,+49-800-505-2852,WEBSITEWELCOME.COM,United States,Houston,108.167.146.36,AS46606,30,-95.4641,8b0ded62e108f41a193bc245b3c62567,2020-05-18 13:07:28,
1,5ec29cc7edd6c,europian-erotic-videos.ml,https://europian-erotic-videos.ml/ed/index.html,+49-800-505-2852,WEBSITEWELCOME.COM,United States,Houston,108.167.146.36,AS46606,30,-95.4641,8b0ded62e108f41a193bc245b3c62567,2020-05-18 16:34:32,
2,5ec22e0841504,micro-deskto.azurewebsites.net,http://micro-deskto.azurewebsites.net/0EDhdfgd...,+1-844-288-8665,Microsoft Corporation,United States,Des Moines,104.43.221.31,AS8075,42,-93.6208,e037e490b00f6be85ee60b983678fc6b,2020-05-18 08:41:30,
3,5ec16019ea2a3,configure056.ga,https://configure056.ga/WIn10010_jnh0101.nnb/X...,+1-844-223-6833,"Cloudflare, Inc.",United States,Ashburn,104.27.150.38,AS13335,39,-77.4874,f54f22a9dd19a680b412d539d5429efc,2020-05-17 18:02:54,
4,5ec2ed981c104,wincaphost.xyz,https://wincaphost.xyz/freezus/DF10010011010FI...,+1-855-303-5343,"Cloudflare, Inc.",United States,Ashburn,104.28.2.167,AS13335,39,-77.4874,35fce0a355796cc902a6712c9f833a78,2020-05-18 22:19:00,


In [22]:
print(data.shape)
print()
print(data.columns)

(11375, 14)

Index(['UID', 'Domain', 'Url', 'Number', 'Host', 'Country', 'City', 'Ip',
       'ASN', 'Latitude', 'Longitude', 'Hash', 'Date', 'Mail'],
      dtype='object')


In [23]:
print(data.info)

<bound method DataFrame.info of                  UID                          Domain  \
0      5ec26c3f7e42e          europian-hot-videos.ml   
1      5ec29cc7edd6c       europian-erotic-videos.ml   
2      5ec22e0841504  micro-deskto.azurewebsites.net   
3      5ec16019ea2a3                 configure056.ga   
4      5ec2ed981c104                  wincaphost.xyz   
...              ...                             ...   
11370  60e9f7b0d6d23               453inheritance.ga   
11371  60e9f85c0e179                     hkx-ikg.xyz   
11372  60ea0e15bb7e1                033instantive.ml   
11373  60ea0f8c65edb    d7b0johdl2v8y.cloudfront.net   
11374  60ea2064cca10                    922lagged.ga   

                                                     Url            Number  \
0       https://europian-hot-videos.ml/sol/ed/index.html  +49-800-505-2852   
1        https://europian-erotic-videos.ml/ed/index.html  +49-800-505-2852   
2      http://micro-deskto.azurewebsites.net/0EDhdfgd...   +1

## Feature Extraction

In [27]:
def extract_features(url):
    features = {}
    
    # Length of URL
    features['url_length'] = len(url)
    
    # Count of special characters
    features['count_-'] = url.count('-')
    features['count_@'] = url.count('@')
    features['count_.'] = url.count('.')
    features['count_?'] = url.count('?')
    
    # Presence of suspicious words
    suspicious_words = ['login', 'secure', 'bank', 'account', 'update', 'free', 'password', 'hot', 'erotic', 'sexy', 'seduction', 'kissing']
    features['suspicious_word_count'] = sum(word in url for word in suspicious_words)
    
    # Domain-based features
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    features['domain_length'] = len(domain)
    features['domain_count_dots'] = domain.count('.')
    
    return features

# Function to extract TLD from domain
def extract_tld(domain):
    return domain.split('.')[-1]

# Apply the function to extract TLD and create a new column 'TLD'
data['TLD'] = data['Domain'].apply(extract_tld)

# Define categories based on TLDs
def categorize_tld(tld):
    if tld in ['com', 'net', 'org']:
        return 'Common TLDs'
    elif tld in ['online', 'site', 'io']:
        return 'New TLDs'
    elif tld in ['ml', 'xyz', 'beauty', 'top']:
        return 'Suspicious TLDs'
    else:
        return 'Unknown TLDs'

# Apply categorization to create a new column 'TLD_Category'
data['TLD_Category'] = data['TLD'].apply(categorize_tld)

# Apply feature extraction
data_features = data['Url'].apply(extract_features)
data_features = pd.json_normalize(data_features)

# Combine the features with the original dataframe
data = pd.concat([data, data_features], axis=1)

data.head()


Unnamed: 0,UID,Domain,Url,Number,Host,Country,City,Ip,ASN,Latitude,...,domain_length,domain_count_dots,url_length,count_-,count_@,count_.,count_?,suspicious_word_count,domain_length.1,domain_count_dots.1
0,5ec26c3f7e42e,europian-hot-videos.ml,https://europian-hot-videos.ml/sol/ed/index.html,+49-800-505-2852,WEBSITEWELCOME.COM,United States,Houston,108.167.146.36,AS46606,30,...,22,1,48,2,0,2,0,1,22,1
1,5ec29cc7edd6c,europian-erotic-videos.ml,https://europian-erotic-videos.ml/ed/index.html,+49-800-505-2852,WEBSITEWELCOME.COM,United States,Houston,108.167.146.36,AS46606,30,...,25,1,47,2,0,2,0,1,25,1
2,5ec22e0841504,micro-deskto.azurewebsites.net,http://micro-deskto.azurewebsites.net/0EDhdfgd...,+1-844-288-8665,Microsoft Corporation,United States,Des Moines,104.43.221.31,AS8075,42,...,30,2,86,4,0,2,1,0,30,2
3,5ec16019ea2a3,configure056.ga,https://configure056.ga/WIn10010_jnh0101.nnb/X...,+1-844-223-6833,"Cloudflare, Inc.",United States,Ashburn,104.27.150.38,AS13335,39,...,15,1,67,0,0,2,0,0,15,1
4,5ec2ed981c104,wincaphost.xyz,https://wincaphost.xyz/freezus/DF10010011010FI...,+1-855-303-5343,"Cloudflare, Inc.",United States,Ashburn,104.28.2.167,AS13335,39,...,14,1,58,0,0,2,0,1,14,1
