# Website Scam Prevention Using AI

## Import Necessary Libraries

In [20]:
import pandas as pd
import re
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer

## Read and Display Data

In [21]:
data = pd.read_csv(r"C:\Users\natie\OneDrive\Documents\GitHub\Website-Scam-Prevention-AI\tech_support_scams_dataset.csv")

data.head()

             UID                          Domain  \
0  5ec26c3f7e42e          europian-hot-videos.ml   
1  5ec29cc7edd6c       europian-erotic-videos.ml   
2  5ec22e0841504  micro-deskto.azurewebsites.net   
3  5ec16019ea2a3                 configure056.ga   
4  5ec2ed981c104                  wincaphost.xyz   

                                                 Url            Number  \
0   https://europian-hot-videos.ml/sol/ed/index.html  +49-800-505-2852   
1    https://europian-erotic-videos.ml/ed/index.html  +49-800-505-2852   
2  http://micro-deskto.azurewebsites.net/0EDhdfgd...   +1-844-288-8665   
3  https://configure056.ga/WIn10010_jnh0101.nnb/X...   +1-844-223-6833   
4  https://wincaphost.xyz/freezus/DF10010011010FI...   +1-855-303-5343   

                    Host        Country        City              Ip       ASN  \
0     WEBSITEWELCOME.COM  United States     Houston  108.167.146.36  AS46606    
1     WEBSITEWELCOME.COM  United States     Houston  108.167.146.36  AS46606    

In [22]:
print(data.shape)
print()
print(data.columns)

(11375, 14)

Index(['UID', 'Domain', 'Url', 'Number', 'Host', 'Country', 'City', 'Ip',
       'ASN', 'Latitude', 'Longitude', 'Hash', 'Date', 'Mail'],
      dtype='object')


In [23]:
print(data.info)

<bound method DataFrame.info of                  UID                          Domain  \
0      5ec26c3f7e42e          europian-hot-videos.ml   
1      5ec29cc7edd6c       europian-erotic-videos.ml   
2      5ec22e0841504  micro-deskto.azurewebsites.net   
3      5ec16019ea2a3                 configure056.ga   
4      5ec2ed981c104                  wincaphost.xyz   
...              ...                             ...   
11370  60e9f7b0d6d23               453inheritance.ga   
11371  60e9f85c0e179                     hkx-ikg.xyz   
11372  60ea0e15bb7e1                033instantive.ml   
11373  60ea0f8c65edb    d7b0johdl2v8y.cloudfront.net   
11374  60ea2064cca10                    922lagged.ga   

                                                     Url            Number  \
0       https://europian-hot-videos.ml/sol/ed/index.html  +49-800-505-2852   
1        https://europian-erotic-videos.ml/ed/index.html  +49-800-505-2852   
2      http://micro-deskto.azurewebsites.net/0EDhdfgd...   +1

## Feature Extraction

In [24]:
def extract_features(url):
    features = {}
    
    # Length of URL
    features['url_length'] = len(url)
    
    # Count of special characters
    features['count_-'] = url.count('-')
    features['count_@'] = url.count('@')
    features['count_.'] = url.count('.')
    features['count_?'] = url.count('?')
    
    # Presence of suspicious words
    suspicious_words = ['login', 'secure', 'bank', 'account', 'update', 'free', 'password', 'hot', 'erotic', 'sexy', 'seduction']
    features['suspicious_word_count'] = sum(word in url for word in suspicious_words)
    
    # Domain-based features
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    features['domain_length'] = len(domain)
    features['domain_count_dots'] = domain.count('.')
    
    return features

# Apply feature extraction
data_features = data['Url'].apply(extract_features)
data_features = pd.json_normalize(data_features)

# Combine the features with the original dataframe
data = pd.concat([data, data_features], axis=1)
print(data.head())


             UID                          Domain  \
0  5ec26c3f7e42e          europian-hot-videos.ml   
1  5ec29cc7edd6c       europian-erotic-videos.ml   
2  5ec22e0841504  micro-deskto.azurewebsites.net   
3  5ec16019ea2a3                 configure056.ga   
4  5ec2ed981c104                  wincaphost.xyz   

                                                 Url            Number  \
0   https://europian-hot-videos.ml/sol/ed/index.html  +49-800-505-2852   
1    https://europian-erotic-videos.ml/ed/index.html  +49-800-505-2852   
2  http://micro-deskto.azurewebsites.net/0EDhdfgd...   +1-844-288-8665   
3  https://configure056.ga/WIn10010_jnh0101.nnb/X...   +1-844-223-6833   
4  https://wincaphost.xyz/freezus/DF10010011010FI...   +1-855-303-5343   

                    Host        Country        City              Ip       ASN  \
0     WEBSITEWELCOME.COM  United States     Houston  108.167.146.36  AS46606    
1     WEBSITEWELCOME.COM  United States     Houston  108.167.146.36  AS46606    