### General Import

In [21]:
# general
import os
import numpy as np
import pandas as pd

# plots
from IPython.display import Image
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from IPython.display import Image

# predictions
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

### Define classes

In [3]:
class Perceptron_self(object):
    """ Perceptron classifier trained with the per-example perceptron update rule
    # param eta: float, learning rate (between 0.0 and 1.0)
    # param n_iter: int, number of passes (epochs) over the training dataset
    # param random_state: int, seed of the random number generator used to draw the initial weights

    # attribute w_: 1d-array, weights after fitting (w_[0] is the bias, w_[1:] the feature weights)
    # attribute errors_: list, number of misclassifications (updates) in each epoch
    """

    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta, self.n_iter, self.random_state = eta, n_iter, random_state

    def fit(self, X, y):
        """ Learn the weights from the training data
        # param X: array-like, shape = [n_examples, n_features]
        # param y: array-like, shape = [n_examples], target labels
        #          (predict() emits 1 / -1, so targets are presumably encoded the same way)
        # return self: object
        """

        # one weight per feature plus one bias term, drawn from N(0, 0.01)
        n_weights = 1 + X.shape[1]
        seeded_rng = np.random.RandomState(self.random_state)
        self.w_ = seeded_rng.normal(loc=0.0, scale=0.01, size=n_weights)
        self.errors_ = []

        for _epoch in range(self.n_iter):
            n_updates = 0
            for sample, label in zip(X, y):
                # the step is zero whenever the sample is already classified correctly
                step = self.eta * (label - self.predict(sample))
                self.w_[0] += step
                self.w_[1:] += step * sample
                if step != 0.0:
                    n_updates += 1
            self.errors_.append(n_updates)
        return self

    def net_input(self, X):
        """Weighted sum of the inputs plus the bias term"""
        feature_weights, bias = self.w_[1:], self.w_[0]
        return np.dot(X, feature_weights) + bias

    def predict(self, X):
        """Class label after the unit step: 1 where the net input is >= 0, else -1"""
        activation = self.net_input(X)
        return np.where(activation >= 0.0, 1, -1)

### Define functions

In [7]:
def plot_decision_regions(X, y, classifier, plot_test, resolution=0.02, test_idx=None):
    """ Plot the decision regions of a fitted classifier over two features
    # param X: 2d-array, shape = [n_examples, n_features]; only columns 0 and 1 are used
    #          (column 0 on the x-axis, column 1 on the y-axis)
    # param y: 1d-array, shape = [n_examples], class label of each row of X
    # param classifier: fitted object whose predict() takes an [n_points, 2] array and
    #                   returns one label per row (the labels are passed to contourf,
    #                   so they are assumed to be numeric)
    # param plot_test: integer, 1 if plot test sample, 0 otherwise
    # param resolution: float, step size of the mesh grid on which the classifier is evaluated
    # param test_idx: optional sequence of row indices of X forming the test set;
    #                 defaults to range(105, 150) when omitted
    # return: None
    """

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    class_labels = np.unique(y)
    cmap = ListedColormap(colors[:len(class_labels)])

    # plot the decision surface: evaluate the classifier on a grid that extends
    # one unit beyond the observed range of each of the two features
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # one scatter series per class; markers and colors wrap around so that more
    # classes than available styles no longer raise an IndexError
    for idx, cl in enumerate(class_labels):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx % len(colors)],
                    marker=markers[idx % len(markers)],
                    label=cl,
                    edgecolor='black')

    # highlight test examples
    if plot_test == 1:
        if test_idx is None:
            # historical default: rows 105..149 of X are treated as the test set
            test_idx = range(105, 150)
        X_test = X[test_idx, :]

        # c='none' draws hollow circles; the empty string c='' is rejected by
        # current matplotlib releases
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='none',
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100,
                    label='test set')


### Read data

Phishing dataset: https://archive.ics.uci.edu/static/public/967/phiusiil+phishing+url+dataset.zip

In [5]:
import pandas as pd

# Location of the zipped phishing-URL dataset on the UCI repository
url = 'https://archive.ics.uci.edu/static/public/967/phiusiil+phishing+url+dataset.zip'

# Load all columns from the remote archive, then discard "FILENAME",
# which can be ignored.
df = pd.read_csv(url, encoding='utf-8').drop(columns='FILENAME')
print('\nShape of data:', df.shape)
df.head()



Shape of data: (235795, 55)


Unnamed: 0,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,0.061933,...,0,0,1,34,20,28,119,0,124,1
1,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,0.050207,...,0,0,1,50,9,8,39,0,217,1
2,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,0.064129,...,0,0,1,10,2,7,42,2,5,1
3,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,0.057606,...,1,1,1,3,27,15,22,1,31,1
4,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,0.059441,...,1,0,1,244,15,34,72,1,85,1


# EDA Lab 1

* Describe the purpose of the data set you selected (i.e., why was this data collected in
the first place?). Describe how you would define and measure the outcomes from the
dataset. That is, why is this data important and how do you know if you have mined
useful knowledge from the dataset? How would you measure the effectiveness of a
good prediction algorithm? Be specific.

* Describe the meaning and type of data (scale, values, etc.) for each
attribute in the data file.

* Verify data quality: Explain any missing values, duplicate data, and outliers.
Are those mistakes? How do you deal with these problems? Be specific.

* Give simple, appropriate statistics (range, mode, mean, median, variance,
counts, etc.) for the most important attributes and describe what they mean or if you
found something interesting. Note: You can also use data from other sources for
comparison. Explain the significance of the statistics run and why they are meaningful.

* Visualize the most important attributes appropriately (at least 5 attributes).
Important: Provide an interpretation for each chart. Explain for each attribute why the
chosen visualization is appropriate.

* Explore relationships between attributes: Look at the attributes via scatter
plots, correlation, cross-tabulation, group-wise averages, etc. as appropriate. Explain
any interesting relationships.

In [None]:
# Restrict the frame to its int64 / float64 columns
numeric_df = df.select_dtypes(include=['int64', 'float64'])

# Pairwise Pearson correlation between the numeric columns
corr_matrix = numeric_df.corr(method='pearson')
print(corr_matrix)


In [None]:
# Cut-off on |correlation| above which a feature pair is reported
threshold = 0.7

# Boolean mask of the strict upper triangle (k=1 skips the diagonal), so each
# feature pair is kept once and self-correlations are excluded
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
strong_corr = corr_matrix.where(upper_mask)

# Long format: one row per (row feature, column feature) pair
strong_pairs = strong_corr.stack().reset_index()
strong_pairs.columns = ['Feature1', 'Feature2', 'Correlation']

# Keep the pairs above the cut-off, strongest first
strong_pairs = (
    strong_pairs
    .assign(AbsCorr=strong_pairs['Correlation'].abs())
    .loc[lambda pairs: pairs['AbsCorr'] > threshold]
    .sort_values(by='AbsCorr', ascending=False)
)
strong_pairs


*  Identify and explain interesting relationships between features and the class
you are trying to predict (i.e., relationships with variables and the target classification).

* Are there other features that could be added to the data or created from
existing features? Which ones?

* Exceptional Work -
You have free rein to provide additional analyses.
 One idea: implement dimensionality reduction, then visualize and interpret the results.