In [1]:
import numpy as np
from multiprocessing import Pool
from functools import partial
from contextlib import contextmanager

from pyspark.sql import SparkSession 
from pyspark.sql.types import *

In [2]:
gazze_file = "2018_Gaz_counties_national.csv"
threat_loc_file = "checkpoint_threat.csv"
shodan_file = "stream-2000.csv"

spss = SparkSession.builder\
    .master('local[*]')\
    .appName("sec_lab_4")\
    .getOrCreate()

ga_sppd = spss.read.csv(gazze_file, inferSchema=True, header=True)
threat_sppd = spss.read.csv(threat_loc_file, inferSchema=True, header=True)
shodan_sppd = spss.read.csv(shodan_file, inferSchema=True, header=True)

ga_sppd.printSchema()
threat_sppd.printSchema()
shodan_sppd.printSchema()

root
 |-- USPS: string (nullable = true)
 |-- GEOID: integer (nullable = true)
 |-- ANSICODE: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- ALAND: long (nullable = true)
 |-- AWATER: long (nullable = true)
 |-- ALAND_SQMI: double (nullable = true)
 |-- AWATER_SQMI: double (nullable = true)
 |-- INTPTLAT: double (nullable = true)
 |-- INTPTLONG: double (nullable = true)

root
 |-- threat_id: integer (nullable = true)
 |-- src_city: string (nullable = true)
 |-- dst_city: string (nullable = true)
 |-- src_state: string (nullable = true)
 |-- dst_state: string (nullable = true)
 |-- src_country: string (nullable = true)
 |-- dst_country: string (nullable = true)
 |-- src_longitude: double (nullable = true)
 |-- dst_longitude: double (nullable = true)
 |-- src_latitude: double (nullable = true)
 |-- dst_latitude: double (nullable = true)
 |-- atk_time: integer (nullable = true)
 |-- atk_type: string (nullable = true)
 |-- atk_name: string (nullable = true)

root
 |-- i

In [3]:
#drop useless column and fill na
threat_sppd = threat_sppd.drop('atk_type')
threat_sppd = threat_sppd.fillna('na')


In [4]:
threat_sppd.filter(threat_sppd.dst_latitude.isNull()).count()
threat_sppd.filter(threat_sppd.dst_longitude.isNull()).count()

0

In [5]:
shodan_sppd = shodan_sppd.filter(shodan_sppd.ip.isNotNull())
shodan_sppd = shodan_sppd.filter(shodan_sppd.longtitude.isNotNull())
shodan_sppd = shodan_sppd.filter(shodan_sppd.latitude.isNotNull())
shodan_sppd.count()

1394

In [6]:
def distance(instance1, instance2):
    # just in case, if the instances are lists or tuples:
    instance1 = np.array(instance1) 
    instance2 = np.array(instance2)
    error = np.square(instance1 - instance2)
    return np.sqrt(np.sum(error))

def get_neighbors(test_instance,
                  training_set,
                  distance=distance):
    """
    get_neighors calculates a list of the k nearest neighbors
    of an instance 'test_instance'.
    The list neighbors contains 3-tuples with  
    (index, dist, label)
    where 
    index    is the index from the training_set, 
    dist     is the distance between the test_instance and the 
             instance training_set[index]
    distance is a reference to a function used to calculate the 
             distances
    """
    k=1
    distances = []
    for index in range(len(training_set)):
        dist = distance(test_instance, training_set[index, 1:])
        distances.append((training_set[index], dist))
    distances.sort(key=lambda x: x[1])
    neighbors = distances[:k]
    return neighbors

@contextmanager
def poolcontext(*args, **kwargs):
    pool = Pool(*args, **kwargs)
    yield pool
    pool.terminate()

In [7]:
threat_pd = threat_sppd.toPandas()
threat_pd = threat_pd.loc[threat_pd['dst_country'] == 'United States']

shodan_pd = shodan_sppd.toPandas()

train = np.array(shodan_pd[['id', 'latitude', 'longtitude']])
test = np.array(threat_pd[['dst_latitude', 'dst_longitude']])


#the get neighbors will return in index, latitude, longitude;
#create
with poolcontext(processes=4) as pool:
    results = pool.map(partial(get_neighbors, training_set = train), test)

print(len(results))
    

14143


In [8]:
def get_shodan_threat(oneResult):
    return oneResult[0][0][0]

with poolcontext(processes=4) as pool:
    shodan_threat_index = pool.map(get_shodan_threat, results)

shodan_threat_banner = shodan_pd[shodan_pd['id'].isin(shodan_threat_index)]
shodan_threat_banner.to_csv('shodan_threat_banner.csv')
print(shodan_threat_banner.shape)

(109, 21)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.externals import joblib
from sklearn.cross_validation import cross_val_score
from sklearn import preprocessing
import pandas


shodan_not_threat_pd = shodan_pd[~shodan_pd['id'].isin(shodan_threat_index)]
shodan_not_threat_pd['threat'] = pandas.Series(0, index=shodan_not_threat_pd.index)
shodan_threat_banner['threat'] = pandas.Series(1, index=shodan_threat_banner.index)
shodan_threat_banner = shodan_threat_banner.append(shodan_not_threat_pd)



shodan_threat_banner_en = shodan_threat_banner[['product', 'ip_str', 'org',  
                      'transport', 'isp',
                      'country_code_3', 'postal_code', 'dma_code', 'area_code',]]

shodan_tmp = shodan_threat_banner[['port','longtitude', 'latitude', 'threat']]
le = preprocessing.LabelEncoder()
shodan_threat_banner_en = shodan_threat_banner_en.apply(le.fit_transform)
shodan_threat_banner = shodan_tmp.join(shodan_threat_banner_en)

print(shodan_threat_banner.dtypes)
shodan_threat_banner = shodan_threat_banner.sample(frac=1).reset_index(drop=True)
shodan_train_np_X = np.array(shodan_threat_banner)
shodan_train_y = np.array(shodan_threat_banner['threat'])

randF = RandomForestClassifier(n_estimators=600, max_depth=15, min_samples_leaf=2)
knnF = KNeighborsClassifier()
logReg = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
svm_clf = SVC(gamma='auto')

# shodan_threat_banner.to_csv('shodan_threat_banner_ml.csv')
print(cross_val_score(randF, shodan_train_np_X, shodan_train_y, cv=10).mean())
print(cross_val_score(knnF, shodan_train_np_X, shodan_train_y, cv=10).mean())
print(cross_val_score(logReg, shodan_train_np_X, shodan_train_y, cv=10).mean())
print(cross_val_score(svm_clf, shodan_train_np_X, shodan_train_y, cv=3).mean())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


port                int32
longtitude        float64
latitude          float64
threat              int64
product             int64
ip_str              int64
org                 int64
transport           int64
isp                 int64
country_code_3      int64
postal_code         int64
dma_code            int64
area_code           int64
dtype: object
1.0
0.9210989469294131
0.9203846612151272
0.9218094815253317


In [15]:
randF.fit(shodan_train_np_X, shodan_train_y)
knnF.fit(shodan_train_np_X, shodan_train_y)
logReg.fit(shodan_train_np_X, shodan_train_y)
svm_clf.fit(shodan_train_np_X, shodan_train_y)

randF_file = 'random_forestml.sav'
knnF_file = 'KNN_ml.sav'
logReg_file = 'logReg.sav'
svm_file = 'svm.sav'

joblib.dump(randF, randF_file)
joblib.dump(knnF, knnF_file)
joblib.dump(logReg, logReg_file)
joblib.dump(svm_clf, svm_file)
#in order to use the model
#run the following code

#model = joblib.load(filename)
#result = model.predict_proba(ENCODED_INPUT_DATA)

['logReg.sav']