In [3]:
import pandas as pd
import pandasql as pdsql
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt

def pysql(q):
    """Run the SQL query string ``q`` through pandasql and return its result.

    The notebook's module-level namespace (``globals()``) is handed to
    ``pdsql.sqldf`` as the environment, so queries can refer to top-level
    DataFrames such as ``data`` by their variable name.
    """
    return pdsql.sqldf(q, globals())


# Load the gzip-compressed CSV export (path is relative to the working directory).
data = pd.read_csv('data/amico-export.csv.gz', compression='gzip')
# Parse the 'date' and 'vt_query' columns into pandas datetime values.
data['date'] = pd.to_datetime(data['date'])
data['vt_query'] = pd.to_datetime(data['vt_query'])




In [5]:
# Select every row of `data` whose server is 151.139.242.2 and whose type is
# 'DMG', ordered by date. The result is kept in `adware` for the cells below.
sql_query = """
    SELECT *
    FROM data 
    WHERE server = '151.139.242.2'
        AND type = 'DMG'
    ORDER BY date;
    """

adware = pysql(sql_query)

# Single-argument print(...) with %-formatting produces the same text under
# the Python 2 print statement and the Python 3 print function.
print("Number of malicious downloads = %s" % (len(adware),))
print("MIN(date) = %s" % (min(adware['date']),))
print("MAX(date) = %s" % (max(adware['date']),))

 Number of malicious downloads = 384
MIN(date) = 2017-08-17 00:00:00.000000
MAX(date) = 2017-11-30 00:00:00.000000


In [6]:
# Per-host summary of `adware`: number of rows, number of distinct md5 values
# and number of distinct server IPs, sorted by downloads (then md5s) descending.
sql_query = """
    SELECT host,
           COUNT(*) AS downloads, 
           COUNT(DISTINCT md5) AS md5s,
           COUNT(DISTINCT server) AS server_IPs
    FROM adware 
    GROUP BY host
    ORDER BY downloads DESC, md5s DESC;
    """

adware_stats = pysql(sql_query)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Number of host names = %s" % (len(adware_stats),))
# Show at most the first 20 hosts; the bare expression renders the table.
adware_stats.head(20)

Number of host names = 2


Unnamed: 0,host,downloads,md5s,server_IPs
0,com.klivirtled.www,341,9,1
1,com.ybuwyn.www,43,5,1


In [7]:
# Per-server summary of `adware`: number of rows, number of distinct md5 values
# and number of distinct host names, sorted by downloads (then md5s) descending.
# `adware` was already filtered to a single server, so one row is expected.
sql_query = """
    SELECT server,
           COUNT(*) AS downloads, 
           COUNT(DISTINCT md5) AS md5s,
           COUNT(DISTINCT host) AS hosts
    FROM adware 
    GROUP BY server
    ORDER BY downloads DESC, md5s DESC;
    """

adware_stats = pysql(sql_query)
# Bare last expression so the notebook renders the result table.
adware_stats

Unnamed: 0,server,downloads,md5s,hosts
0,151.139.242.2,384,14,2


In [8]:
# Break `adware` down by its max_tavs value: rows per value, distinct md5s,
# and the average and maximum score, highest max_tavs first.
# The two score aggregates carry no alias, so their result columns are named
# after the expressions themselves ("AVG(score)", "MAX(score)").
# Rows with a NULL max_tavs form their own group.
# NOTE(review): max_tavs presumably counts detections by some subset of AV
# engines -- confirm against the data dictionary.
sql_query = """
    SELECT max_tavs AS tavs, 
           COUNT(*) AS downloads,
           COUNT(DISTINCT md5) AS md5s,
           AVG(score), MAX(score)
    FROM adware 
    GROUP BY max_tavs
    ORDER BY max_tavs DESC;
    """

adware_stats = pysql(sql_query)
# Bare last expression so the notebook renders the result table.
adware_stats

Unnamed: 0,tavs,downloads,md5s,AVG(score),MAX(score)
0,7.0,16,1,0.889375,0.9
1,6.0,2,1,0.8965,0.9
2,5.0,120,2,0.773233,0.875
3,4.0,171,5,0.765702,0.893
4,2.0,73,4,0.750548,0.899
5,,2,1,0.763,0.763


In [9]:
# Same breakdown as the max_tavs cell, but grouped by max_avs: rows per value,
# distinct md5s, and the average and maximum score, highest max_avs first.
# The score aggregates are again unaliased, and NULL max_avs rows form their
# own group.
sql_query = """
    SELECT max_avs AS avs, 
           COUNT(*) AS downloads,
           COUNT(DISTINCT md5) AS md5s,
           AVG(score), MAX(score)
    FROM adware 
    GROUP BY max_avs
    ORDER BY max_avs DESC;
    """

adware_stats = pysql(sql_query)
# Bare last expression so the notebook renders the result table.
adware_stats

Unnamed: 0,avs,downloads,md5s,AVG(score),MAX(score)
0,23.0,16,1,0.889375,0.9
1,17.0,109,2,0.76545,0.9
2,13.0,1,1,0.814,0.814
3,12.0,13,1,0.857462,0.875
4,11.0,146,1,0.766226,0.847
5,7.0,11,1,0.743,0.743
6,6.0,20,3,0.8093,0.899
7,5.0,66,3,0.737621,0.895
8,,2,1,0.763,0.763


In [10]:
# List every `adware` row whose max_tavs is exactly 0, ordered by date.
# Rows where max_tavs is NULL are NOT matched by `= 0`; the next cell looks at
# those separately with IS NULL.
sql_query = """
    SELECT *
    FROM adware 
    WHERE max_tavs = 0
    ORDER BY date;
    """

adware_stats = pysql(sql_query)
# Bare last expression so the notebook renders the (possibly empty) table.
adware_stats

Unnamed: 0,dump_id,date,md5,host,server,type,max_tavs,max_avs,score,vt_query


In [11]:
# For `adware` rows with no max_tavs value (NULL), count how many rows each
# md5 accounts for, most frequent first.
sql_query = """
    SELECT md5, COUNT(*) AS count
    FROM adware 
    WHERE max_tavs IS NULL
    GROUP BY md5
    ORDER BY count DESC
    """

adware_stats = pysql(sql_query)
# Display only the first 10 rows of the result.
adware_stats[:10]

Unnamed: 0,md5,count
0,a930ced2222e503f6b7249acc0c0dfab,2


In [12]:
# Among `adware` rows with a NULL max_tavs, count the rows (and the distinct
# md5s) whose score is strictly greater than 0.5.
# NOTE(review): the "detected_*" aliases suggest 0.5 is the classifier's
# detection threshold -- confirm; the value is hard-coded here.
sql_query = """
    SELECT COUNT(*) AS detected_downloads, 
           COUNT(DISTINCT md5) AS detected_md5s
    FROM adware 
    WHERE max_tavs IS NULL
        AND score > 0.5
    """

adware_stats = pysql(sql_query)
# Bare last expression so the notebook renders the one-row result.
adware_stats

Unnamed: 0,detected_downloads,detected_md5s
0,2,1
