In [1]:
import requests
import logging
import base64
import time



In [2]:
logging.basicConfig(level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

class Server(object):
    url = 'https://mlb.praetorian.com'
    log = logging.getLogger(__name__)

    def __init__(self):
        self.session = requests.session()
        self.binary  = None
        self.hash    = None
        self.wins    = 0
        self.targets = []

    def _request(self, route, method='get', data=None):
        while True:
            try:
                if method == 'get':
                    r = self.session.get(self.url + route)
                else:
                    r = self.session.post(self.url + route, data=data)
                if r.status_code == 429:
                    raise Exception('Rate Limit Exception')
                if r.status_code == 500:
                    raise Exception('Unknown Server Exception')

                return r.json()
            except Exception as e:
                self.log.error(e)
                self.log.info('Waiting 60 seconds before next request')
                time.sleep(60)

    def get(self):
        r = self._request("/challenge")
        self.targets = r.get('target', [])
        self.binary  = base64.b64decode(r.get('binary', ''))
        return r

    def post(self, target):
        r = self._request("/solve", method="post", data={"target": target})
        self.wins = r.get('correct', 0)
        self.hash = r.get('hash', self.hash)
        self.ans  = r.get('target', 'unknown')
        return r


In [5]:
import pandas as pd
df2=pd.read_csv('praetorian_train.csv')
hex_train = df2.iloc[:,0]   #get data from praetorian_train.csv which has 13000 rows hex_values,target_values
target_train = df2.iloc[:,1]



In [10]:

from sklearn.feature_extraction.text import CountVectorizer

#performs frequency analysis on the input data and constructs a vector of frequencies for each input
vec_opts = {
    "ngram_range": (1, 4),  # allow n-grams of 1-4 words in length (32-bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "min_df": 0.001,        # value selected after trial and error for more features 
}
v = CountVectorizer(**vec_opts)
X = v.fit_transform(hex_train, target_train)

for feature, freq in zip(v.inverse_transform(X)[0], X.A[0]):
    print("'%s' : %s" % (feature, freq))

'b'' : 8
'15' : 1
'40' : 0
'00' : 0
'a6' : 0
'a1' : 0
'a0' : 0
'a4' : 0
'20' : 0
'58' : 0
'17' : 0
'48' : 0
'27' : 0
'14' : 0
'b5' : 0
'ab' : 0
'93' : 0
'b3' : 0
'c6' : 0
'a8' : 0
'07' : 0
'b8' : 0
'2d' : 0
'0a' : 0
'3d' : 0
'0b' : 0
'1d' : 0
'f0' : 0
'36' : 0
'61' : 0
'7d' : 0
'01' : 0
'29' : 0
'39' : 0
'49' : 0
'28' : 0
'96' : 0
'b2' : 0
'08' : 0
'b6' : 0
'c2' : 0
'02' : 0
'38' : 0
'b' 15' : 0
'15 40' : 0
'40 00' : 0
'00 a6' : 0
'a1 a0' : 0
'a4 20' : 0
'20 58' : 0
'58 17' : 0
'17 48' : 0
'48 27' : 0
'27 00' : 0
'00 14' : 0
'14 40' : 0
'00 b5' : 0
'a1 20' : 0
'93 20' : 0
'c6 00' : 0
'00 00' : 0
'00 a8' : 0
'a8 07' : 0
'07 b8' : 0
'b8 17' : 0
'17 2d' : 0
'2d 0a' : 0
'0a 3d' : 0
'3d 0b' : 0
'0b 1d' : 0
'1d f0' : 0
'f0 00' : 0
'00 36' : 0
'36 61' : 0
'61 00' : 0
'00 7d' : 0
'7d 01' : 0
'01 29' : 0
'29 07' : 0
'07 39' : 0
'39 17' : 0
'17 49' : 0
'49 27' : 0
'27 28' : 0
'28 07' : 0
'b2 08' : 0
'08 28' : 0
'28 27' : 0
'c2 02' : 0
'02 c6' : 0
'c6 20' : 0
'20 00' : 0
'00 38' : 0
'38 27' : 0
'

In [11]:

from sklearn.feature_extraction.text import TfidfTransformer
#normalize our data with respect to the corpus of documents using tf–idf
idf_opts = {"use_idf": True}
idf = TfidfTransformer(**idf_opts)

# perform the idf transform
X = idf.fit_transform(X)
print(X)

  (0, 19446)	0.09946234178273985
  (0, 19444)	0.09362667683182974
  (0, 19440)	0.06947740546389643
  (0, 19439)	0.0396111328033781
  (0, 17741)	0.0685988134431349
  (0, 17714)	0.09025089875793449
  (0, 17713)	0.06675628308871402
  (0, 17712)	0.0934250570903147
  (0, 17533)	0.10398051248844804
  (0, 17511)	0.043723354381649614
  (0, 17041)	0.09531522320454583
  (0, 17026)	0.05454043952897524
  (0, 16969)	0.05318503897533165
  (0, 16963)	0.0705365801595092
  (0, 16943)	0.06311965556328496
  (0, 16930)	0.10466347550758491
  (0, 16927)	0.05755234098318039
  (0, 16533)	0.10693840982367463
  (0, 16403)	0.013997965179316906
  (0, 16174)	0.07627352696349489
  (0, 16093)	0.10332932692944954
  (0, 16092)	0.10332932692944954
  (0, 16091)	0.09765364046472348
  (0, 16087)	0.05457948548226828
  (0, 15956)	0.05584732059987822
  :	:
  (12999, 4284)	0.05183618592920208
  (12999, 4256)	0.08288055299986398
  (12999, 4255)	0.07713348822199993
  (12999, 3982)	0.023276157313275765
  (12999, 3939)	0.08348095

In [12]:
from sklearn.pipeline import Pipeline
#chain transforms together in our pipeline
pipeline = Pipeline([
    ('vec',   CountVectorizer(**vec_opts)),
    ('idf',  TfidfTransformer(**idf_opts)),
])

X = pipeline.fit_transform(hex_train, target_train)
print(X)

  (0, 19446)	0.09946234178273985
  (0, 19444)	0.09362667683182974
  (0, 19440)	0.06947740546389643
  (0, 19439)	0.0396111328033781
  (0, 17741)	0.0685988134431349
  (0, 17714)	0.09025089875793449
  (0, 17713)	0.06675628308871402
  (0, 17712)	0.0934250570903147
  (0, 17533)	0.10398051248844804
  (0, 17511)	0.043723354381649614
  (0, 17041)	0.09531522320454583
  (0, 17026)	0.05454043952897524
  (0, 16969)	0.05318503897533165
  (0, 16963)	0.0705365801595092
  (0, 16943)	0.06311965556328496
  (0, 16930)	0.10466347550758491
  (0, 16927)	0.05755234098318039
  (0, 16533)	0.10693840982367463
  (0, 16403)	0.013997965179316906
  (0, 16174)	0.07627352696349489
  (0, 16093)	0.10332932692944954
  (0, 16092)	0.10332932692944954
  (0, 16091)	0.09765364046472348
  (0, 16087)	0.05457948548226828
  (0, 15956)	0.05584732059987822
  :	:
  (12999, 4284)	0.05183618592920208
  (12999, 4256)	0.08288055299986398
  (12999, 4255)	0.07713348822199993
  (12999, 3982)	0.023276157313275765
  (12999, 3939)	0.08348095

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
#here we use multinomial naive bayes classifier to get predictions as it is suitable for classification with discrete features as well as fractions from TF-IDF 
clf = MultinomialNB().fit(X,target_train)
predictions = clf.predict(X)
print(predictions)

['xtensa' 'x86_64' 'powerpc' ... 'sparc' 'xtensa' 'sparc']


In [14]:
from sklearn.metrics import classification_report
#here we calculate multi-class classification to get related evaluation metrics
print (classification_report(target_train, predictions))

              precision    recall  f1-score   support

   alphaev56       1.00      0.99      1.00      1059
         arm       1.00      0.99      0.99      1077
         avr       1.00      1.00      1.00      1096
        m68k       1.00      0.99      0.99      1065
        mips       1.00      0.99      0.99      1068
      mipsel       0.99      0.99      0.99      1095
     powerpc       1.00      0.99      1.00      1124
        s390       0.99      1.00      1.00      1115
         sh4       1.00      0.99      0.99      1077
       sparc       1.00      1.00      1.00      1068
      x86_64       0.95      1.00      0.97      1076
      xtensa       1.00      0.99      1.00      1080

    accuracy                           0.99     13000
   macro avg       0.99      0.99      0.99     13000
weighted avg       0.99      0.99      0.99     13000



In [47]:
  '''
        #Create training dataset
        data_train=[]
        target_train=[]
        hex_train=[]
        for _ in range(13000):
            # query the /challenge endpoint
            input_dict=s.get()

            # choose a random target and /solve
            target = random.choice(s.targets)
            output_arch=s.post(target)

            data_train.append(binascii.a2b_base64(input_dict.get('binary')))  #get data from server and convert data to binary format for training
          
            target_train.append(output_arch.get('target')) #store target architecture for training      

        hex_train = [binascii.hexlify(e) for e in data_train]  
        df1 = pd.DataFrame({'hex_values':hex_train,'target_values':target_train})  
        df1.to_csv('/content/sample_data/praetorian_train.csv', index=False, header=True)
    '''

if __name__ == "__main__":
    import random,binascii
    import numpy as np

    # create the server object
    s = Server()

    for _ in range(600):
        
        input_dict=s.get()

        data_test=binascii.a2b_base64(input_dict.get('binary')) 
  
        hex_test=binascii.hexlify(s.binary) #convert data from binary to hex format for testing


        x_test = pipeline.transform([hex_test]) #call our pipeline with TF-IDF and Countvectorizer
  
        target = clf.predict(x_test)  
           
        target_to_string=target[0] 

        s.post(target_to_string)

        s.log.info("Guess:[{: >9}]   Answer:[{: >9}]   Wins:[{: >3}]".format(target_to_string, s.ans, s.wins))

        # 500 consecutive correct answers are required to win
        if s.hash:
            s.log.info("You win! {}".format(s.hash))
            


2020-10-17 06:40:30,240 - __main__ - INFO - Guess:[     s390]   Answer:[     s390]   Wins:[  1]
2020-10-17 06:40:30,448 - __main__ - INFO - Guess:[   mipsel]   Answer:[   mipsel]   Wins:[  2]
2020-10-17 06:40:30,649 - __main__ - INFO - Guess:[   xtensa]   Answer:[   xtensa]   Wins:[  3]
2020-10-17 06:40:30,779 - __main__ - INFO - Guess:[   mipsel]   Answer:[   mipsel]   Wins:[  4]
2020-10-17 06:40:30,886 - __main__ - INFO - Guess:[alphaev56]   Answer:[alphaev56]   Wins:[  5]
2020-10-17 06:40:31,011 - __main__ - INFO - Guess:[   mipsel]   Answer:[   mipsel]   Wins:[  6]
2020-10-17 06:40:31,240 - __main__ - INFO - Guess:[     m68k]   Answer:[     m68k]   Wins:[  7]
2020-10-17 06:40:31,421 - __main__ - INFO - Guess:[alphaev56]   Answer:[alphaev56]   Wins:[  8]
2020-10-17 06:40:31,556 - __main__ - INFO - Guess:[   mipsel]   Answer:[   mipsel]   Wins:[  9]
2020-10-17 06:40:31,680 - __main__ - INFO - Guess:[alphaev56]   Answer:[alphaev56]   Wins:[ 10]
2020-10-17 06:40:31,869 - __main__ - INF