In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import glob

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load every EMBER training shard matching the pattern into one DataFrame.
# glob returns matches in arbitrary (filesystem-dependent) order, so the paths
# are sorted to make the row order of the concatenated frame reproducible.
all_files = sorted(glob.glob("../data/ember2018/tra*"))
if not all_files:
    # Fail early with an actionable message instead of pandas'
    # "No objects to concatenate" error further down.
    raise FileNotFoundError("No input files match ../data/ember2018/tra*")

# Each file is read as JSON Lines (one JSON record per line).
li = [pd.read_json(filename, lines=True) for filename in all_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)

In [3]:
def list_to_json(jsonl):
    """Collapse an iterable of per-section dicts into a single dict of columns.

    Each element of ``jsonl`` must provide the keys ``'size'``, ``'entropy'``,
    ``'vsize'`` and ``'props'`` (an iterable of strings).

    Returns a dict with:
      * ``'size'``, ``'entropy'``, ``'vsize'``: lists holding the respective
        value of every element, in input order;
      * ``'props'``: one string in which every element's props are joined by
        single spaces and followed by one trailing space.
    """
    sizes, entropies, vsizes, prop_chunks = [], [], [], []
    for section in jsonl:
        sizes.append(section['size'])
        entropies.append(section['entropy'])
        vsizes.append(section['vsize'])
        # Trailing space keeps the props of consecutive elements separated.
        prop_chunks.append(" ".join(section['props']) + " ")
    return {
        'size': sizes,
        'entropy': entropies,
        'vsize': vsizes,
        'props': "".join(prop_chunks),
    }

def remove_infrequent(string, top_entries):
    """Map a category value onto a small, clean label set.

    Parameters
    ----------
    string : str
        The raw category value.
    top_entries : container of str
        The raw values that should be kept; membership is tested on the
        unstripped ``string``.

    Returns
    -------
    str
        ``'other'`` if ``string`` is not in ``top_entries``; otherwise the
        value with surrounding whitespace stripped, or ``'[blank]'`` when
        nothing but whitespace remains.
    """
    if string not in top_entries:
        return 'other'
    cleaned = string.strip()
    # Any empty or whitespace-only name (not just "" and exactly three spaces)
    # becomes "[blank]", so the label is never an empty string.
    return cleaned if cleaned else "[blank]"
    
def explode_section(df, top_n=10):
    """Replace the nested ``section`` column of ``df`` with flat feature columns.

    Steps:
      1. Flatten ``df['section']`` into ``section.*`` columns.
      2. Collapse each row's ``section.sections`` list with ``list_to_json``.
      3. Keep the ``top_n`` most frequent ``section.entry`` values (the rest
         become ``'other'`` via ``remove_infrequent``) and one-hot encode them.
      4. Summarise the per-row ``size`` / ``entropy`` / ``vsize`` lists with
         ``explode_cols_list``.
      5. Turn the concatenated ``props`` string into token-count columns
         ``section.sections.props.<token>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``section`` column of dicts.
    top_n : int, default 10
        Number of most frequent entry values that keep their own dummy column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``section``, followed by the derived columns. The
        derived rows are attached positionally and carry ``df``'s index.
    """
    # Normalise from a plain list so the intermediate frames always have a
    # fresh 0..n-1 index, whatever index ``df`` carries.
    df_section = pd.json_normalize(df['section'].tolist()).add_prefix("section.")
    df_section['section.sections'] = df_section['section.sections'].apply(list_to_json)

    top_entries = df_section['section.entry'].value_counts().head(top_n).index.to_list()
    df_section['section.entry'] = df_section['section.entry'].apply(
        remove_infrequent, top_entries=top_entries
    )
    df_section = pd.get_dummies(
        df_section, prefix='section.entry', prefix_sep=".", columns=['section.entry']
    )

    df_sections = pd.json_normalize(df_section['section.sections'].tolist()).add_prefix(
        'section.sections.'
    )
    df_section = pd.concat(
        [df_section.drop(columns=['section.sections']), df_sections], axis=1
    )
    for stat_col in ("size", "entropy", "vsize"):
        df_section = explode_cols_list(df_section, "section.sections." + stat_col)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df_section['section.sections.props'])
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on old releases that lack get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    columns = ['section.sections.props.' + feature for feature in feature_names]
    # These are token counts, not 0/1 flags: int8 would silently wrap above
    # 127, so a wider integer type is used.
    df_props = pd.DataFrame(data=X.toarray(), columns=columns, index=df.index).astype('int32')

    df_section = df_section.drop(columns=['section.sections.props'])
    # Re-attach the caller's index so the column-wise concat aligns row by row.
    df_section.index = df.index
    return pd.concat([df.drop(columns=['section']), df_section, df_props], axis=1)

def imports_one_hot(df, max_features=80):
    """Replace the ``imports`` column with word-count columns ``imports_<word>``.

    Each ``imports`` cell must be a dict whose values are lists of strings
    (presumably library name -> imported function names; verify against the
    data schema). The dict keys themselves are not used. All strings of a row
    are joined, stripped of non-alphanumeric characters and split before
    capital letters, then vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``imports`` column.
    max_features : int, default 80
        Passed to ``CountVectorizer`` to cap the number of word columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``imports`` plus one count column per retained word.
        Values are occurrence counts, not 0/1 flags.
    """
    imports = []
    # Iterate over the values directly: indexing df['imports'][i] with
    # range(n) only works when the index labels happen to be 0..n-1.
    for libraries in df['imports']:
        sentence = ' '.join(' '.join(functions) for functions in libraries.values())
        sentence = re.sub(r"[^a-zA-Z0-9 ]+", '', sentence)  # remove special characters
        # Insert a blank before a capital letter that follows a word character.
        imports.append(re.sub(r"(\w)([A-Z])", r"\1 \2", sentence))

    vectorizer = CountVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(imports)  # counts of the most frequent words

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on old releases that lack get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    columns = ['imports_' + feature for feature in feature_names]
    # int8 would silently wrap for words occurring more than 127 times in one
    # row, so a wider integer type is used. Sharing df's index keeps the
    # column-wise concat aligned row by row.
    df_imports = pd.DataFrame(data=X.toarray(), columns=columns, index=df.index).astype('int32')
    return pd.concat([df.drop(columns=['imports']), df_imports], axis=1)

def exports_one_hot(df, max_features=40):
    """Replace the ``exports`` column with word-count columns ``exports_<word>``.

    Each ``exports`` cell must be an iterable of strings. The strings of a row
    are joined, stripped of non-alphanumeric characters and split before
    capital letters, then vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``exports`` column.
    max_features : int, default 40
        Passed to ``CountVectorizer`` to cap the number of word columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``exports`` plus one count column per retained word.
        Values are occurrence counts, not 0/1 flags.
    """
    exports = []
    # Iterate over the values directly: indexing df['exports'][i] with
    # range(n) only works when the index labels happen to be 0..n-1.
    for names in df['exports']:
        sentence = ' '.join(names)
        sentence = re.sub(r"[^a-zA-Z0-9 ]+", '', sentence)  # remove special characters
        # Insert a blank before a capital letter that follows a word character.
        exports.append(re.sub(r"(\w)([A-Z])", r"\1 \2", sentence))

    vectorizer = CountVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(exports)  # counts of the most frequent words

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on old releases that lack get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    columns = ['exports_' + feature for feature in feature_names]
    # int8 would silently wrap for words occurring more than 127 times in one
    # row, so a wider integer type is used. Sharing df's index keeps the
    # column-wise concat aligned row by row.
    df_exports = pd.DataFrame(data=X.toarray(), columns=columns, index=df.index).astype('int32')
    return pd.concat([df.drop(columns=['exports']), df_exports], axis=1)

def explode_cols_list(df, colname):
    """Replace a column of numeric sequences with per-row summary statistics.

    For every row the sequence in ``df[colname]`` is reduced to six scalars,
    stored in new columns ``<colname>.mean``, ``.std``, ``.median``, ``.max``,
    ``.min`` and ``.var`` (appended in that order); ``colname`` itself is
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colname``. It is not modified.
    colname : str
        Name of a column whose cells are sequences of numbers.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``colname`` and with the six statistic columns.
        For an empty sequence ``.max`` and ``.min`` are 0, while the other
        statistics are NaN (NumPy's result for empty input).
    """
    values = df[colname]

    def _or_zero(reducer):
        # np.max / np.min raise on empty input, so empty sequences map to 0.
        # Passing ``initial=0`` to the reducer instead would clamp every
        # result: np.min(x, initial=0) can never exceed 0 and
        # np.max(x, initial=0) can never go below 0.
        return lambda seq: reducer(seq) if len(seq) else 0

    stats = {
        colname + ".mean": values.apply(np.mean),
        colname + ".std": values.apply(np.std),
        colname + ".median": values.apply(np.median),
        colname + ".max": values.apply(_or_zero(np.max)),
        colname + ".min": values.apply(_or_zero(np.min)),
        colname + ".var": values.apply(np.var),
    }
    # drop() returns a new frame, so the caller's frame is left untouched.
    return df.drop(columns=[colname]).assign(**stats)

def explode_header(df):
    """Replace the nested ``header`` column of ``df`` with flat feature columns.

    The ``header`` dicts are flattened into ``header.*`` columns. The
    single-valued columns ``header.coff.machine``,
    ``header.optional.subsystem`` and ``header.optional.magic`` are one-hot
    encoded; the multi-valued columns ``header.coff.characteristics`` and
    ``header.optional.dll_characteristics`` are expanded into one indicator
    column per distinct label, named ``<column>.<label>``.

    Returns ``df`` without ``header``, concatenated column-wise with the
    derived header columns.
    """
    header = pd.json_normalize(df.header).add_prefix('header.')

    categorical_cols = ['header.coff.machine','header.optional.subsystem','header.optional.magic']
    header = pd.get_dummies(header, prefix=categorical_cols, prefix_sep='.', columns=categorical_cols)

    binarizer = MultiLabelBinarizer()
    for multi_col in ('header.coff.characteristics', 'header.optional.dll_characteristics'):
        # pop() removes the list-valued column; fit_transform re-fits the
        # binarizer, so classes_ always describes the column just encoded.
        indicators = binarizer.fit_transform(header.pop(multi_col))
        labels = [multi_col + '.' + classe for classe in binarizer.classes_]
        header = header.join(pd.DataFrame(indicators, columns=labels, index=header.index))

    remainder = df.drop(columns=['header'])
    return pd.concat([remainder, header], axis=1)


In [None]:
# Feature-engineering pipeline: every step replaces one nested or list-valued
# column with flat columns. The cell rebinds `df`, so it is not idempotent:
# re-run the loading cell before running it again.
df = df.drop(columns=['sha256', 'md5', 'appeared'])
df = explode_section(df)
df = explode_cols_list(df, 'histogram')
df = explode_cols_list(df, 'byteentropy')
df = explode_header(df)
# Flatten the `strings` and `general` dicts and drop the raw dict columns, as
# the other steps do for their source columns; otherwise the raw dicts would
# be written to the CSV next to their flattened copies.
df = pd.concat(
    [
        df.drop(columns=['strings', 'general']),
        pd.json_normalize(df['strings']).add_prefix("strings."),
        pd.json_normalize(df['general']).add_prefix("general."),
    ],
    axis=1,
)
df = explode_cols_list(df, 'strings.printabledist')
df = imports_one_hot(df)
df = exports_one_hot(df)

In [None]:
# Write df to final_df.csv (path relative to the working directory);
# index=False leaves the row index out of the file.
df.to_csv("final_df.csv",index=False)

In [None]:
# Bare last expression: the notebook renders `df` as this cell's output.
df