In [None]:
import ast
from datetime import datetime

import numpy as np
import pandas as pd
import pytz
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the dataset. Per the original author's note, the CSV has six columns:
# md5, TTP, timestamp, group, Aliases, Country.
dataset_path = 'data_5002.csv'
df = pd.read_csv(dataset_path)
df

In [None]:
def convert_to_timezone(row):
    """Return the time-of-day part of a row's UTC timestamp as 'HH:MM:SS'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'timestamp' entry formatted as
        '%Y-%m-%d %H:%M:%S UTC'.

    Returns
    -------
    str
        The 'HH:MM:SS' portion of the timestamp, or the sentinel string
        'Invalid Timestamp' when the value is missing, is not a string, or
        does not match the expected format.
    """
    time_str = row['timestamp']

    try:
        utc_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S UTC')
    except (TypeError, ValueError):
        # ValueError: the string does not match the format.
        # TypeError: the value is not a string at all (e.g. NaN/None from an
        # empty CSV cell); without this the whole DataFrame.apply would abort.
        return 'Invalid Timestamp'

    # No timezone object is attached: the format string already pins the
    # value to UTC and only the wall-clock fields are emitted.
    return utc_time.strftime('%H:%M:%S')

# Row-wise pass: every row is handed to convert_to_timezone and the returned
# string (time of day or the invalid-timestamp sentinel) is stored in 'utc_time'.
df['utc_time'] = df.apply(convert_to_timezone, axis='columns')

## Feature

In [None]:
# Sentinel strings marking rows whose time could not be derived.
# NOTE(review): only 'Invalid Timestamp' is produced by the code visible in
# this notebook; the second marker is kept for backward compatibility.
unparsed_markers = ('Invalid Timestamp', 'Country Unrecognized or Error')

# Extract the hour from 'utc_time' ('HH:MM:SS'); sentinel rows get -1
df['hour'] = df['utc_time'].apply(lambda x: -1 if x in unparsed_markers else int(x.split(':')[0]))
# Filter out any rows where time conversion was unsuccessful. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice (SettingWithCopyWarning). The original row
# index is intentionally left untouched.
df = df[df['hour'] != -1].copy()
df['minute'] = df['utc_time'].apply(lambda x: int(x.split(':')[1]))
df['second'] = df['utc_time'].apply(lambda x: int(x.split(':')[2]))

### Sin Cos

In [None]:
# Map each clock component onto the unit circle so that values at the two ends
# of its range (e.g. hour 23 and hour 0) end up next to each other.
for unit, period in (('hour', 24), ('minute', 60), ('second', 60)):
    angle = 2 * np.pi * df[unit] / period
    df[unit + '_sin'] = np.sin(angle)
    df[unit + '_cos'] = np.cos(angle)

In [None]:
# Independent snapshot of the time-enriched frame; the encoding steps below
# read from this copy.
filtered_df = df.copy(deep=True)

### OneHot

In [None]:
# Parse every 'TTP' cell (text that presumably holds a Python list literal of
# technique IDs) into a real Python object.
# ast.literal_eval replaces the former eval(): it accepts literals only, so a
# crafted CSV cell cannot execute arbitrary code while the data is loaded.
ttp = [ast.literal_eval(raw_ttp) for raw_ttp in filtered_df["TTP"]]

# Target labels
labels = filtered_df["group"]

In [None]:
# Create a set of unique categories from all the parsed TTP lists
unique_categories = set(category for categories in ttp for category in categories)

# Build one binary indicator column per category.
# - Membership is tested against the parsed lists in `ttp` (same row order as
#   filtered_df), not against the raw cell text: a substring test on the text
#   would also flag 'T1055' for a row that only lists 'T1055.001'.
# - Categories are sorted so the column order is reproducible between kernel
#   sessions (iteration order of a set of strings is not).
# - All columns are created in one go instead of being inserted one by one.
one_hot_columns = pd.DataFrame(
    {
        category: [1 if category in categories else 0 for categories in ttp]
        for category in sorted(unique_categories)
    },
    index=filtered_df.index,
)

# Replace the original 'TTP' column with the indicator columns
df_one_hot = pd.concat([filtered_df.drop(columns='TTP'), one_hot_columns], axis=1)

In [None]:
# Everything that is not a per-technique indicator: identifiers, the target,
# and the raw/derived time columns.
non_ttp_columns = [
    "md5", "group", "Aliases", "Country",
    'timestamp', 'utc_time', 'hour', 'minute', 'second',
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'second_sin', 'second_cos',
]
df_one_hot_feature = df_one_hot.drop(non_ttp_columns, axis=1)
labels = df_one_hot["group"]
len(df_one_hot_feature), len(labels), df_one_hot_feature, labels

(5002,
 5002,
       T1564  T1082  T1033  T1083  T1007  T1027  T1529  T1518  T1055  T1222  \
 0         1      1      0      1      0      1      0      0      0      0   
 1         0      0      0      0      0      1      0      0      0      0   
 2         0      1      0      1      0      1      1      0      0      1   
 3         0      1      0      1      0      1      0      1      0      1   
 4         0      0      0      0      0      0      0      0      0      0   
 ...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
 5004      0      0      0      1      0      0      0      1      0      0   
 5005      0      0      0      0      0      1      0      0      0      0   
 5006      0      0      0      1      1      0      0      0      0      0   
 5007      0      0      0      1      1      0      0      0      0      0   
 5008      0      0      0      0      0      1      0      0      0      0   
 
       ...  T1112  T1546  T1125  T10

### TFIDF

In [None]:
# Fit a TF-IDF model on the raw 'TTP' cell text. The cells are already
# strings, so they are passed straight to the vectorizer; the former
# ''.join(x) re-assembled each string character by character and returned it
# unchanged.
# NOTE(review): with the default tokenizer an ID containing '.' would
# presumably be split into two tokens - confirm that this is intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(filtered_df['TTP'])

# Convert the sparse result to a dense DataFrame, one column per vocabulary term
df_tfidf_vector = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Target labels come from the same frame the TF-IDF matrix was fitted on
labels = filtered_df["group"]
# Row counts of the feature matrix and the labels, shown side by side
df_tfidf_vector.shape[0], labels.shape[0]

(5002, 5002)

## Merged

In [None]:
# One single-column DataFrame per cyclical time feature (the double brackets
# keep each selection a DataFrame rather than a Series).
df_hr_s, df_hr_c, df_mn_s, df_mn_c, df_sc_s, df_sc_c = (
    df[[column]]
    for column in (
        'hour_sin', 'hour_cos',
        'minute_sin', 'minute_cos',
        'second_sin', 'second_cos',
    )
)

In [None]:
# Rows were filtered earlier without renumbering, so give every piece a fresh
# 0..n-1 index before the pieces are placed side by side.
(
    df_one_hot_feature,
    df_tfidf_vector,
    df_hr_s,
    df_hr_c,
    df_mn_s,
    df_mn_c,
    df_sc_s,
    df_sc_c,
) = (
    frame.reset_index(drop=True)
    for frame in (
        df_one_hot_feature,
        df_tfidf_vector,
        df_hr_s,
        df_hr_c,
        df_mn_s,
        df_mn_c,
        df_sc_s,
        df_sc_c,
    )
)

In [None]:
# Place the one-hot block, the TF-IDF block and the six cyclical time columns
# side by side, in that order.
feature_blocks = [
    df_one_hot_feature,
    df_tfidf_vector,
    df_hr_s, df_hr_c,
    df_mn_s, df_mn_c,
    df_sc_s, df_sc_c,
]
merged_feature = pd.concat(feature_blocks, axis='columns')

In [None]:
# Persist the feature matrix and the labels as CSV files without the index column
for table, csv_path in ((merged_feature, "Merged_Feature.csv"), (labels, "Labels.csv")):
    table.to_csv(csv_path, index=False)