## Imports

In [1]:
import json
import pickle

import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

## Configurations

In [2]:
# Show scikit-learn estimators as diagrams when they are the cell output.
sklearn.set_config(display = 'diagram')

# Load the project settings. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of depending on the locale
# default used by open().
with open("configs.json", "r", encoding = "utf-8") as config_file:
    configs = json.load(config_file)

# Seed handed to the random forest and to the randomized search below.
RANDOM_STATE = configs["RANDOM_STATE"]

# List the available top-level settings.
configs.keys()

dict_keys(['RANDOM_STATE', 'DATASET_BASE_PATH', 'OUTPUTS_BASE_PATH', 'MODELS_BASE_PATH', 'PLOTS_BASE_PATH', 'PLOTS_DPI', 'SYMBOL_COL_PREFIX', 'VAL_SIZE', 'MODELS'])

## Data Loading

In [3]:
# Read the processed file-content table from the configured outputs directory.
processed_csv_path = f"{configs['OUTPUTS_BASE_PATH']}processed_filecontent.csv"
filecontent_df = pd.read_csv(processed_csv_path)

filecontent_df.head()

Unnamed: 0,sample_path,content,extension,content_len,Ratio space,Ratio NL,Ratio tab,Ratio _,Ratio #,Ratio :,...,Ratio |,Ratio &&,Ratio ||,Ratio ^,Ratio (,Ratio ),Ratio {,Ratio },Ratio [,Ratio ]
0,Unity/Assets/eDriven/Demo/_shared/Materials/Ma...,fileFormatVersion: 2\nguid: 83fb76c2876417a4cb...,meta,981,0.17737,0.04791,0.0,0.0,0.0,0.054027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003058,0.003058,0.002039,0.002039
1,Assets/RPGSystems/Scripts/Stats/RPGStatCollect...,fileFormatVersion: 2\nguid: 172ba00023166314c8...,meta,262,0.099237,0.045802,0.0,0.0,0.0,0.049618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003817,0.003817,0.003817,0.003817
2,Assets/singleplayer/scripts/SingleGameUnit.cs....,fileFormatVersion: 2\nguid: 5343291fbebbcdb4fa...,meta,263,0.098859,0.045627,0.0,0.0,0.0,0.04943,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003802,0.003802,0.003802,0.003802
3,Assets/Textures/Pictures/cars/quad/Resources/q...,fileFormatVersion: 2\nguid: 9057805e7fcf641ccb...,meta,910,0.176923,0.049451,0.0,0.0,0.0,0.051648,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001099,0.001099,0.002198,0.002198
4,Assets/Art/Characters/99 Super Sanic/SuperSani...,fileFormatVersion: 2\nguid: fa975b7184d12a446b...,meta,1121,0.171276,0.047279,0.0,0.0,0.0,0.052632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002676,0.002676,0.001784,0.001784


In [4]:
# Remove the two columns named below, then report the (rows, columns) shape.
filecontent_df = filecontent_df.drop(columns = ["sample_path", "content"])
filecontent_df.shape

(388009, 59)

In [5]:
# Total number of missing cells across the whole frame (per-column sums, then summed).
filecontent_df.isna().sum().sum()

0

In [6]:
# Number of rows per extension, smallest class first.
extension_groups = filecontent_df.groupby("extension")["extension"]
extension_groups.count().sort_values()

extension
cmd         720
lisp        940
ipynb       960
tcl         966
xhtml       981
           ... 
patch      4924
cc         4975
geojson    4992
po         4994
csproj     5000
Name: extension, Length: 147, dtype: int64

In [7]:
# Distinct extension labels in order of first appearance; reused later as the
# label order of the confusion matrix.
extensions = pd.unique(filecontent_df["extension"])
len(extensions)

147

In [8]:
# Per-column dtype / non-null summary; "deep" requests a full memory-usage measurement.
filecontent_df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388009 entries, 0 to 388008
Data columns (total 59 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   extension    388009 non-null  object 
 1   content_len  388009 non-null  int64  
 2   Ratio space  388009 non-null  float64
 3   Ratio NL     388009 non-null  float64
 4   Ratio tab    388009 non-null  float64
 5   Ratio _      388009 non-null  float64
 6   Ratio #      388009 non-null  float64
 7   Ratio :      388009 non-null  float64
 8   Ratio ;      388009 non-null  float64
 9   Ratio .      388009 non-null  float64
 10  Ratio ,      388009 non-null  float64
 11  Ratio "      388009 non-null  float64
 12  Ratio '      388009 non-null  float64
 13  Ratio ?      388009 non-null  float64
 14  Ratio @      388009 non-null  float64
 15  Ratio $      388009 non-null  float64
 16  Ratio +      388009 non-null  float64
 17  Ratio -      388009 non-null  float64
 18  Ratio *      388009 non-

## Baseline model using Random Forest

### Data splitting

In [9]:
# Separate the feature columns from the target label (the file extension).
X = filecontent_df.drop("extension", axis = 1)
y = filecontent_df["extension"]

# Stratified hold-out split sized by the configured VAL_SIZE. The split is
# seeded with RANDOM_STATE so that re-running the notebook yields the same
# partition; without a seed every run evaluates on a different test set and
# the reported metrics are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = configs["VAL_SIZE"],
    stratify = y,
    random_state = RANDOM_STATE,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((349208, 58), (38801, 58), (349208,), (38801,))

### Reduce memory footprint

In [10]:
# Drop the names of the full frame and the unsplit X / y; the cells that
# follow in this notebook work from the train/test splits.
del filecontent_df, X, y

### Model pipeline

In [11]:
# Settings for this baseline live under MODELS -> BASELINE_RF in the config.
baseline_cfg = configs["MODELS"]["BASELINE_RF"]

# Stratified cross-validation splitter with the configured number of folds.
skf = StratifiedKFold(baseline_cfg["FOLDS"])

# Two-step pipeline: standardise the features, then fit a random forest.
pipeline_steps = [
    ("std_scaler", StandardScaler()),
    ("rf", RandomForestClassifier(random_state = RANDOM_STATE)),
]
rf_pipeline = Pipeline(pipeline_steps)

# Candidate values for the search; the "rf__" prefix routes each parameter to
# the pipeline step named "rf".
# NOTE(review): a binary tree limited to max_depth <= 8 has at most 2**8 = 256
# leaves, so these max_leaf_nodes values look like they can never bind —
# confirm whether this dimension of the search is intended.
rf_params = dict(
    rf__n_estimators = [75, 100, 125, 150],
    rf__class_weight = ["balanced", "balanced_subsample"],
    rf__max_depth = [6, 8],
    rf__max_samples = [0.5, 0.75],
    rf__max_leaf_nodes = [1500, 2000, 2500],
)

# Randomized search over the candidates above, using all available cores.
# error_score = 'raise' makes a failing fit raise instead of being scored.
rf_rscv = RandomizedSearchCV(
    estimator = rf_pipeline,
    param_distributions = rf_params,
    n_iter = baseline_cfg["GRID_ITER"],
    cv = skf,
    n_jobs = -1,
    random_state = RANDOM_STATE,
    error_score = "raise",
)

rf_rscv

### Model training

In [12]:
%%time

# Run the randomized hyper-parameter search on the training split; the cell
# magic above reports how long the whole search takes.
rf_rscv.fit(X_train, y_train)

Wall time: 23min 16s


In [13]:
# Hyper-parameter combination selected by the randomized search.
rf_rscv.best_params_

{'rf__n_estimators': 125,
 'rf__max_samples': 0.5,
 'rf__max_leaf_nodes': 2000,
 'rf__max_depth': 8,
 'rf__class_weight': 'balanced_subsample'}

In [14]:
# Tabulate every sampled candidate and show the best-scoring ones first.
cv_results_df = pd.DataFrame(rf_rscv.cv_results_)
cv_results_df.sort_values("mean_test_score", ascending = False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__n_estimators,param_rf__max_samples,param_rf__max_leaf_nodes,param_rf__max_depth,param_rf__class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,161.19241,1.268583,30.563743,1.497818,125,0.5,2000,8,balanced_subsample,"{'rf__n_estimators': 125, 'rf__max_samples': 0...",0.555325,0.557401,0.54944,0.559958,0.55529,0.555483,0.003472,1
12,232.68679,2.631907,23.091195,3.241333,150,0.75,2500,8,balanced_subsample,"{'rf__n_estimators': 150, 'rf__max_samples': 0...",0.552762,0.560995,0.549927,0.553142,0.553858,0.554137,0.00368,2
8,144.726002,2.586071,23.885195,2.045248,125,0.5,1500,8,balanced,"{'rf__n_estimators': 125, 'rf__max_samples': 0...",0.55375,0.549426,0.546362,0.553701,0.552369,0.551121,0.002851,3
3,134.962484,3.258131,15.032237,1.644618,100,0.5,1500,8,balanced_subsample,"{'rf__n_estimators': 100, 'rf__max_samples': 0...",0.55123,0.551058,0.541007,0.554588,0.551424,0.549861,0.004615,4
4,157.231746,3.905789,17.296108,0.897935,100,0.75,2000,8,balanced,"{'rf__n_estimators': 100, 'rf__max_samples': 0...",0.551674,0.549054,0.543398,0.544923,0.539182,0.545646,0.004366,5


### Metrics

#### Accuracy

In [15]:
# Score of the fitted search object on the held-out split, shown to 3 decimals.
test_score = rf_rscv.score(X_test, y_test)
test_score.round(3)

0.557

#### Confusion Matrix

In [16]:
# Predict the held-out split once, then count (true, predicted) pairs using
# `extensions` as the label order for both axes.
y_test_pred = rf_rscv.predict(X_test)
conf_counts = confusion_matrix(y_test, y_test_pred, labels = extensions)

# Rows are the true extensions, columns are the predicted extensions.
conf_matrix = pd.DataFrame(conf_counts, index = extensions, columns = extensions)
conf_matrix

Unnamed: 0,meta,expected,gradle,pem,pgm,jsx,geojson,cmd,out,xpm,...,rb,pm,js,json,cs,java,cpp,html,c,h
meta,178,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
expected,3,140,0,0,0,0,0,0,0,0,...,0,1,0,2,1,0,0,0,0,0
gradle,9,0,142,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
pem,11,0,0,280,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pgm,0,0,0,6,303,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
java,0,0,0,0,0,1,0,0,0,2,...,0,0,1,0,26,374,2,0,25,0
cpp,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,4,45,0,82,0
html,0,0,0,0,0,1,0,0,0,0,...,0,1,0,2,1,0,0,200,0,0
c,0,0,0,1,0,0,0,0,0,8,...,0,0,0,0,0,5,0,0,353,1


#### Top class predictions per extension

In [17]:
TOP_COUNT = 5

# Empty result table: one ("extension", "count") column pair per class and
# TOP_COUNT rows, filled in by the loop below.
pair_columns = pd.MultiIndex.from_product([extensions, ['extension', 'count']])
top_predictions_per_extn = pd.DataFrame(index = range(TOP_COUNT), columns = pair_columns)

# Each confusion-matrix column (one predicted label) is ranked by count; the
# TOP_COUNT largest row labels (true labels) and their counts are recorded.
for col, col_counts in conf_matrix.items():
    top_pred_classes = col_counts.sort_values(ascending = False).head(TOP_COUNT)
    top_predictions_per_extn.loc[:, [(col, "extension")]] = top_pred_classes.index
    top_predictions_per_extn.loc[:, [(col, "count")]] = top_pred_classes.values

top_predictions_per_extn

Unnamed: 0_level_0,meta,meta,expected,expected,gradle,gradle,pem,pem,pgm,pgm,...,java,java,cpp,cpp,html,html,c,c,h,h
Unnamed: 0_level_1,extension,count,extension,count,extension,count,extension,count,extension,count,...,extension,count,extension,count,extension,count,extension,count,extension,count
0,meta,178,expected,140,gradle,142,pem,280,pgm,303,...,java,374,cpp,45,html,200,c,353,h,3
1,yaml,37,json,6,groovy,24,dat,22,properties,20,...,as,92,cxx,33,htm,49,h,84,php,1
2,yml,33,js,4,lua,21,txt,19,ppm,8,...,ts,54,cc,22,xml,6,cpp,82,hh,1
3,rb,15,lua,4,f90,18,yml,14,ini,6,...,js,46,C,7,hbs,4,cc,47,c,1
4,pem,11,txt,3,py,16,out,12,dat,6,...,cs,42,hpp,7,jsp,4,cxx,44,md,0


### Model Saving

In [18]:
# Serialise the best pipeline found by the search (not the search object
# itself) under the configured models directory and model name.
model_path = f"{configs['MODELS_BASE_PATH']}{configs['MODELS']['BASELINE_RF']['MODEL_NAME']}.pkl"
with open(model_path, "wb") as rf_pkl_file:
    pickle.dump(rf_rscv.best_estimator_, rf_pkl_file)