In [1]:
import nltk
import pandas as pd
import sys

sys.path.append('../../')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from src.modules.pipeline.balancing import Balancing
from src.modules.pipeline.cross_validation import CrossValidation
from src.modules.pipeline.finetunning import Finetunning
from src.modules.preprocess.preprocess import Preprocess
from src.modules.util.constant import Features, Model, ModelName as mn
from src.modules.util.helper_metrics import MetricsHelper as mh
from src.modules.util.util import Util as util


In [2]:
# nltk.download('punkt')

## Loading Data

#### Models

In [3]:
NB = Model.NB
LG = Model.LG
DT = Model.DT
RF = Model.RF
GB = Model.GB
CV = Model.CV

In [4]:
BALANCE_PATH = '../data/balanced/balanced_data.csv'

data = pd.read_csv('../data/bg_results.csv', engine='python', quoting=3, header=0, sep='§')
data.drop(columns=Features.train_test_features, inplace=True, axis=1)

## Preprocess

In [5]:
data = pd.DataFrame.copy(data)

In [6]:
# filing the null values whit empty string

data['summary'].fillna('', inplace=True)
data['total_words_summary'] = data.apply(lambda row: len(list(nltk.word_tokenize(row['summary']))), axis=1)

data['description'].fillna('', inplace=True)
data['total_words_description'] = data.apply(lambda row: len(list(nltk.word_tokenize(row['description']))), axis=1)

data.drop(columns=Features.features, inplace=True, axis=1)
data = data.dropna()

## Balancing 

##### For balancind the data the follwoing chunks must be executed.

In [7]:
# data = Balancing.oversample(data)

##### Saving balanced data into a csv file

In [8]:
# data.to_csv(BALANCE_PATH, index=False)

## Dummification

In [9]:
# data = pd.get_dummies(data)

### Data split

In [10]:
TRAIN, TEST = train_test_split(data, test_size=0.2)
# x_train, y_train, x_test, y_test, classes = mh().get_classification_artifacts(TRAIN, TEST, 'resolution')

In [11]:
train = TRAIN.drop('resolution', axis = 1)
y_train = TRAIN['resolution']

test = TEST.drop('resolution', axis=1)
y_test = TEST['resolution']

classes = TRAIN['resolution'].unique() 

## Vectorization

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

text_columns = train.select_dtypes(include=['object']).columns

vectorizer = TfidfVectorizer()
x_train_vectorized = pd.DataFrame()

for col in text_columns:
    column_data = train[col].tolist()
    column_vectorized = vectorizer.fit_transform(column_data)
    
    column_df = pd.DataFrame(column_vectorized.toarray(), columns=vectorizer.get_feature_names_out())
    x_train_vectorized = pd.concat([x_train_vectorized, column_df], axis=1)

x_train_vectorized

Unnamed: 0,blocker,critical,major,minor,normal,s1,s2,s3,s4,trivial,...,opensolaris,os,other,server,solaris,unspecified,ux,vista,windows,xp
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.486014,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54697,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
54698,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0
54699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
54700,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0


In [17]:

    
train.drop(text_columns, axis=1)
# x_train_f = pd.concat([x_train_vectorized, train], axis = 1)
train

Unnamed: 0,bg_number,total_users_commenting,total_comments_by_author,has_attachment,total_attachment_comments,changes_severity,changes_priority,changes_assigned,total_changes,total_users_changes,...,platform,priority,comment_count,version,product,component,op_sys,votes,total_words_summary,total_words_description
61960,724971,2,2,True,1,0,0,0,7,2,...,All,--,5,other,Participation Infrastructure,Phonebook,All,0,20,62
39987,1134858,8,3,False,0,0,0,0,6,4,...,x86,--,21,unspecified,Toolkit,Safe Browsing,Windows NT,0,9,59
45104,793709,7,1,False,16,0,1,0,22,9,...,x86_64,P3,43,other,Release Engineering,General,Linux,0,12,21
33514,1691478,5,2,False,2,1,1,1,19,4,...,Desktop,P1,10,unspecified,Firefox,Top Sites,All,0,6,178
5008,1423564,6,2,True,4,0,1,1,14,7,...,All,P1,17,Trunk,Firefox,Preferences,All,0,21,208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49573,781855,5,5,False,7,0,0,1,23,10,...,x86,--,19,Trunk,Core,JavaScript Engine,Linux,0,23,75
46217,1463354,7,6,False,0,0,2,1,32,11,...,Unspecified,P2,16,60 Branch,Core,DOM: Events,Unspecified,0,9,110
50160,1693821,5,0,False,1,1,1,1,22,6,...,Desktop,P3,9,unspecified,Firefox,Search,All,0,5,157
36671,1183526,1,4,False,0,0,1,0,6,3,...,Unspecified,P3,6,Trunk,Core,JavaScript Engine: JIT,Unspecified,0,27,280


In [15]:
x_train_f

Unnamed: 0,blocker,critical,major,minor,normal,s1,s2,s3,s4,trivial,...,platform,priority,comment_count,version,product,component,op_sys,votes,total_words_summary,total_words_description
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Unspecified,P1,3.0,unspecified,Core,DOM: Core & HTML,Unspecified,0.0,4.0,54.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,All,P3,33.0,unspecified,DevTools,General,All,0.0,10.0,85.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,x86,P3,18.0,unspecified,www.mozilla.org,L10N,macOS,0.0,9.0,16.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68090,,,,,,,,,,,...,Unspecified,P2,10.0,unspecified,Core,DOM: Networking,Unspecified,0.0,29.0,240.0
68058,,,,,,,,,,,...,x86,--,12.0,Trunk,Core,JavaScript Engine,Linux,0.0,9.0,82.0
63796,,,,,,,,,,,...,Unspecified,P2,7.0,55 Branch,Core,WebRTC: Signaling,Unspecified,0.0,4.0,487.0
67260,,,,,,,,,,,...,All,P1,21.0,unspecified,Firefox,Sync,All,0.0,12.0,53.0


In [None]:
# x_train = vetorizador.fit_transform(TRAIN.drop('resolution', axis = 1))
# y_train = TRAIN['resolution']

# x_test = vetorizadortransform(TEST.drop('resolution', axis=1))
# y_test = TEST['resolution']

# classes = TRAIN['resolution'].unique() 

## Nomalization

In [None]:
# scaler = MinMaxScaler()
# x_train_normalized = scaler.fit_transform(x_train)
# x_test_normalized = scaler.transform(x_test)

## Machine Learning Models

#### Gaussian Naive Bayers

In [None]:
# NB_train = NB.fit(x_train.toarray(), y_train)
NB_metrics, NB_time = util().get_metrics(NB, mn.NB, x_train, y_train, x_test, y_test)

#### Logistic Regression

In [None]:
LG_metrics, LG_time = util().get_metrics(LG, mn.LG, x_train, y_train, x_test, y_test, classes)

#### Decision Tree

In [None]:
DT_metrics, DT_time = util().get_metrics(DT, mn.DT, x_train, y_train, x_test, y_test, classes)

#### Random Forest

In [None]:
RF_metrics, RF_time = util().get_metrics(RF, mn.RF, x_train, y_train, x_test, y_test, classes)

#### Gradient Boosting

In [None]:
GB_metrics, GB_time = util().get_metrics(GB, mn.GB, x_train, y_train, x_test, y_test, classes)

## Cross Validation

#### Gaussian Naive Bayers

In [None]:
NB_cv = CrossValidation().get_cross_validation_result(NB, x_train, y_train)

#### Logistic Regression

In [None]:
LG_cv = CrossValidation().get_cross_validation_result(LG, x_train, y_train)

#### Decision Tree

In [None]:
DT_cv = CrossValidation().get_cross_validation_result(DT, x_train, y_train)

#### Random Forest

In [None]:
RF_cv = CrossValidation().get_cross_validation_result(RF, x_train, y_train)

#### Gradient Boosting

In [None]:
GB_cv = CrossValidation().get_cross_validation_result(GB, x_train, y_train)

## Finetunning

#### Gaussian Naive Bayers

In [None]:
Finetunning().model_finetuning(NB, mn.NB, x_train, y_train, x_test, y_test, classes, './data/models/naive_bayers/NB_tuned_metrics.pkl', './data/models/naive_bayers/NB_pred.pkl')

#### Logistic Regression

In [None]:
Finetunning().model_finetuning(LG, mn.LG, x_train, y_train, x_test, y_test, classes, './data/models/logistic_regression/LG_tuned_metrics.pkl', './data/models/logistic_regression/LG_pred.pkl')

#### Decision Tree

In [None]:
Finetunning().model_finetuning(DT, mn.DT, x_train, y_train, x_test, y_test, classes, './data/models/decision_tree/DT_tuned_metrics.pkl', './data/models/decision_tree/DT_pred.pkl')

#### Random Forest

In [None]:
Finetunning().model_finetuning(RF, mn.RF, x_train, y_train, x_test, y_test, classes, './data/models/random_forest/RF_tuned_metrics.pkl', './data/models/random_forest/RF_pred.pkl')

#### Gradient Boosting

In [None]:
Finetunning().model_finetuning(GB, mn.GB, x_train, y_train, x_test, y_test, classes, './data/models/gradient_boosting/GB_tuned_metrics.pkl', './data/models/gradient_boosting/GB_pred.pkl')

## Saving results

In [None]:
util().save_result(NB_metrics, NB_time, mn.NB)
util().save_result(LG_metrics, LG_time, mn.LG)
util().save_result(DT_metrics, DT_time, mn.DT)
util().save_result(RF_metrics, RF_time, mn.RF)
util().save_result(GB_metrics, GB_time, mn.GB)