In [1]:
import nltk
import pandas as pd
import sys

sys.path.append('../../')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from src.modules.pipeline.balancing import Balancing
from src.modules.pipeline.cross_validation import CrossValidation
from src.modules.pipeline.finetunning import Finetunning
from src.modules.preprocess.preprocess import Preprocess
from src.modules.util.constant import Features, Model, ModelName as mn
from src.modules.util.helper_metrics import MetricsHelper as mh
from src.modules.util.util import Util as util
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download the 'punkt' NLTK resource; nltk.word_tokenize is called later in this notebook.
PUNKT_RESOURCE = 'punkt'
nltk.download(PUNKT_RESOURCE)

[nltk_data] Downloading package punkt to /home/lorena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading Data

#### Models

In [3]:
# Short aliases for the entries exposed by the project's Model constants.
NB, LG, DT, RF, GB, CV = (
    Model.NB,
    Model.LG,
    Model.DT,
    Model.RF,
    Model.GB,
    Model.CV,
)

In [4]:
# BALANCE_PATH = '../data/balanced/balanced_data.csv'

# Load the '§'-separated export and discard the columns listed in
# Features.train_test_features in a single expression.
data = pd.read_csv(
    '../data/bg_results.csv',
    sep='§',
    header=0,
    quoting=3,  # 3 == csv.QUOTE_NONE
    engine='python',
).drop(columns=Features.train_test_features)

## Preprocess

In [5]:
# Continue with an independent (deep) copy of the loaded frame.
data = data.copy()

In [6]:
# Fill missing text with empty strings so the tokenizer always receives a str.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# a column selection is a chained in-place update, which pandas deprecates and
# which leaves `data` unchanged when Copy-on-Write is active.
for text_column in ('summary', 'description'):
    data[text_column] = data[text_column].fillna('')
    # Column-wise apply: only the text column is needed to count its tokens.
    data[f'total_words_{text_column}'] = data[text_column].apply(
        lambda text: len(nltk.word_tokenize(text))
    )

# Drop the columns listed in Features.features, then every row that still has a null.
data = data.drop(columns=Features.features).dropna()

## Balancing 

##### To balance the data, the following cells must be executed.

In [7]:
# data = Balancing.oversample(data)

##### Saving balanced data into a csv file

In [8]:
# data.to_csv(BALANCE_PATH, index=False)

## Dummification

In [9]:
# data = pd.get_dummies(data)

## Vectorization

### Data split

In [10]:
# Fixed seed so that Restart & Run All reproduces the same 80/20 split, and
# therefore comparable metrics across runs.
SPLIT_SEED = 42
TRAIN, TEST = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
# x_train, y_train, x_test, y_test, classes = mh().get_classification_artifacts(TRAIN, TEST, 'resolution')

In [11]:
# Separate the 'resolution' label column from the feature columns of each split.
y_train, y_test = TRAIN['resolution'], TEST['resolution']

train = TRAIN.drop(columns='resolution')
test = TEST.drop(columns='resolution')

# Distinct label values observed in the training split.
classes = y_train.unique()

In [12]:
# Text (object-dtype) columns are detected on the training frame and reused for
# the test frame so both splits go through exactly the same schema.
text_columns = train.select_dtypes(include=['object']).columns


def _to_documents(frame):
    """Join the text columns of each row into one whitespace-separated document.

    TfidfVectorizer expects an iterable with one string per sample. Iterating a
    DataFrame yields its column *names*, so passing the frame directly would fit
    the vectorizer on the column labels instead of on the rows.
    """
    return frame[text_columns].astype(str).agg(' '.join, axis=1)


vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(_to_documents(train))
test_tfidf = vectorizer.transform(_to_documents(test))  # reuse the vocabulary fitted on train

# Prefix the token columns so they can never collide with a numeric column name.
tfidf_columns = [f'tfidf_{token}' for token in vectorizer.get_feature_names_out()]

# Keep the original row index: pd.concat(axis=1) aligns on the index, and the
# train/test frames still carry the shuffled index produced by the split.
x_train_vectorized = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_columns, index=train.index)
x_test_vectorized = pd.DataFrame(test_tfidf.toarray(), columns=tfidf_columns, index=test.index)

train_numeric = train.drop(columns=text_columns)
test_numeric = test.drop(columns=text_columns)

x_train = pd.concat([train_numeric, x_train_vectorized], axis=1).fillna(0)
x_test = pd.concat([test_numeric, x_test_vectorized], axis=1).fillna(0)

# Features and labels must describe the same rows in the same order; no manual
# trimming of the feature matrix is needed once the indexes line up.
assert x_train.index.equals(y_train.index), 'x_train is not aligned with y_train'
assert x_test.index.equals(y_test.index), 'x_test is not aligned with y_test'
assert list(x_train.columns) == list(x_test.columns), 'train/test feature columns differ'

x_train.head()

## Normalization

In [19]:
# scaler = MinMaxScaler()
# x_train_normalized = scaler.fit_transform(x_train)
# x_test_normalized = scaler.transform(x_test)

## Machine Learning Models

#### Gaussian Naive Bayes

In [20]:
# [x_train[c].apply(lambda x: print(x, c) if (str(x).isalpha() and str(x) not in ['nan', 'True', 'False']) else None) for c in x_train.columns]

In [21]:
# NB_train = NB.fit(x_train, y_train)
# Metrics and timing for Gaussian Naive Bayes. `classes` is passed explicitly so
# this call matches the LG/DT/RF/GB get_metrics calls in the following sections.
NB_metrics, NB_time = util().get_metrics(NB, mn.NB, x_train, y_train, x_test, y_test, classes)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

#### Logistic Regression

In [None]:
# Metrics and timing for Logistic Regression; both values are handed to
# save_result in the "Saving results" section.
LG_metrics, LG_time = util().get_metrics(
    LG, mn.LG,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Decision Tree

In [None]:
# Metrics and timing for Decision Tree; both values are handed to
# save_result in the "Saving results" section.
DT_metrics, DT_time = util().get_metrics(
    DT, mn.DT,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Random Forest

In [None]:
# Metrics and timing for Random Forest; both values are handed to
# save_result in the "Saving results" section.
RF_metrics, RF_time = util().get_metrics(
    RF, mn.RF,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Gradient Boosting

In [None]:
# Metrics and timing for Gradient Boosting; both values are handed to
# save_result in the "Saving results" section.
GB_metrics, GB_time = util().get_metrics(
    GB, mn.GB,
    x_train, y_train,
    x_test, y_test,
    classes,
)

## Cross Validation

#### Gaussian Naive Bayes

In [None]:
# Cross-validation result for Gaussian Naive Bayes, computed on the training split only.
nb_cross_validator = CrossValidation()
NB_cv = nb_cross_validator.get_cross_validation_result(NB, x_train, y_train)

#### Logistic Regression

In [None]:
# Cross-validation result for Logistic Regression, computed on the training split only.
lg_cross_validator = CrossValidation()
LG_cv = lg_cross_validator.get_cross_validation_result(LG, x_train, y_train)

#### Decision Tree

In [None]:
# Cross-validation result for Decision Tree, computed on the training split only.
dt_cross_validator = CrossValidation()
DT_cv = dt_cross_validator.get_cross_validation_result(DT, x_train, y_train)

#### Random Forest

In [None]:
# Cross-validation result for Random Forest, computed on the training split only.
rf_cross_validator = CrossValidation()
RF_cv = rf_cross_validator.get_cross_validation_result(RF, x_train, y_train)

#### Gradient Boosting

In [None]:
# Cross-validation result for Gradient Boosting, computed on the training split only.
gb_cross_validator = CrossValidation()
GB_cv = gb_cross_validator.get_cross_validation_result(GB, x_train, y_train)

## Fine-tuning

#### Gaussian Naive Bayes

In [None]:
# Fine-tuning call for Gaussian Naive Bayes; the two .pkl paths are passed as the
# last two positional arguments.
# NOTE(review): these paths start with './data' while the input CSV is read from
# '../data' — confirm the intended output directory.
NB_TUNED_METRICS_PATH = './data/models/naive_bayers/NB_tuned_metrics.pkl'
NB_PRED_PATH = './data/models/naive_bayers/NB_pred.pkl'
Finetunning().model_finetuning(
    NB, mn.NB,
    x_train, y_train,
    x_test, y_test,
    classes,
    NB_TUNED_METRICS_PATH,
    NB_PRED_PATH,
)

#### Logistic Regression

In [None]:
# Fine-tuning call for Logistic Regression; the two .pkl paths are passed as the
# last two positional arguments.
# NOTE(review): these paths start with './data' while the input CSV is read from
# '../data' — confirm the intended output directory.
LG_TUNED_METRICS_PATH = './data/models/logistic_regression/LG_tuned_metrics.pkl'
LG_PRED_PATH = './data/models/logistic_regression/LG_pred.pkl'
Finetunning().model_finetuning(
    LG, mn.LG,
    x_train, y_train,
    x_test, y_test,
    classes,
    LG_TUNED_METRICS_PATH,
    LG_PRED_PATH,
)

#### Decision Tree

In [None]:
# Fine-tuning call for Decision Tree; the two .pkl paths are passed as the
# last two positional arguments.
# NOTE(review): these paths start with './data' while the input CSV is read from
# '../data' — confirm the intended output directory.
DT_TUNED_METRICS_PATH = './data/models/decision_tree/DT_tuned_metrics.pkl'
DT_PRED_PATH = './data/models/decision_tree/DT_pred.pkl'
Finetunning().model_finetuning(
    DT, mn.DT,
    x_train, y_train,
    x_test, y_test,
    classes,
    DT_TUNED_METRICS_PATH,
    DT_PRED_PATH,
)

#### Random Forest

In [None]:
# Fine-tuning call for Random Forest; the two .pkl paths are passed as the
# last two positional arguments.
# NOTE(review): these paths start with './data' while the input CSV is read from
# '../data' — confirm the intended output directory.
RF_TUNED_METRICS_PATH = './data/models/random_forest/RF_tuned_metrics.pkl'
RF_PRED_PATH = './data/models/random_forest/RF_pred.pkl'
Finetunning().model_finetuning(
    RF, mn.RF,
    x_train, y_train,
    x_test, y_test,
    classes,
    RF_TUNED_METRICS_PATH,
    RF_PRED_PATH,
)

#### Gradient Boosting

In [None]:
# Fine-tuning call for Gradient Boosting; the two .pkl paths are passed as the
# last two positional arguments.
# NOTE(review): these paths start with './data' while the input CSV is read from
# '../data' — confirm the intended output directory.
GB_TUNED_METRICS_PATH = './data/models/gradient_boosting/GB_tuned_metrics.pkl'
GB_PRED_PATH = './data/models/gradient_boosting/GB_pred.pkl'
Finetunning().model_finetuning(
    GB, mn.GB,
    x_train, y_train,
    x_test, y_test,
    classes,
    GB_TUNED_METRICS_PATH,
    GB_PRED_PATH,
)

## Saving results

In [None]:
def _save(metrics, elapsed, model_name):
    """Forward one model's metrics and timing to a fresh util().save_result call."""
    return util().save_result(metrics, elapsed, model_name)


# One call per model, in the same order as the model sections above.
_save(NB_metrics, NB_time, mn.NB)
_save(LG_metrics, LG_time, mn.LG)
_save(DT_metrics, DT_time, mn.DT)
_save(RF_metrics, RF_time, mn.RF)
_save(GB_metrics, GB_time, mn.GB)