# 0 - Analyzing Dataset from project using Classification Algorithms

* Author> Pedro Ciancaglini
* EY Badge> Artificial Intelligence Bronze

## Main objective of this analysis:

* Test dataset with different classification algorithms (Logistic Regression, Decision Tree, Random Forest, SVM)
* Measuring performance using Accuracy, Recall, Precision, and F1 Score
* Running test with one selected algorithm (potentially based on accuracy)
* Analyze results and conclude

# 1 - Preparing Notebook for Analysis

Run this section to be able to run separate sections later

In [1]:
# Required Libraries
import pandas as pd
import numpy as np


# Libraries for Metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score

# Libraries for Modeling
# 1 - Prototype with Logistic Regression
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression


# 2 - Prototype with RandomForest
from sklearn.ensemble import RandomForestClassifier

# 3 - SVM
from sklearn import svm
from sklearn.preprocessing import StandardScaler



# Libraries for graphics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Mounting Google Colab Drive
from google.colab import drive
drive.mount('/content/drive')
working_directory = '/content/drive/MyDrive/Colab Notebooks/98.COB/DS_v0/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Global Functions for this Notebook

# Importing the dataset
def importsets():
    train = pd.read_csv(working_directory + 'Train_26_8.csv')
    test = pd.read_csv(working_directory + 'Results_26_8.csv')
    results = pd.read_csv(working_directory + 'Results_26_8.csv')
    return train, test, results

# Calling function to set global variables for information goals
train_df, test_df, results_df = importsets()


# Function to show columns of imported Datasets
def cols(): 
    print('Train Columns> ' , train_df.columns)
    print('Test Columns> ' , test_df.columns)
    print('Result Columns> ' , results_df.columns)

# Function to split train dataset in y column with target and the rest on X 
def create_x_y(train_df):
    X = train_df.iloc[:, :-1].values
    y = train_df.iloc[:, -1].values
    return X, y

# Function to measure performance of models using precision, recall and F1
def performance(test, pred):
    print("Precision Score: \t {0:.4f}".format(precision_score(test, 
                                                            pred, 
                                                            average='weighted')))
    print("Recall Score: \t\t {0:.4f}".format(recall_score(test,
                                                        pred, 
                                                        average='weighted')))
    print("F1 Score: \t\t {0:.4f}".format(f1_score(test,
                                                pred, 
                                                average='weighted')))



def myfunctions(): # Running this function will return name of functions and how functions works
    print('importsets() useful for importing datasets to new variables train, test, results')
    print('cols() is a function to know the columns of the original datasets')
    print('performance(y_test, y_test_pred) is a function to know the performance of the model')
    print('create_x_y(train_df) is a function will return y target from train and X for the rest of the columns')




In [4]:
cols()

Train Columns>  Index(['IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type', 'SP'], dtype='object')
Test Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')
Result Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')


# 2 - Prototype with Logistic Regression

Using logistic regression to train and test the models

## a. Preparing Data

In [5]:
# Importing the dataset
train_lg, results_test_lg, results_lg = importsets()

# Spliting Datasets in X and Y for fitting
X_lg = train_lg.iloc[:, :-1].values
y_lg = train_lg.iloc[:, -1].values

In [6]:
train_lg.columns # Checking columns

Index(['IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type', 'SP'], dtype='object')

In [7]:
cols() # Checking columns of all datasets

Train Columns>  Index(['IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type', 'SP'], dtype='object')
Test Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')
Result Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')


In [8]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X_lg, y_lg, test_size = 0.2, random_state = 0)

## b. Training Algorithm

In [9]:
# Fitting Logistic Regression to the Training set
classifier = LogisticRegression(random_state=10)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=10, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
y_pred_lg = classifier.predict(X_lg)

In [11]:
classifier.predict_proba(X_lg)

array([[0.19521572, 0.12800011, 0.13675781, ..., 0.18845843, 0.09728637,
        0.03127625],
       [0.22507189, 0.09270988, 0.12221087, ..., 0.21326214, 0.08938819,
        0.03036233],
       [0.23985347, 0.07830785, 0.11465954, ..., 0.22515644, 0.08503851,
        0.02969046],
       ...,
       [0.11894626, 0.15145769, 0.09218519, ..., 0.2966819 , 0.13066624,
        0.02207903],
       [0.02595585, 0.13444794, 0.14721271, ..., 0.26361741, 0.15794743,
        0.05337405],
       [0.03323927, 0.1326804 , 0.14879961, ..., 0.26002536, 0.15413338,
        0.05129943]])

## c. Testing Algorithm

In [12]:
# Visualizing Result
pred_test = pd.DataFrame(data=y_pred_lg)
train_test = pd.DataFrame(data=y_train)

h_stack_test = pd.concat([train_test, pred_test], axis=1)

h_stack_test.head(50)

Unnamed: 0,0,0.1
0,3.0,3
1,8.0,3
2,1.0,0
3,8.0,0
4,3.0,0
5,5.0,5
6,8.0,5
7,13.0,5
8,0.0,3
9,0.0,5


## d. Measuring Performance

In [13]:
performance(train_test, pred_test[:len(train_test)])

Precision Score: 	 0.1480
Recall Score: 		 0.2442
F1 Score: 		 0.1779


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
cm = confusion_matrix(train_test, pred_test[:len(train_test)])
print(cm)
accuracy_score(train_test, pred_test[:len(train_test)])

# Multiclass Matrix. 
#Each feature is compared with the possitive or negative prediction

[[ 3  2  0 16 26  0  0]
 [ 3  1  0 15 28  0  0]
 [ 2  2  0 25 27  0  0]
 [ 6  5  0 37 52  0  0]
 [ 4  7  0 38 65  0  0]
 [ 4  6  0  9 30  0  0]
 [ 0  2  0 10  9  0  0]]


0.24423963133640553

# 3 - Prototype with Random Forest

## a. Preparing Data

In [76]:
# importing datasets for Random Forest Analysis
train_rf, test_rf, results_rf = importsets()

In [77]:
# Spliting Datasets in X and Y for fitting
X_rf = train_rf.iloc[:, :-1].values
y_rf = train_rf.iloc[:, -1].values

# Creating Training and Testing Samples from Train Dataset
X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size = 0.2, random_state = 0)

## b. Training algorithm

In [78]:
# Initializing Random Forest Classifier. All hyperparameters from Lecture
classifier_rf = RandomForestClassifier(n_estimators=43, 
                                        criterion='gini',
                                        bootstrap=False,
                                        max_depth=10,
                                        max_features='sqrt',
                                        min_samples_leaf=10,
                                        min_samples_split=3,
                                        random_state = 0,
                                        warm_start=True, 
                                        min_impurity_decrease=0.001)



classifier_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.001, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=43,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=True)

In [79]:
y_pred_rf = classifier_rf.predict(X_test)

## c. Testing the algorithm

In [80]:
# Visualizing Result
pred_test = pd.DataFrame(data=y_pred_rf)
train_test = pd.DataFrame(data=y_train)

h_stack_test = pd.concat([train_test, pred_test], axis=1)

h_stack_test.head(50)

Unnamed: 0,0,0.1
0,3,5.0
1,8,8.0
2,1,2.0
3,8,0.0
4,3,3.0
5,5,5.0
6,8,5.0
7,13,3.0
8,0,1.0
9,0,8.0


## d. Measuring performance

In [81]:
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)
accuracy_score(y_test, y_pred_rf)

[[ 2  0  0  5  4  0  0]
 [ 0  4  2  3  3  0  0]
 [ 0  0  3  3  5  0  0]
 [ 0  1  0 10 11  0  0]
 [ 1  3  1  7 20  0  0]
 [ 0  1  0  5  9  1  2]
 [ 0  0  0  0  2  1  0]]


0.3669724770642202

In [82]:
len(pred_test)

109

In [83]:
performance(train_test[:len(pred_test)], pred_test[:len(train_test)])

Precision Score: 	 0.2522
Recall Score: 		 0.2844
F1 Score: 		 0.2379


In [84]:
print(classifier_rf.score(X_lg, y_lg))

0.427255985267035


# 4 - Prototype with SVM

## a. Preparing Data

In [24]:
# Importing the dataset
train_svm, results_svm, results_test_svm = importsets()

X_svm = train_svm.iloc[:, :-1].values
y_svm = train_svm.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X_svm, y_svm,
                                                    train_size=0.8, 
                                                    random_state=0)

## b. Feature Scaling

In [25]:
sc_x = StandardScaler()
X_std_train = sc_x.fit_transform(X_train)

## c. Training with Linear SVM Classification

In [26]:
C = 1.0 #0.01
clf = svm.SVC(kernel='linear', C=C)
clf.fit(X_std_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
res = cross_val_score(clf, X_std_train, y_train, cv=10, scoring='accuracy')
print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

Average Accuracy: 	 0.2929
Accuracy SD: 		 0.0659


In [28]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)

## d. Testing algorithm

In [29]:
# Visualizing Result
pred_test = pd.DataFrame(data=y_train_pred)
train_test = pd.DataFrame(data=y_train)

h_stack_test = pd.concat([train_test, pred_test], axis=1)

h_stack_test.head(50)

Unnamed: 0,0,0.1
0,3,5
1,8,5
2,1,3
3,8,5
4,3,5
5,5,5
6,8,5
7,13,5
8,0,3
9,0,3


## e. Measuring Performance

In [30]:
print(confusion_matrix(y_train, y_train_pred))
print(accuracy_score(y_train, y_train_pred))
# cm = confusion_matrix(y_test, y_pred_rf)
# print(cm)
print(accuracy_score(y_test[:len(y_train_pred)], y_train_pred[:len(y_test)]))

[[ 0  1  0 27 18  0  1]
 [ 0  7  0 17 23  0  0]
 [ 0  8  0 34 14  0  0]
 [ 0  7  0 54 38  0  1]
 [ 0 10  0 41 61  0  2]
 [ 0  5  0 16 25  0  3]
 [ 0  1  0  0 19  0  1]]
0.2834101382488479
0.28440366972477066


In [31]:
performance(y_train, y_train_pred)

Precision Score: 	 0.1722
Recall Score: 		 0.2834
F1 Score: 		 0.2098


  _warn_prf(average, modifier, msg_start, len(result))


**Cross validation with Test Set**

In [32]:
y_test_pred = cross_val_predict(clf, sc_x.transform(X_test), y_test, cv=3)

In [33]:
print(confusion_matrix(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))

[[ 2  0  0  2  6  1  0]
 [ 0  2  0  1  8  1  0]
 [ 1  1  1  0  8  0  0]
 [ 2  1  0  3 13  3  0]
 [ 2  2  1  4 19  4  0]
 [ 2  0  0  3  9  4  0]
 [ 0  0  0  0  3  0  0]]
0.28440366972477066


In [34]:
performance(y_test, y_test_pred)

Precision Score: 	 0.2915
Recall Score: 		 0.2844
F1 Score: 		 0.2512


  _warn_prf(average, modifier, msg_start, len(result))


## f. Changing from linear to polynomial kernel

In [35]:
C = 1.0
clf = svm.SVC(kernel='poly', degree=3, C=C, gamma='auto')
clf.fit(X_std_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [36]:
res = cross_val_score(clf, X_std_train, y_train, cv=10, scoring='accuracy')
print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

Average Accuracy: 	 0.2811
Accuracy SD: 		 0.0602


In [37]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)

In [38]:
confusion_matrix(y_train, y_train_pred)

array([[ 3,  0,  1, 16, 27,  0,  0],
       [ 0,  3,  2, 14, 28,  0,  0],
       [ 0,  2,  3, 27, 24,  0,  0],
       [ 2,  1,  4, 37, 56,  0,  0],
       [ 0,  5,  0, 44, 62,  2,  1],
       [ 0,  4,  0, 12, 29,  3,  1],
       [ 0,  0,  0,  0, 19,  0,  2]])

In [39]:
performance(y_train, y_train_pred)

Precision Score: 	 0.3406
Recall Score: 		 0.2604
F1 Score: 		 0.2139


In [40]:
y_test_pred = cross_val_predict(clf, sc_x.transform(X_test), y_test, cv=3)

In [41]:
confusion_matrix(y_test, y_test_pred)

array([[ 0,  0,  0,  0, 11,  0,  0],
       [ 0,  0,  0,  0, 12,  0,  0],
       [ 0,  0,  0,  0, 11,  0,  0],
       [ 0,  0,  1,  0, 20,  0,  1],
       [ 0,  1,  0,  1, 28,  1,  1],
       [ 0,  0,  0,  1, 17,  0,  0],
       [ 0,  0,  0,  0,  3,  0,  0]])

In [42]:
performance(y_test, y_test_pred)

Precision Score: 	 0.0806
Recall Score: 		 0.2569
F1 Score: 		 0.1227


  _warn_prf(average, modifier, msg_start, len(result))


**CONCLUSION>**

There is a small improvement in Precision, Recall and F1 score agaist other algorithms but there is no significant improvement and uncertainty is still big. 



## g. Experimenting with Radial Basis Function

In [43]:
train_gaus, test_gaus, result_gaus = importsets()

In [44]:
X_gs, y_gs = create_x_y(train_gaus)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_gs, y_gs,
                                                    train_size=0.8, 
                                                    random_state=0) #0.6

In [46]:
sc_x = StandardScaler()
X_std_train = sc_x.fit_transform(X_train)

In [47]:
C = 1.0
clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)
clf.fit(X_std_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.7, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [48]:
res = cross_val_score(clf, X_std_train, y_train, cv=10, scoring='accuracy')
print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

Average Accuracy: 	 0.2834
Accuracy SD: 		 0.0557


In [49]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)

In [50]:
confusion_matrix(y_train, y_train_pred)

array([[ 3,  2,  2, 14, 26,  0,  0],
       [ 0,  7,  4,  8, 28,  0,  0],
       [ 1,  5, 12, 19, 17,  2,  0],
       [ 3,  2,  7, 33, 52,  1,  2],
       [ 2,  8,  0, 38, 60,  3,  3],
       [ 1,  2,  3, 10, 27,  4,  2],
       [ 1,  1,  0,  0, 14,  1,  4]])

In [51]:
performance(y_train, y_train_pred)

Precision Score: 	 0.3042
Recall Score: 		 0.2834
F1 Score: 		 0.2575


## h. Grid Search

In [52]:
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split, GridSearchCV 

In [53]:
pipeline = Pipeline([('clf', svm.SVC(kernel='rbf', C=1, gamma=0.1))]) 

In [54]:
params = {'clf__C':(0.1, 0.5, 1, 2, 5, 10, 20), 
          'clf__gamma':(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1)} 

In [55]:
svm_grid_rbf = GridSearchCV(pipeline, params, n_jobs=-1,
                            cv=3, verbose=1, scoring='accuracy') 

In [56]:
svm_grid_rbf.fit(X_train, y_train)

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:    2.4s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        SVC(C=1, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma=0.1, kernel='rbf',
                                            max_iter=-1, probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'clf__C': (0.1, 0.5, 1, 2, 5, 10, 20),
                         'clf__gamma': (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1)},
             pre_dispatch='2*n_jobs', refit=True, retur

In [57]:
svm_grid_rbf.best_score_

0.26267560664112394

In [58]:
best = svm_grid_rbf.best_estimator_.get_params() 

In [59]:
for k in sorted(params.keys()): 
    print('\t{0}: \t {1:.2f}'.format(k, best[k]))

	clf__C: 	 0.10
	clf__gamma: 	 0.00


**validating in testing set**

In [60]:
y_test_pred = svm_grid_rbf.predict(X_test)

In [61]:
confusion_matrix(y_test, y_test_pred)

array([[ 0,  0,  0,  0, 11,  0,  0],
       [ 0,  0,  0,  0, 12,  0,  0],
       [ 0,  0,  0,  0, 11,  0,  0],
       [ 0,  0,  0,  0, 22,  0,  0],
       [ 0,  0,  0,  0, 32,  0,  0],
       [ 0,  0,  0,  0, 18,  0,  0],
       [ 0,  0,  0,  0,  3,  0,  0]])

In [62]:
performance(y_test, y_test_pred)

Precision Score: 	 0.0862
Recall Score: 		 0.2936
F1 Score: 		 0.1333


  _warn_prf(average, modifier, msg_start, len(result))


# 5 - Final Results

Training and testing Random Forest with real data, compare results, take conclusions. 

Random Forest was the model with best overall performance, comparing 40% accuracy agains 20% and 25%. Additionally, F1, Precision and Recall are also better than other algorithms. Thus Random Forest is the one selected for the final training and testing purpose. 

As I did in Regression, I will consider all predictions higher than real data as good results. 



## a. Testing the hypothesis

How many predictions are higher than real estimation. 

More predictions higher than estimations, the better. 

In [105]:
# Visualizing Result
pred_test = pd.DataFrame(data=y_pred_rf)
train_test = pd.DataFrame(data=y_train)

h_stack_test = pd.concat([train_test, pred_test], axis=1)
h_stack_test.head(5)

Unnamed: 0,0,0.1
0,3,5.0
1,8,8.0
2,1,2.0
3,8,0.0
4,3,3.0


In [106]:
# comparing results (Real versus prediction)
h_stack_test.columns = ['REAL','PRED'] 
h_stack_test.head(5)

Unnamed: 0,REAL,PRED
0,3,5.0
1,8,8.0
2,1,2.0
3,8,0.0
4,3,3.0


In [104]:
# Counting the number of predictions higher than real estimation

pos_pred = 0
neg_pred = 0
comp = False

for i in range(0, len(pred_test)):

  comp = h_stack_test.iloc[i].REAL < h_stack_test.iloc[i].PRED

  if comp == True:
    #print(comp)
    pos_pred += 1
  else:
    #print(comp)
    neg_pred += 1


print('Predictions are higher than real estimation: ', pos_pred)
print('Predictions are lower than real estimation: ', neg_pred)

Predictions are higher than real estimation:  42
Predictions are lower than real estimation:  67


## b. Making an exportable final result

If results were acceptable, like accuracy higher than 70%, algorithms could be tested against datasets without target variable and return predictions in a new column to be used. 

In [86]:
del results_rf['Work Item Id']
del results_rf['SP']

In [87]:
results_rf

Unnamed: 0,IP_ML,QA,ENG,TechDebt,ID_Area,ID_Type
0,11,0,0,0,1,1
1,11,0,0,0,1,1
2,11,0,0,0,1,1
3,11,0,0,0,1,1
4,11,0,0,0,1,1
...,...,...,...,...,...,...
632,22,1,1,0,2,1
633,26,0,0,0,1,4
634,26,0,0,0,1,5
635,26,1,0,0,1,5


In [89]:
real_pred = classifier_rf.predict(results_rf)

In [90]:
final_result = pd.concat([results_rf, pd.DataFrame(real_pred)], axis=1)

In [91]:
final_result

Unnamed: 0,IP_ML,QA,ENG,TechDebt,ID_Area,ID_Type,0
0,11,0,0,0,1,1,3
1,11,0,0,0,1,1,3
2,11,0,0,0,1,1,3
3,11,0,0,0,1,1,3
4,11,0,0,0,1,1,3
...,...,...,...,...,...,...,...
632,22,1,1,0,2,1,5
633,26,0,0,0,1,4,3
634,26,0,0,0,1,5,3
635,26,1,0,0,1,5,3


In [92]:
print(r2_score(y_lg, y_pred_lg))
print(r2_score(y_train, y_pred_lg[:len(y_train)]))

-0.026205369026836234
-0.25551563287947543


In [93]:
confusion_matrix(y_train, y_pred_lg[:len(y_train)])
#pred_test = pd.DataFrame(data=y_pred_lg)
#train_test = pd.DataFrame(data=y_train)

array([[ 3,  2,  0, 16, 26,  0,  0],
       [ 3,  1,  0, 15, 28,  0,  0],
       [ 2,  2,  0, 25, 27,  0,  0],
       [ 6,  5,  0, 37, 52,  0,  0],
       [ 4,  7,  0, 38, 65,  0,  0],
       [ 4,  6,  0,  9, 30,  0,  0],
       [ 0,  2,  0, 10,  9,  0,  0]])

In [94]:
len(y_lg)

543

In [95]:
print(classifier.score(X_lg, y_lg))

0.427255985267035


In [96]:
performance(y_lg, y_pred_lg)

Precision Score: 	 0.2024
Recall Score: 		 0.2910
F1 Score: 		 0.2243


  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
final_result = final_result.rename(columns={'Work Item Id': 'WIID', 0: 'SP_Pred'})

## c. This is the final result, exportable to a CSV

In [100]:
final_result

Unnamed: 0,IP_ML,QA,ENG,TechDebt,ID_Area,ID_Type,SP_Pred
0,11,0,0,0,1,1,3
1,11,0,0,0,1,1,3
2,11,0,0,0,1,1,3
3,11,0,0,0,1,1,3
4,11,0,0,0,1,1,3
...,...,...,...,...,...,...,...
632,22,1,1,0,2,1,5
633,26,0,0,0,1,4,3
634,26,0,0,0,1,5,3
635,26,1,0,0,1,5,3


In [None]:
# comparing results (Real versus prediction)
h_stack_test = pd.concat([train_test, pred_test], axis=1)
h_stack_test.columns = ['REAL','PRED'] 
h_stack_test.head(20)

In [98]:
len(final_result[final_result['SP_Pred']== 0])

3

In [99]:
len(final_result)

637

**RESULTS**

Results are not good with these two examples. Similar to regression,  performance of the algorithm is bad. 

Algorithms will be refined with Data Science and AI Silver badges with more deeper experience and knowledge of application. 