# 1 - Preparing Notebook for Analysis

Run this section to be able to run separate sections later

In [3]:
# Required Libraries
import pandas as pd
import numpy as np


# Libraries for Metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix

# Libraries for Modeling
from sklearn import linear_model


# Libraries for graphics
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Mounting Google Colab Drive
from google.colab import drive
drive.mount('/content/drive')
working_directory = '/content/drive/MyDrive/Colab Notebooks/98.COB/DS_v0/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Global Functions for this Notebook

# Importing the dataset
def importsets():
    train = pd.read_csv(working_directory + 'Train_26_8.csv')
    test = pd.read_csv(working_directory + 'Results_26_8.csv')
    results = pd.read_csv(working_directory + 'Results_26_8.csv')
    return train, test, results

'''
Use this to split between test and set if needed
X = train_df.iloc[:, :-1].values
y = train_df.iloc[:, -1].values
'''
# Calling function to set global variables for information goals
train_df, test_df, results_df = importsets()


# Function to show columns of imported Datasets
def cols(): 
    print('Train Columns> ' , train_df.columns)
    print('Test Columns> ' , test_df.columns)
    print('Result Columns> ' , results_df.columns)

# Function to measure performance of models using precision, recall and F1
def performance(test, pred):
    print("Precision Score: \t {0:.4f}".format(precision_score(test, 
                                                            pred, 
                                                            average='weighted')))
    print("Recall Score: \t\t {0:.4f}".format(recall_score(test,
                                                        pred, 
                                                        average='weighted')))
    print("F1 Score: \t\t {0:.4f}".format(f1_score(test,
                                                pred, 
                                                average='weighted')))

# Function to split train dataset in y column with target and the rest on X 
def create_x_y(train_df):
    X = train_df.iloc[:, :-1].values
    y = train_df.iloc[:, -1].values
    return X, y

def myfunctions():
    print('importsets() useful for importing datasets to new variables train, test, results')
    print('cols() is a function to know the columns of the original datasets')
    print('performance(y_test, y_test_pred) is a function to know the performance of the model')
    print('create_x_y(train_df) is a function will return y target from train and X for the rest of the columns')




In [6]:
cols()

Train Columns>  Index(['IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type', 'SP'], dtype='object')
Test Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')
Result Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')


# 2 - Prototype with Logistic Regression

In [7]:
# Importing the dataset
train_lg, results_test_lg, results_lg = importsets()

# Spliting Datasets in X and Y for fitting
X_lg = train_lg.iloc[:, :-1].values
y_lg = train_lg.iloc[:, -1].values

In [8]:
train_lg.columns # Checking columns

Index(['IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type', 'SP'], dtype='object')

In [9]:
cols() # Checking columns of all datasets

Train Columns>  Index(['IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type', 'SP'], dtype='object')
Test Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')
Result Columns>  Index(['Work Item Id', 'IP_ML', 'QA', 'ENG', 'TechDebt', 'ID_Area', 'ID_Type',
       'SP'],
      dtype='object')


In [10]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X_lg, y_lg, test_size = 0.2, random_state = 0)

In [11]:
# Fitting Gradient Boosting to the Training set
classifier = GradientBoostingClassifier(max_depth=10, random_state=10)
classifier.fit(X_train, y_train)

NameError: ignored

In [None]:
y_pred_lg = classifier.predict(X_lg)

In [None]:
classifier.predict_proba(X_lg)

array([[1.39955996e-07, 1.75803543e-07, 2.43230768e-07, ...,
        7.39438276e-09, 4.13639054e-11, 9.99998099e-01],
       [3.12954677e-07, 1.83639807e-07, 1.77746115e-07, ...,
        6.09795051e-08, 9.99997959e-01, 3.82224685e-11],
       [9.42798837e-06, 1.17599066e-05, 1.53214038e-05, ...,
        9.99874841e-01, 2.81577537e-09, 2.65986834e-09],
       ...,
       [9.98342127e-01, 1.61482641e-04, 1.95712994e-04, ...,
        7.20419386e-06, 4.04511196e-08, 3.88869182e-08],
       [9.98679367e-01, 1.13700316e-04, 1.71614908e-04, ...,
        5.57619650e-06, 3.12470388e-08, 3.00387494e-08],
       [7.90657055e-04, 9.87710465e-01, 1.57679495e-03, ...,
        4.97023817e-05, 2.78592081e-07, 2.67819225e-07]])

In [None]:
# Visualizing Result
pred_test = pd.DataFrame(data=y_pred_lg)
train_test = pd.DataFrame(data=y_train)

h_stack_test = pd.concat([train_test, pred_test], axis=1)

h_stack_test.head(50)

Unnamed: 0,0,0.1
0,3.0,441
1,8.0,316
2,13.0,130
3,2.0,130
4,6.0,129
5,10.0,6
6,5.0,100
7,4.0,11
8,1.0,3
9,2.0,86


In [None]:
performance(train_test, pred_test[:244])

Precision Score: 	 0.0726
Recall Score: 		 0.0779
F1 Score: 		 0.0724


In [None]:
del results_test_lg['Work Item Id']
del results_test_lg['SP']

In [None]:
results_test_lg

Unnamed: 0,IP_ML,QA,ENG,TechDebt,ID_Area,ID_Type
0,11,0,0,0,1,1
1,11,0,0,0,1,1
2,11,0,0,0,1,1
3,11,0,0,0,1,1
4,11,0,0,0,1,1
...,...,...,...,...,...,...
632,22,1,1,0,2,1
633,26,0,0,0,1,4
634,26,0,0,0,1,5
635,26,1,0,0,1,5


In [None]:
real_pred = classifier.predict(results_test_lg)

In [None]:
final_result = pd.concat([results_lg, pd.DataFrame(real_pred)], axis=1)

In [None]:
final_result

Unnamed: 0,Work Item Id,IP_ML,QA,ENG,TechDebt,ID_Area,ID_Type,SP,0
0,1626698,11,0,0,0,1,1,5,3
1,1626701,11,0,0,0,1,1,5,3
2,1626704,11,0,0,0,1,1,3,3
3,1626706,11,0,0,0,1,1,3,3
4,1626707,11,0,0,0,1,1,3,3
...,...,...,...,...,...,...,...,...,...
632,1627191,22,1,1,0,2,1,8,2
633,1987644,26,0,0,0,1,4,3,19
634,1987648,26,0,0,0,1,5,8,19
635,1987646,26,1,0,0,1,5,8,21


In [None]:
r2_score(y_lg, y_pred_lg)

0.9099698383129072

In [None]:
performance(y_lg, y_pred_lg)

Precision Score: 	 0.8427
Recall Score: 		 0.8366
F1 Score: 		 0.8292


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
final_result = final_result.rename(columns={'Work Item Id': 'WIID', 0: 'SP_Pred'})

In [None]:
len(final_result[final_result['SP_Pred']== 0])

0

In [None]:
len(final_result)

637

**RESULTS**

Results are good with these two examples, but performance of the algorithm is bad. 

Algorithms will be refined with Data Science and AI Silver badges with more deeper experience and knowledge of application. 

# 4 - Prototype with SVM

In [None]:
from sklearn import svm

In [None]:
# Importing libraries
from sklearn.linear_model import LogisticRegression

# Importing the dataset
train_svm, results_svm, results_test_svm = importsets()

X_svm = train_svm.iloc[:, :-1].values
y_svm = train_svm.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X_svm, y_svm,
                                                    train_size=0.8, 
                                                    random_state=0)

**Feature Scaling**

In [None]:
sc_x = StandardScaler()
X_std_train = sc_x.fit_transform(X_train)

## a. Trying with Linear SVM Classification

In [None]:
C = 1.0 #0.01
clf = svm.SVC(kernel='linear', C=C)
clf.fit(X_std_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
res = cross_val_score(clf, X_std_train, y_train, cv=10, scoring='accuracy')
print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

Average Accuracy: 	 0.1798
Accuracy SD: 		 0.0434




In [None]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)



In [None]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)



In [None]:
confusion_matrix(y_train, y_train_pred)

array([[4, 0, 0, ..., 0, 0, 0],
       [2, 0, 6, ..., 0, 0, 0],
       [5, 3, 5, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
performance(y_train, y_train_pred)

Precision Score: 	 0.0798
Recall Score: 		 0.1844
F1 Score: 		 0.1093


  _warn_prf(average, modifier, msg_start, len(result))


**Cross validation with Test Set**

In [None]:
y_test_pred = cross_val_predict(clf, sc_x.transform(X_test), y_test, cv=3)



In [None]:
confusion_matrix(y_test, y_test_pred)

array([[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0],
       [1, 5, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0],
       [0, 1, 0, 3, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 1, 0, 0, 

In [None]:
performance(y_test, y_test_pred)

Precision Score: 	 0.1122
Recall Score: 		 0.1290
F1 Score: 		 0.1184


  _warn_prf(average, modifier, msg_start, len(result))


## b. Trying with polynomial kernel

In [None]:
C = 1.0
clf = svm.SVC(kernel='poly', degree=3, C=C, gamma='auto')
clf.fit(X_std_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
res = cross_val_score(clf, X_std_train, y_train, cv=10, scoring='accuracy')
print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

Average Accuracy: 	 0.2053
Accuracy SD: 		 0.0338




In [None]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)



In [None]:
confusion_matrix(y_train, y_train_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 3, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
performance(y_train, y_train_pred)

Precision Score: 	 0.1170
Recall Score: 		 0.2172
F1 Score: 		 0.1147


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_test_pred = cross_val_predict(clf, sc_x.transform(X_test), y_test, cv=3)



In [None]:
confusion_matrix(y_test, y_test_pred)

array([[ 0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 11,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  4,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  7,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],


### Measuring Performance

In [None]:
performance(y_test, y_test_pred)

Precision Score: 	 0.1734
Recall Score: 		 0.2419
F1 Score: 		 0.1570


  _warn_prf(average, modifier, msg_start, len(result))


**CONCLUSION>**

There is a small improvement in Precision, Recall and F1 score agaist other algorithms but there is no significant improvement and uncertainty is still big. 



## c. Experimenting with Radial Basis Function

In [None]:
train_gaus, test_gaus, result_gaus = importsets()

In [None]:
X_gs, y_gs = create_x_y(train_gaus)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_gs, y_gs,
                                                    train_size=0.8, 
                                                    random_state=0) #0.6

In [None]:
sc_x = StandardScaler()
X_std_train = sc_x.fit_transform(X_train)

In [None]:
C = 1.0
clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)
clf.fit(X_std_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.7, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
res = cross_val_score(clf, X_std_train, y_train, cv=10, scoring='accuracy')
print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

Average Accuracy: 	 0.2455
Accuracy SD: 		 0.0559




In [None]:
y_train_pred = cross_val_predict(clf, X_std_train, y_train, cv=3)



In [None]:
confusion_matrix(y_train, y_train_pred)

array([[2, 0, 1, ..., 0, 0, 0],
       [1, 0, 3, ..., 0, 0, 0],
       [1, 0, 6, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
performance(y_train, y_train_pred)

Precision Score: 	 0.1035
Recall Score: 		 0.2213
F1 Score: 		 0.1303


  _warn_prf(average, modifier, msg_start, len(result))


**Grid Search**

In [None]:
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split, GridSearchCV 

In [None]:
pipeline = Pipeline([('clf', svm.SVC(kernel='rbf', C=1, gamma=0.1))]) 

In [None]:
params = {'clf__C':(0.1, 0.5, 1, 2, 5, 10, 20), 
          'clf__gamma':(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1)} 

In [None]:
svm_grid_rbf = GridSearchCV(pipeline, params, n_jobs=-1,
                            cv=3, verbose=1, scoring='accuracy') 

In [None]:
svm_grid_rbf.fit(X_train, y_train)

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:    2.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        SVC(C=1, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma=0.1, kernel='rbf',
                                            max_iter=-1, probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'clf__C': (0.1, 0.5, 1, 2, 5, 10, 20),
                         'clf__gamma': (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1)},
             pre_dispatch='2*n_jobs', refit=True, retur

In [None]:
svm_grid_rbf.best_score_

0.24179464016862393

In [None]:
best = svm_grid_rbf.best_estimator_.get_params() 

In [None]:
for k in sorted(params.keys()): 
    print('\t{0}: \t {1:.2f}'.format(k, best[k]))

	clf__C: 	 1.00
	clf__gamma: 	 0.50


**validating in testing set**

In [None]:
y_test_pred = svm_grid_rbf.predict(X_test)

In [None]:
confusion_matrix(y_test, y_test_pred)

array([[0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 1, 3, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [1, 0, 2, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 

In [None]:
performance(y_test, y_test_pred)

Precision Score: 	 0.0621
Recall Score: 		 0.1290
F1 Score: 		 0.0718


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
