In [189]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
np.random.seed(42)
# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# models
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# ensembles 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest

##### Only for Colab

In [None]:
# Mount Google Drive at /gdrive so the dataset stored there can be copied locally.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly&response_type=code

Enter your authorization code:
elite555


ValueError: ignored

In [None]:
# Copy the dataset from the mounted Drive into the current directory, so it can
# be read through the relative path 'Unified-class.csv'.
!cp /gdrive/MyDrive/data/Unified-class.csv .

### Data Exploration

In [None]:
# Read the raw CSV and normalise every column name to lowercase
df = pd.read_csv('Unified-class.csv').rename(columns=str.lower)
df.head()

In [None]:
# Summary of the frame: column dtypes and non-null counts
df.info()

**The features have no missing values**

In [None]:
# Report how many distinct values each 'object'-typed column holds
for name, dtype in df.dtypes.items():
    if dtype != 'O':
        continue
    print(name, ": ", df[name].nunique())

**We will drop all the 'object' features except the `type` feature as they have a huge number of unique values**

In [None]:
# Drop 'id' plus the columns at positions 2..6 (the high-cardinality object features)
# NOTE(review): the slice is positional, so it relies on the CSV column order — presumably
# `type` sits at position 1 and is therefore kept; confirm against the file.
dropped_columns = ['id', *df.columns[2:7]]
df.drop(columns=dropped_columns, inplace=True)

In [None]:
# Largest and smallest distinct value of the `bug` target, as a (max, min) pair
bug_values = df['bug'].unique()
bug_values.max(), bug_values.min()

**The max target value is 62 and the min target value is 0**

#### One-hot-encoding the `type` feature 

In [None]:
# Distinct values present in the `type` column
df['type'].unique()

In [None]:
# Replace the categorical `type` column by one indicator column per category;
# the indicator columns are appended at the right-hand side of the frame
type_dummies = pd.get_dummies(df['type'])
df = pd.concat([df, type_dummies], axis=1).drop(columns='type')
df.head()

In [None]:
# Separate the predictors (every column but `bug`) from the `bug` target
columns = df.columns.tolist()
columns.remove('bug')
x = df[columns]
y = df['bug']

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

In [None]:
def rmse_score(y_pred, y):
    """Return the root-mean-squared error between `y_pred` and `y`.

    The value is the square root of `mean_squared_error(y_pred, y)`.
    """
    squared_error = mean_squared_error(y_pred, y)
    return np.sqrt(squared_error)

def rmse_scorer(estimator, x, y):
    """Predict on `x` with `estimator` and return the RMSE of those predictions against `y`.

    NOTE(review): the returned value is the plain RMSE (lower is better). If this is
    ever passed as `scoring=` to a search that maximises its score, it would presumably
    need to be negated — confirm before using it that way.
    """
    return rmse_score(estimator.predict(x), y)

In [None]:
def fit_model(model, x_train, y_train, x_valid, y_valid):
  """Fit `model` on the training split, print its scores and return it.

  Printed metrics: RMSE on the training split, then RMSE, MSE and R2 on the
  validation split. The fitted `model` itself (not a copy) is returned.
  """
  model.fit(x_train, y_train)

  # validation-side metrics are all derived from one prediction pass
  preds = model.predict(x_valid)
  rmse_train = rmse_score(y_train, model.predict(x_train))
  mse, rmse, r2 = (
    mean_squared_error(y_valid, preds),
    rmse_score(y_valid, preds),
    r2_score(y_valid, preds),
  )

  print(f"Train RMSE: {rmse_train:.3f} | RMSE: {rmse:.3f}\n\
          MSE       : {mse:.3f} | \tR2  : {r2:.3f}")
  return model

In [None]:
# Baseline: standardise the features, then fit a random forest on them
baseline_steps = [
  ('scaler', StandardScaler()),
  ('rf', RandomForestRegressor(random_state=42)),
]
pipeline = fit_model(Pipeline(baseline_steps), x_train, y_train, x_valid, y_valid)

### Outlier Detection

In [None]:
# Work on an independent copy so the outlier removal leaves `df` untouched
no_outliers_df = df.copy()
x = no_outliers_df[columns]
y = no_outliers_df['bug']

In [None]:
# Flag anomalous rows with an IsolationForest; rows labelled -1 are collected as outliers
outlierDetector = IsolationForest(n_estimators=100, random_state=42)
result = outlierDetector.fit_predict(x)
is_outlier = result == -1
outliers = no_outliers_df.loc[is_outlier]

In [None]:
# Remove the flagged rows and report how many were discarded
no_outliers_df = no_outliers_df.drop(index=outliers.index)
print(f'Number of removed outliers {len(outliers.index)}')

In [None]:
# Rebuild predictors/target from the cleaned frame and redo the 80/20 split
x = no_outliers_df[columns]
y = no_outliers_df['bug']
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

In [None]:
# Refit the same pipeline object on the outlier-free split; fit_model prints the
# train/validation scores and hands the fitted pipeline back.
pipeline = fit_model(pipeline, x_train, y_train, x_valid, y_valid)

In [None]:
# Correlation of every column with `bug`, sorted from highest to lowest. The first
# entry is skipped (NOTE(review): presumably `bug` itself with correlation 1 — confirm).
bug_corr = no_outliers_df.corr()['bug'].sort_values(ascending=False)
corr = pd.DataFrame(bug_corr.iloc[1:])

labels = corr.index
corr_values = corr['bug'].tolist()

# one bar per remaining column
fig, ax = plt.subplots(figsize=(25, 10))
ax.bar(labels, corr_values)
ax.set_xlabel('Features')
ax.set_ylabel('Correlation with class')
ax.set_title('The Correlation between the Features and Target Column Class')
plt.show()

In [None]:
# helper functions for RandomForests
def rf_feat_importance(m, df, target='bug'):
    """Return the feature importances of a fitted forest as a sorted DataFrame.

    Parameters
    ----------
    m : fitted estimator exposing `feature_importances_`.
    df : DataFrame whose columns are the model's features, in training order,
        optionally together with the target column.
    target : name of the target column to exclude from the feature names.

    Returns
    -------
    DataFrame with columns `cols` (feature name) and `imp` (importance),
    sorted by `imp` in descending order.
    """
    if target in df.columns:
        # Remove the target by name: it is not necessarily the last column
        # (the one-hot columns are appended after it), so slicing off the last
        # column would pair importances with the wrong names.
        feature_names = df.columns.drop(target)
    else:
        # Legacy behaviour for frames without the named target: the last column
        # is taken to be the target.
        feature_names = df.columns[:-1]
    return pd.DataFrame({'cols': feature_names, 'imp': m.feature_importances_}
                       ).sort_values('imp', ascending=False)

def plot_fi(fi, figsize=(12, 7)):
    """Draw a horizontal bar chart of `imp` per `cols` entry and return the plot object."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=figsize, legend=False)

In [None]:
# Importance of each feature according to the random-forest step of the fitted pipeline
fi = rf_feat_importance(pipeline['rf'], no_outliers_df)
fi

In [None]:
# Horizontal bar chart of the importances computed into `fi`
plot_fi(fi, figsize=(15, 10))

### PCA

In [None]:
# Print the number of input features the pipeline receives before PCA.
# x_train holds only the predictor columns, so the `bug` target is not counted.
print(f'Number of features before PCA: {x_train.shape[1]}')

In [None]:
# Insert a PCA stage between scaling and the forest; n_components=.95 keeps as many
# whitened components as are needed to explain 95% of the variance
pca_steps = [
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=.95, whiten=True, random_state=42)),
  ('rf', RandomForestRegressor(random_state=42)),
]
pipeline = fit_model(Pipeline(pca_steps), x_train, y_train, x_valid, y_valid)

In [None]:
# Number of principal components the fitted PCA step kept
pipeline['pca'].n_components_

In [None]:
# Repeat the PCA experiment with a higher retained-variance threshold (97%)
pipeline = fit_model(
  Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=.97, whiten=True)),
    ('rf', RandomForestRegressor(random_state=42)),
  ]),
  x_train, y_train, x_valid, y_valid,
)

In [None]:
# Number of principal components the fitted PCA step kept at the 97% threshold
pipeline['pca'].n_components_

### Training individual models

In [187]:
# RF: randomized hyper-parameter search over a scale -> PCA -> random-forest pipeline.
# The search wraps the pipeline (instead of being its last step) so that
#   * the `rf__`-prefixed keys below address the parameters of the 'rf' step,
#   * the scaler and PCA are refit on the training part of every CV fold,
#   * `rf_pipeline` directly exposes best_params_, score() and predict().
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]

# 1.0 = consider every feature at each split; this is what 'auto' meant for
# regressors before that alias was removed from scikit-learn
max_features = [1.0, 'sqrt']

max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
max_depth.append(None)  # None = grow the trees without a depth limit

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

param_grid = {'rf__n_estimators': n_estimators,
               'rf__max_features': max_features,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf,
               'rf__bootstrap': bootstrap}

rf_steps = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=.95, whiten=True, random_state=42)),
  ('rf', RandomForestRegressor(random_state=42))
])

rf_pipeline = RandomizedSearchCV(estimator = rf_steps, param_distributions = param_grid,
                                 n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_pipeline.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 23.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=0.95,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=True)),
                                             ('rf',
                                              RandomForestRegressor(bootstrap=True,
                    

In [193]:
# Hyper-parameter combination that scored best during the randomized search.
# NOTE(review): `best_params_` lives on the search object, so `rf_pipeline` must be the
# RandomizedSearchCV itself (as in the recorded fit output), not a Pipeline containing it.
rf_pipeline.best_params_

{'rf__bootstrap': False,
 'rf__max_depth': 9,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 4,
 'rf__min_samples_split': 5,
 'rf__n_estimators': 50}

In [194]:
# Score of the tuned model on the validation split
# (presumably R^2, since no `scoring` was passed to the search — confirm)
rf_pipeline.score(x_valid, y_valid)

0.12049582709151774

In [195]:
# RMSE of the tuned model's predictions on the validation split
preds = rf_pipeline.predict(x_valid)
rmse_score(y_valid, preds)

0.8413149301315216

In [None]:
# SVR: randomized hyper-parameter search over a scale -> PCA -> SVR pipeline.
# The result is stored under its own name (`svr_pipeline`) so it does not
# overwrite the random-forest search held in `rf_pipeline`. The search wraps the
# whole pipeline so the scaler and PCA are refit on the training part of every
# CV fold; the `svr__` prefix routes each key to the 'svr' step.
param_grid = {'svr__C': np.arange(1, 11),
              'svr__gamma': [1,0.1,0.01,0.001],
              'svr__kernel': ['poly', 'linear', 'sigmoid']}

svr_steps = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=.95, whiten=True)),
  ('svr', SVR())
])

svr_pipeline = RandomizedSearchCV(estimator = svr_steps, param_distributions = param_grid,
                                  n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

svr_pipeline.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
# KNN: randomized hyper-parameter search over a scale -> PCA -> KNN pipeline.
# The search wraps the whole pipeline so that `knn_pipeline.best_params_` exists
# and the scaler/PCA are refit on the training part of every CV fold; the
# `knn__` prefix routes each key to the 'knn' step.
param_grid = {'knn__n_neighbors':[4,5,6,7],
              'knn__leaf_size':[1,3,5],
              'knn__algorithm':['auto', 'kd_tree']}

# KNeighborsRegressor has no `random_state` parameter, so none is passed
knn_steps = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=.95, whiten=True)),
  ('knn', KNeighborsRegressor())
])

# the grid only holds 4 * 3 * 2 = 24 combinations, so never ask for more draws than exist
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

knn_pipeline = RandomizedSearchCV(estimator = knn_steps, param_distributions = param_grid,
                                  n_iter = min(50, n_candidates), cv = 3, verbose=2,
                                  random_state=42, n_jobs = -1)

knn_pipeline.fit(x_train, y_train)
knn_pipeline.best_params_