# IT3212

## Config

In [1]:
lemmatize = False

## Importing libraries

In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import re
import string

# NLTK tools and datasets
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Uncomment if you need to download NLTK data packages
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')

# Text processing
from textblob import TextBlob
import contractions

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, roc_curve, auc)
from sklearn.model_selection import cross_val_predict

# Miscellaneous
from collections import Counter
from urllib.parse import unquote
from scipy import stats
import chardet
import pprint


In [3]:
def split_train_test(filepath):
    df = pd.read_csv(filepath, encoding='utf-8')
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    return train_data, test_data

filepath = 'data/disaster-tweets-utf8.csv'
df_train, df_test = split_train_test(filepath)
df_train.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778253309,False,finalized,5,8/27/15 16:07,Not Relevant,1.0,,screamed,,i dont even remember slsp happening i just rem...,6.29107e+17,232773900.0
1,778251995,False,finalized,5,8/27/15 20:16,Not Relevant,1.0,,mudslide,Edinburgh,@hazelannmac ooh now I feel guilty about wishi...,6.29018e+17,27502200.0
2,778247239,False,finalized,5,8/30/15 0:15,Not Relevant,1.0,,collide,planeta H2o,Soultech - Collide (Club Mix) http://t.co/8xIx...,6.29092e+17,605238700.0
3,778255430,False,finalized,5,8/27/15 17:03,Relevant,0.7978,,wounded,,Police Officer Wounded Suspect Dead After Exch...,6.29119e+17,2305930000.0
4,778255609,False,finalized,5,8/27/15 22:11,Not Relevant,1.0,,wrecked,Sunny Southern California,Cramer: Iger's 3 words that wrecked Disney's s...,6.2908e+17,24642660.0


## 1. Preprocessing

In [4]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def filter_rows_by_confidence_and_decision(df, confidence_threshold):
    df = df[df['choose_one:confidence'] >= confidence_threshold]
    df = df[df['choose_one'] != "Can't Decide"]
    return df

def map_choose_one_to_y(df):
    df['target'] = df['choose_one'].apply(lambda choice: 1 if choice == 'Relevant' else 0)
    return df

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub('\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words("english")])
    if lemmatize:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text = contractions.fix(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

def clean_keyword(keyword):
    return unquote(keyword) if pd.notnull(keyword) else ''

def clean_data(df):
    df['keyword'] = df['keyword'].apply(clean_keyword).apply(str.lower)
    df['text_raw'] = df['text']
    df['text'] = df['text'].apply(clean_text)
    return df

initial_count = df_train.shape[0]
confidence_threshold = 0.7

df_train = filter_rows_by_confidence_and_decision(df_train, confidence_threshold)
print("Removed {} of total: {} rows. Remaining rows: {}".format(initial_count - df_train.shape[0], initial_count, df_train.shape[0]))

features_to_keep = ['target', 'text', 'keyword']

df_train = map_choose_one_to_y(df_train)
df_train = df_train[features_to_keep]
df_train = clean_data(df_train)

count_initial = df_train.shape[0]
df_train = df_train.drop_duplicates(subset=['text'])
print("Removed {} duplicated rows.".format(count_initial - df_train.shape[0]))


# Preprocess the test data as well
df_test = map_choose_one_to_y(df_test)
df_test = df_test[features_to_keep]
df_test = clean_data(df_test)

df_test.head()


Removed 2167 of total: 8700 rows. Remaining rows: 6533
Removed 635 duplicated rows.


Unnamed: 0,target,text,keyword,text_raw
0,1,sunset looked like erupting volcano initial th...,volcano,The sunset looked like an erupting volcano ......
1,1,7294 nikon d50 61 mp digital slr camera body 2...,body bag,#7294 Nikon D50 6.1 MP Digital SLR Camera Body...
2,0,mentaltwitter note make sure smoke alarm batte...,smoke,Mental/Twitter Note: Make sure my smoke alarm ...
3,0,emergency need part 2 3 nashnewvideo nashgrier...,emergency,?????? EMERGENCY ?????? NEED PART 2 and 3!!! #...
4,0,whelen model 295ss100 siren amplifier police e...,siren,WHELEN MODEL 295SS-100 SIREN AMPLIFIER POLICE ...


## 2. Extracting features

In [5]:
def extract_features(df): 
    # Create new column for text length
    df['text_length'] = df['text_raw'].apply(len)
    # Extract the number of hashtags
    df["hashtag_count"] = df["text_raw"].apply(lambda x: len([c for c in str(x) if c == "#"]))

    # Extract the number of mentions
    df["mention_count"] = df["text_raw"].apply(lambda x: len([c for c in str(x) if c == "@"]))

    # Extract the `has_url` feature
    df["has_url"] = df["text_raw"].apply(lambda x: 1 if "http" in str(x) else 0)
    return df

# Write the updated dataframe to a CSV file
df_train = extract_features(df_train)
df_test = extract_features(df_test)

### Embedding `text` column using `Word2Vec`

In [6]:
VECTOR_SIZE = 25

tokenized_text = df_train['text'].apply(lambda x: x.split())

import gensim.downloader as api
model_w2v = api.load("glove-twitter-25")

In [11]:
VECTOR_SIZE = 25
VECTOR_SIZE 

25

In [12]:
oow_tokens = []

def embed_text_feature(df, col, model, vector_size):
    def tokens_to_vectors(text_tokens) -> np.ndarray:
        oow = []
        vectors = np.zeros((len(text_tokens), vector_size))

        # embed each token (word-ish) in the text. If the token is not in the model's vocabulary, embed it as a zero vector.
        for i, token in enumerate(text_tokens):
            try:
                vectors[i] = model[token]
            except KeyError:  # Token not in the model's vocabulary
                oow.append(token)
                vectors[i] = np.zeros(vector_size)

        # if all tokens were zero vectors, i.e. all words not in the model's vocabulary, return a zero vector
        if np.all(vectors == 0):
            return np.zeros(vector_size)
        
        oow_tokens.append(oow)
        return vectors.mean(axis=0)

    embeddings = []
    for tokens in df[col].apply(lambda x: x.split()):
        embeddings.append(tokens_to_vectors(tokens))

    return pd.DataFrame(np.vstack(embeddings), columns=[f'{col}_w2v_{i}' for i in range(vector_size)])

df_train_text_embedded_w2v = embed_text_feature(df_train, 'text', model_w2v, VECTOR_SIZE)
df_test_text_embedded_w2v = embed_text_feature(df_test, 'text', model_w2v, VECTOR_SIZE)

print(f"Out of vocabulary tokens")
# filter out empty lists
pprint.pprint([tokens for tokens in oow_tokens if tokens])

df_train_text_embedded_w2v.shape

Out of vocabulary tokens
[['hazelannmac', 'hatman'],
 ['soultech'],
 ['3'],
 ['safyuan'],
 ['thesmallclark'],
 ['ashniggas'],
 ['wbre', 'wyou'],
 ['clockworkheart'],
 ['6', '2015', '2082676773', 'idfire'],
 ['11yearold', '11yearold'],
 ['ppact', 'hillaryclinton', 'destructiondefundpp'],
 ['scalpium'],
 ['janenelson', '097', 'stephenscifi'],
 ['rosenthalauthor'],
 ['artectura', '2015', 'n36'],
 ['ravioliwith'],
 ['imsushickoflove', 'alekalicante'],
 ['6'],
 ['errrr', 'notgoingoutinthat', 'hellonwheelsamc', 'howfans', 'talkinghell'],
 ['crossborder', 'timesofindia'],
 ['1980', 'rorington', '95'],
 ['sensorsenso', 'beckarnley', '9395', '90', '28lv6'],
 ['360wisenews'],
 ['du19'],
 ['70', '69', '1945', '200000'],
 ['2in1', '20'],
 ['mh370', 'r̩union'],
 ['33', 'craykain'],
 ['bosvsnyy'],
 ['stefsy', '14'],
 ['bytorrecilla', '600000'],
 ['501', 'maddddd'],
 ['alllivesmatter'],
 ['5'],
 ['kinggerudo', 'moblins'],
 ['itblank', 'expressioncheeks', 'facemarvins'],
 ['flavafraz', '21', 'whatcant

(5898, 25)

In [None]:
# from sklearn.decomposition import TruncatedSVD

# # Step 1: Concatenate Embeddings
# all_embeddings = np.vstack((df_train_text_embedded_w2v.values, df_test_text_embedded_w2v.values))

# # Step 2: Apply LSA
# # Choose the number of components for LSA
# n_components = 500  # Example, adjust this based on your needs
# lsa = TruncatedSVD(n_components=n_components)

# # Fit and transform the embeddings
# all_embeddings_reduced = lsa.fit_transform(all_embeddings)

# # Step 3: Split the Transformed Data
# train_size = df_train_text_embedded_w2v.shape[0]
# df_train_reduced = all_embeddings_reduced[:train_size]
# df_test_reduced = all_embeddings_reduced[train_size:]

# df_train_reduced = pd.DataFrame(np.vstack(df_train_reduced), columns=[f'w2v_{i}' for i in range(n_components)])
# df_test_reduced = pd.DataFrame(np.vstack(df_test_reduced), columns=[f'w2v_{i}' for i in range(n_components)])


In [None]:

keys = [x.lower() for x in ['Paris', 'Python', 'Sunday', 'Tolstoy', 'Twitter', 'bachelor', 'delivery', 'election', 'expensive', 'experience', 'financial', 'food', 'iOS', 'peace', 'release', 'war']]

embedding_clusters = []
word_clusters = []
for word in keys:
    embeddings = []
    words = []
    for similar_word, _ in model_w2v.most_similar(word, topn=30):
        words.append(similar_word)
        embeddings.append(model_w2v[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
from sklearn.manifold import TSNE
import numpy as np

embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)



## 3. Selecting features

In [13]:
df_train.columns

Index(['target', 'text', 'keyword', 'text_raw', 'text_length', 'hashtag_count',
       'mention_count', 'has_url'],
      dtype='object')

In [14]:
features_to_keep = ['target', 'text_length', 'hashtag_count', 'mention_count', 'has_url']

X_train = pd.concat([
    df_train[features_to_keep], 
    df_train_text_embedded_w2v,
    ], axis=1)
X_test = pd.concat([
    df_test[features_to_keep], 
    df_test_text_embedded_w2v,
    ], axis=1)

X_train.dropna(inplace=True)

# extract y_train and y_test here to avoid column name collision with 'target' feature coming from text and keyword embeddings
y_train = X_train['target']
y_test = X_test['target']

X_train.drop(['target'], axis=1, inplace=True)
X_test.drop(['target'], axis=1, inplace=True)

(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((4098, 29), (4098,)), ((2176, 29), (2176,)))

## 4. Modelling

In [None]:
logreg = True
svm = False
xgb = True
random_forest = False

In [16]:
def print_results(y_pred, y_train, y_pred_test, y_test):
    print("Train results")
    print("-----------------------------")
    print("Train accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print(classification_report(y_train, y_pred))
    print(confusion_matrix(y_train, y_pred))

    print()
    print("Test results")
    print("-----------------------------")
    print("Test accuracy: {}".format(accuracy_score(y_test, y_pred_test)))
    print(classification_report(y_test, y_pred_test))
    print(confusion_matrix(y_test, y_pred_test))


### Logistic regression

In [17]:
logreg = LogisticRegression(random_state=42, solver="liblinear")
logreg.fit(X_train, y_train)

y_pred = cross_val_predict(logreg, X_train, y_train, cv=5)  # 5-fold cross-validation
y_pred_test = logreg.predict(X_test)

print_results(y_pred, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.648365056124939
              precision    recall  f1-score   support

         0.0       0.69      0.77      0.73      2507
         1.0       0.56      0.46      0.50      1591

    accuracy                           0.65      4098
   macro avg       0.62      0.61      0.62      4098
weighted avg       0.64      0.65      0.64      4098

[[1929  578]
 [ 863  728]]

Test results
-----------------------------
Test accuracy: 0.6291360294117647
              precision    recall  f1-score   support

           0       0.65      0.74      0.69      1219
           1       0.60      0.49      0.54       957

    accuracy                           0.63      2176
   macro avg       0.62      0.61      0.61      2176
weighted avg       0.62      0.63      0.62      2176

[[904 315]
 [492 465]]


### Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# print shapes
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Making predictions on the test set
y_pred_train = cross_val_predict(clf, X_train, y_train, cv=5)
y_pred_test = clf.predict(X_test)

(4098, 29) (4098,)
(2176, 29) (2176,)


In [19]:
print_results(y_pred_train, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.6281112737920937
              precision    recall  f1-score   support

         0.0       0.67      0.79      0.72      2507
         1.0       0.53      0.37      0.44      1591

    accuracy                           0.63      4098
   macro avg       0.60      0.58      0.58      4098
weighted avg       0.61      0.63      0.61      4098

[[1978  529]
 [ 995  596]]

Test results
-----------------------------
Test accuracy: 0.6194852941176471
              precision    recall  f1-score   support

           0       0.63      0.79      0.70      1219
           1       0.60      0.40      0.48       957

    accuracy                           0.62      2176
   macro avg       0.61      0.60      0.59      2176
weighted avg       0.62      0.62      0.60      2176

[[961 258]
 [570 387]]


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Define the parameter grid
param_distributions = {
    'n_estimators': [20, 65, 100, 150, 200],  # You can increase the range
    'max_depth': [1, 10, 20, 30, 40, 50],    # More options for depth
    'min_samples_split': [2, 5, 10, 15, 20],    # Wider range
    'min_samples_leaf': [1, 2, 4, 6, 8],        # Wider range
    'max_features': ['auto', 'sqrt', 'log2']    # Different options for max features
}

# Create a RandomForestClassifier instance
clf = RandomForestClassifier()

# Create a RandomizedSearchCV instance
random_search = RandomizedSearchCV(estimator=clf, param_distributions=param_distributions, 
                                   n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", random_search.best_params_)

# Use the best estimator to make predictions
best_clf = random_search.best_estimator_
y_pred_train = best_clf.predict(X_train)
y_pred_test = best_clf.predict(X_test)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(
  warn(
  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   2.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   3.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   3.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   3.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   3.0s


  warn(
  warn(


[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   7.3s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   7.4s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   7.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=200; total time=   4.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=200; total time=   5.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=200; total time=   5.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   7.2s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   7.4s
[CV] END max_depth=10, max_features=sqrt

  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   4.3s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   4.4s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  21.2s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  21.0s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  21.7s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  11.6s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  11.2s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  10.9s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  21.1s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  21.7s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  10.8s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  10.7s
[CV] END max_depth=40, max_features=sqrt

  warn(


[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   8.0s


  warn(


[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   8.0s


  warn(


[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   8.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   8.2s


  warn(
  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   3.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   3.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   8.4s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.6s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.1s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.0s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.3s
[CV] END max_depth=30, max_feat

  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  18.2s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  18.2s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  18.4s


  warn(
  warn(
  warn(


[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=200; total time=   7.3s


  warn(


[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=200; total time=   7.2s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  17.9s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=300; total time=   9.5s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  18.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   3.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   3.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   3.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   3.2s
[CV] END max_depth=10, max_feature

  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.1s


  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.2s


  warn(


[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   7.9s


  warn(


[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   8.2s


  warn(


[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   8.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   8.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   8.6s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  14.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  13.9s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  14.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=20, n_estimators=500; total time=   9.8s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=20, n_estimators=500; total time=   9.4s
[CV] END max_depth=None, max_fe

  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=100; total time=   3.6s


  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=100; total time=   3.6s


  warn(


[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  17.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  17.7s


  warn(
  warn(


[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  18.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  17.9s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  18.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=5, n_estimators=300; total time=   4.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=5, n_estimators=300; total time=   4.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=5, n_estimators=300; total time=   4.6s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  17.9s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=20, n_estimators=500; total time=  18.0s
[CV] END max_depth=10, max_features

  warn(
  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=  13.6s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=  13.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=6, min_samples_split=15, n_estimators=500; total time=  15.7s


  warn(
  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=  13.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   6.7s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   8.8s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   8.6s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   9.2s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   8.7s


  warn(


[CV] END max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   9.0s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=  13.0s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=  13.3s


  warn(


[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.4s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.3s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   6.7s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   6.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   7.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.6s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=2, n_estimators=100; total time=   1.5s
[CV] END max_depth=10, max_feature

  warn(


[CV] END max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   5.8s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   5.6s


  warn(
  warn(
  warn(


[CV] END max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=200; total time=   5.8s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=500; total time=  14.6s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=500; total time=  14.7s


  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=500; total time=  14.9s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=500; total time=  15.0s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=500; total time=  14.9s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=200; total time=   6.4s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=200; total time=   6.4s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=200; total time=   6.5s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=200; total time=   6.6s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=400; total time=  14.2s
[CV] END max_depth=40, max_featu

  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=400; total time=  14.8s


  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=100; total time=   3.1s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=400; total time=  14.3s


  warn(
  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=100; total time=   3.1s


  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=100; total time=   3.1s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=100; total time=   3.1s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=100; total time=   3.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=  18.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=  18.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s


  warn(
  warn(
  warn(


[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=  18.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s


  warn(
  warn(


[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   3.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   3.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   3.2s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   3.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   3.2s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.5s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.7s
[CV] END max_depth=50, max_f

  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=   9.9s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=   9.9s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=   9.8s


  warn(
  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=  10.0s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=  10.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   3.3s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   2.8s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=6, min_samples_split=20, n_estimators=100; total time=   1.7s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   2.7s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   2.8s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   2.8s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=6, min_samples_split=20, n_estimators=100; total time=   1.9s
[CV] END max_depth=40, max_featur

  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   6.7s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   6.7s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.7s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.5s


  warn(
  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.5s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=15, n_estimators=500; total time=   8.9s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=15, n_estimators=500; total time=   9.1s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=15, n_estimators=500; total time=   9.2s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=15, n_estimators=500; total time=   9.2s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=6, min_samples_split=15, n_estimators=500; total time=   9.4s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.2s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.1s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   6.6s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   6.5s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   6.4s


  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   6.4s


  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   6.6s


  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  10.2s


  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   9.9s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  10.1s


  warn(
  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  10.1s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  10.1s


  warn(
  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  13.0s


  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  12.9s


  warn(


[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  13.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   5.9s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   5.9s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.0s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  13.0s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.3s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=20, n_estimators=200; total time=   6.2s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  12.9s
[CV] END max_depth=10, max_featu

  warn(
  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.5s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=8, min_samples_split=20, n_estimators=200; total time=   5.5s


  warn(
  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=8, min_samples_split=20, n_estimators=200; total time=   5.7s


  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=8, min_samples_split=20, n_estimators=200; total time=   5.6s


  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.5s


  warn(


[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.6s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   6.7s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   6.8s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   7.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   7.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   7.1s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  10.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  10.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  10.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time=   7.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time=   8.4s
[CV] END max_depth=None, ma

  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=300; total time=   8.9s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=300; total time=   8.8s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=100; total time=   3.1s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=300; total time=   9.1s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=6, min_samples_split=20, n_estimators=100; total time=   3.2s


  warn(
  warn(
  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=300; total time=   8.8s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=300; total time=   9.0s


  warn(


[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   5.2s


  warn(


[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   5.3s


  warn(


[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   5.1s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   5.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   5.2s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  11.9s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  11.7s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  12.0s


  warn(


[CV] END max_depth=30, max_features=auto, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  12.1s


  warn(


[CV] END max_depth=30, max_features=auto, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  12.1s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=400; total time=  12.9s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=400; total time=  12.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=400; total time=  13.0s


  warn(
  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=400; total time=  12.5s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=400; total time=  13.2s


  warn(


[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=500; total time=  16.3s


  warn(


[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=500; total time=  15.8s


  warn(


[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=500; total time=  16.1s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  14.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=500; total time=  16.1s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=500; total time=  15.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=200; total time=   6.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=200; total time=   6.1s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  13.7s


  warn(
  warn(


[CV] END max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  14.0s


  warn(


[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=200; total time=   6.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=200; total time=   6.0s


  warn(
  warn(


[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.2s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.2s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.3s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  14.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=200; total time=   6.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   2.8s
[CV] END max_depth=10, max_featu

  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   3.1s


  warn(


[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   3.3s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   3.0s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   3.0s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   3.1s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   3.3s


  warn(


[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=20, n_estimators=100; total time=   3.0s


  warn(


[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=500; total time=  14.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=500; total time=  14.8s


  warn(
  warn(


[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=500; total time=  14.7s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=500; total time=  15.0s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=500; total time=  14.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=   7.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=   6.8s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=20, n_estimators=400; total time=   6.9s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=  12.3s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=  12.6s
[CV] END max_depth=40, max_feat

  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   9.9s


  warn(


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  12.2s


  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   9.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=20, n_estimators=400; total time=  11.9s


  warn(
  warn(


[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   9.6s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   9.1s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   9.3s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   9.9s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   9.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=8, min_samples_split=15, n_estimators=500; total time=   8.0s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   9.8s
[CV] END max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  10.2s
[CV] END max_depth=50, max_features=aut

  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=8, min_samples_split=15, n_estimators=500; total time=   7.8s


  warn(


[CV] END max_depth=30, max_features=log2, min_samples_leaf=8, min_samples_split=15, n_estimators=500; total time=   8.1s


  warn(


[CV] END max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   3.6s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   3.7s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=8, min_samples_split=15, n_estimators=500; total time=   7.8s


  warn(
  warn(


[CV] END max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   3.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=8, min_samples_split=15, n_estimators=500; total time=   7.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=20, n_estimators=200; total time=   6.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=20, n_estimators=200; total time=   5.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=20, n_estimators=200; total time=   5.8s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=400; total time=  11.4s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=400; total time=  12.2s
[CV] END max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=15, n_estimators=400; total time=  11.9s
[CV] END max_depth=40, max

  warn(


Best parameters found:  {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 50}


In [None]:
print_results(y_pred_train, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.9948755490483162
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      2507
         1.0       1.00      0.99      0.99      1591

    accuracy                           0.99      4098
   macro avg       1.00      0.99      0.99      4098
weighted avg       0.99      0.99      0.99      4098

[[2507    0]
 [  21 1570]]

Test results
-----------------------------
Test accuracy: 0.5896139705882353
              precision    recall  f1-score   support

           0       0.59      0.88      0.71      1219
           1       0.59      0.22      0.32       957

    accuracy                           0.59      2176
   macro avg       0.59      0.55      0.51      2176
weighted avg       0.59      0.59      0.54      2176

[[1070  149]
 [ 744  213]]


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Creating an XGBoost classifier
model = xgb.XGBClassifier(max_depth=1, random_state=0)

# Training the model on the training data
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_test = model.predict(X_test)

# Calculating accuracy on test set
accuracy_test = accuracy_score(y_test, y_pred_test)

# Making predictions on the training set
predictions_train = model.predict(X_train)

# Calculating accuracy on training set
accuracy_train = accuracy_score(y_train, predictions_train)

print("\nTraining Set Metrics:")
print("Accuracy:", accuracy_train)
print("\nClassification Report:")
print(classification_report(y_train, predictions_train))

print("\nTest Set Metrics:")
print("Accuracy:", accuracy_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))




Training Set Metrics:
Accuracy: 0.7103465104929234

Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.84      0.78      2507
         1.0       0.66      0.51      0.58      1591

    accuracy                           0.71      4098
   macro avg       0.70      0.67      0.68      4098
weighted avg       0.70      0.71      0.70      4098


Test Set Metrics:
Accuracy: 0.6245404411764706

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.77      0.70      1219
           1       0.60      0.44      0.51       957

    accuracy                           0.62      2176
   macro avg       0.62      0.60      0.60      2176
weighted avg       0.62      0.62      0.61      2176

