In [None]:
# System
import os
import json

# Config
import warnings

# Web Scraping
import requests
from bs4 import BeautifulSoup

# Data
import numpy as np
import pandas as pd
import math

# View
import matplotlib.pyplot as plt
import matplotlib.patches
import seaborn as sns
from IPython.display import Image, display

# ML
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

# ML Models
from sklearn.dummy import DummyClassifier

from sklearn.ensemble import VotingClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import OneClassSVM
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Utilities Files
def read_csv(name: str, index_label='id') -> pd.DataFrame:
    """Load ``../data/<name>.csv`` into a DataFrame.

    name: file name without the ``.csv`` extension.
    index_label: column used as the DataFrame index (``'id'`` by default).
    """
    csv_path = '../data/' + name + '.csv'
    return pd.read_csv(csv_path, index_col=index_label)


def save_csv(df: pd.DataFrame, name: str, index_label='id'):
    """Write ``df`` to ``../data/<name>.csv``.

    df: frame to store.
    name: file name without the ``.csv`` extension.
    index_label: header written for the index column (``'id'`` by default).
    """
    csv_path = '../data/' + name + '.csv'
    df.to_csv(csv_path, index_label=index_label)

def show_image(name: str):
    """Return an IPython ``Image`` built from ``../images/<name>.png``.

    name: file name without the ``.png`` extension.
    """
    png_path = '../images/' + name + '.png'
    return Image(filename=png_path)

In [None]:
# Development Flags
# SHOW_IN_RELEASE gates the visualization and model-exploration cells below.
SHOW_IN_RELEASE = True
# NOTE(review): SHOW_IN_DEVELOPMENT is not referenced in the visible part of this notebook.
SHOW_IN_DEVELOPMENT = True
# When True, the data-loading cell keeps only a 5% sample of the rows.
MIN_DATA_MODE = False

In [None]:
if SHOW_IN_RELEASE:
    # A value produced inside an `if` block is not auto-displayed by the
    # notebook, so the image has to be rendered explicitly.
    display(show_image('Sorry'))

In [None]:
if SHOW_IN_RELEASE:
    # A value produced inside an `if` block is not auto-displayed by the
    # notebook, so the image has to be rendered explicitly.
    display(show_image('Data_Collectors'))

In [None]:
if SHOW_IN_RELEASE:
    pass
    # Warning suppression is currently switched off; uncomment the next line to silence warnings.
    #warnings.filterwarnings('ignore')

In [None]:
# Load the data

df_original = read_csv('businesses')

# In MIN_DATA_MODE keep only a 5% sample of the rows
if MIN_DATA_MODE:
    df_original = train_test_split(
        df_original, train_size=0.05, random_state=70
    )[0]

# Remove text columns and 'ExpensiveLevel' (df_original itself stays untouched)
df = df_original.drop(
    columns=["Url", "Name", "ExpensiveLevel", "SubCategories", "AttributesHas"]
)

In [None]:
# Split the data to Train and Final Test

target = df["HasExpensiveLevel"]
data = df.drop(columns=["HasExpensiveLevel"])
Y, X = target, data

# 3% of the rows are held out as the final test set
x_train, x_finale_test, y_train, y_finale_test = train_test_split(
    X, Y,
    test_size=0.03,
    random_state=70,
)

In [None]:
# Create the DataFrame of the train data (target column first, index renumbered from 0)

df_train = pd.concat([y_train, x_train], axis=1).reset_index(drop=True)
df = df_train

In [None]:
# Visualization for understanding the data

In [None]:
# Config
# Colors used for the two target classes in legends and pies
legend_colors = ['tab:blue', 'tab:orange']

# Target column name and its display labels
target_column = "HasExpensiveLevel"
target_column_label = "Is Expensive"
target_column_label_true = "Is Expensive"
target_column_label_false = "Is not Expensive"

# 0/1 flag columns
prime_flag_columns = ['Claimed', 'HasWebsite']

# Columns plotted per distinct value
prime_countable_columns = [
    'Stars',
    'SubCategoriesCount',
    'AttributesCount',
    'QuestionsCount',
    'WeeklyBreaks',
    'WeeklyDays',
]

# Columns plotted per section of the sorted data
prime_non_countable_columns = [
    'Reviews',
    'Photos',
    'WeeklyHours',
]

prime_columns = [
    *prime_flag_columns,
    *prime_countable_columns,
    *prime_non_countable_columns,
]

In [None]:
# Describe the target together with the primary feature columns
summary_columns = [target_column, *prime_columns]
df[summary_columns].describe()

# The data is balanced

In [None]:
# Visualization functions for columns

def disply_count_of_flag_column(column_name: str, column_label: str, true_label: str, false_label: str, labels: list):
    """Plot how a 0/1 flag column relates to the target column.

    Draws one figure with three panels: a bar chart with the number of rows
    whose flag is 1 and whose flag is 0, followed by one pie chart per flag
    value showing the share of rows with / without the target set.

    Reads the notebook-level ``df``, ``target_column`` and ``legend_colors``.

    column_name: name of the flag column in ``df`` (compared against 1 and 0).
    column_label: figure title and x-axis label of the bar chart.
    true_label: caption for the rows where the flag equals 1.
    false_label: caption for the rows where the flag equals 0.
    labels: legend entries, paired in order with ``legend_colors``.
    """
    df_sp = df[[target_column, column_name]]

    def split_counts(flag_value):
        # (rows with target set, rows without it, total rows) for one flag value
        group_target = df_sp.loc[df_sp[column_name] == flag_value, target_column]
        has = group_target.sum()
        total = group_target.count()
        return has, total - has, total

    count_true_has, count_true_not_has, count_true = split_counts(1)
    count_false_has, count_false_not_has, count_false = split_counts(0)

    # Main elements
    fig = plt.figure(figsize=[16, 8])
    fig.suptitle(column_label)

    ax_graf = fig.add_subplot(1, 3, 1)
    ax_true = fig.add_subplot(1, 3, 2)
    ax_false = fig.add_subplot(1, 3, 3)

    # Graf
    # Heights are computed per flag value so each bar matches its label
    # (a groupby lists flag 0 before flag 1, i.e. the opposite order).
    ax_graf.bar([true_label, false_label], [count_true, count_false])
    ax_graf.set_xlabel(column_label)
    ax_graf.set_ylabel("Count")

    # Pies
    pies = [
        (ax_true, true_label, [count_true_has, count_true_not_has]),
        (ax_false, false_label, [count_false_has, count_false_not_has]),
    ]
    for ax_pie, title, shares in pies:
        ax_pie.set_title(title)
        if sum(shares) > 0:  # a group without rows cannot be drawn as a pie
            ax_pie.pie(shares,
                       radius=4,
                       center=(4, 4),
                       startangle=90,
                       autopct='%1.0f%%',
                       colors=legend_colors)
        ax_pie.set(xlim=(0, 8), ylim=(0, 8))

    # Legend: one colored patch per label
    handles = [
        matplotlib.patches.Patch(label=label, color=color)
        for label, color in zip(labels, legend_colors)
    ]
    fig.legend(handles=handles, loc="lower right")

    plt.show()


In [None]:
def disply_atter_per_column(inedexs: list, values: list, columns_label: list, min_value: float, max_value: float):
    """Draw one line chart per column, side by side in a single figure.

    The figure is split into ``len(values)`` equal panels; an entry whose
    value is ``None`` leaves its panel empty.

    inedexs: x values for every panel (entries may be ``None``).
    values: y values for every panel (``None`` skips the panel).
    columns_label: x-axis label for every panel.
    min_value, max_value: y-axis limits applied to every panel.

    The y-axis label is the notebook-level ``target_column_label``.
    """
    # Main elements
    fig = plt.figure(figsize=[16, 8])
    panels = len(values)

    # The x values come from the `inedexs` argument; the body used to read a
    # notebook global named `indexs` instead, ignoring what the caller passed.
    for position, (index, value, label) in enumerate(zip(inedexs, values, columns_label), start=1):
        if value is None:
            continue

        # Graf
        ax_graf = fig.add_subplot(1, panels, position)
        ax_graf.plot(index, value)
        ax_graf.set_xlabel(label)
        ax_graf.set_ylabel(target_column_label)
        ax_graf.set_ylim(min_value, max_value)

    plt.show()

In [None]:
def disply_mluti_bars(inedexs: list, values: list, indexs_label, values_label, elements_in_line: int):
    """Draw a grid of bar charts, ``elements_in_line`` charts per row.

    inedexs: x values (bar positions) for every chart.
    values: bar heights for every chart; ``None`` leaves the grid cell empty.
    indexs_label: x-axis label, either one value for all charts or a list
        with one label per chart.
    values_label: y-axis label, either one value for all charts or a list
        with one label per chart.
    elements_in_line: number of charts per grid row.

    The caller's lists are only read, never modified. Nothing is drawn when
    there is no chart to show.
    """
    used_positions = [pos for pos, value in enumerate(values) if value is not None]
    if not used_positions:
        return

    # Only as many rows as are needed to reach the last real chart
    rows = used_positions[-1] // elements_in_line + 1

    # Main elements
    fig = plt.figure(figsize=[16, rows * 8])
    gs = fig.add_gridspec(rows, elements_in_line)

    for position in used_positions:
        line, column = divmod(position, elements_in_line)

        index_label = indexs_label[position] if isinstance(indexs_label, list) else indexs_label
        value_label = values_label[position] if isinstance(values_label, list) else values_label

        # Graf
        ax_graf = fig.add_subplot(gs[line, column])
        ax_graf.bar(inedexs[position], values[position])
        ax_graf.set_xlabel(index_label)
        ax_graf.set_ylabel(value_label)

    plt.show()

In [None]:
# Shows the average of "Has Expensive Level" for every column by value
# Average of "Has Expensive Level" symbolizes how likely it is to have expensive level

if SHOW_IN_RELEASE:

    atter_name = 'mean'
    elements_in_line = 3

    # Work on a filtered copy: the shared config list must not be changed by
    # this cell (re-running it would otherwise keep appending placeholders).
    printed_columns = [column for column in prime_countable_columns if column is not None]

    for start in range(0, len(printed_columns), elements_in_line):
        now_columns = printed_columns[start:start + elements_in_line]
        # Pad only a short last row, so every chart keeps the same width
        now_columns += [None] * (elements_in_line - len(now_columns))

        indexs = []
        values = []
        for column in now_columns:
            if column is None:
                indexs.append(None)
                values.append(None)
            else:
                atter_df = df[[target_column, column]].groupby([column])
                atter_df = getattr(atter_df, atter_name)()

                indexs.append(atter_df.index)
                values.append(atter_df[target_column].values)

        disply_atter_per_column(indexs, values, now_columns, 0.0, 1.0)

In [None]:
## calculates the probability based on sections

if SHOW_IN_RELEASE:

    sections = 20
    atter_name = 'mean'
    elements_in_line = 2

    # Work on a filtered copy: the shared config list must not be changed by
    # this cell (re-running it would otherwise keep appending placeholders).
    printed_columns = [column for column in prime_non_countable_columns if column is not None]

    for start in range(0, len(printed_columns), elements_in_line):
        now_columns = printed_columns[start:start + elements_in_line]
        # Pad only a short last row, so every chart keeps the same width
        now_columns += [None] * (elements_in_line - len(now_columns))

        indexs = []
        values = []
        for column in now_columns:
            if column is None:
                indexs.append(None)
                values.append(None)
            else:
                new_df = df[[target_column, column]].sort_values(column)
                length = len(new_df)
                # Section id by rank in the sorted frame: consecutive blocks of
                # `section_size` rows. At least 1, so a small frame cannot
                # trigger a division by zero.
                section_size = max(length // sections, 1)
                new_df["Sections"] = np.arange(length) // section_size
                atter_df = new_df[[target_column, "Sections"]].groupby(["Sections"])
                atter_df = getattr(atter_df, atter_name)()

                indexs.append(atter_df.index)
                values.append(atter_df[target_column].values)

        disply_atter_per_column(indexs, values, now_columns, 0.0, 1.0)


In [None]:
# Shows Claimed

if SHOW_IN_RELEASE:
    target_labels = [target_column_label_true, target_column_label_false]
    disply_count_of_flag_column(
        column_name='Claimed',
        column_label='Claimed',
        true_label='Claimed',
        false_label='Unclaimed',
        labels=target_labels,
    )

In [None]:
# Has Website

if SHOW_IN_RELEASE:
    target_labels = [target_column_label_true, target_column_label_false]
    disply_count_of_flag_column(
        column_name='HasWebsite',
        column_label='Has Website',
        true_label='Have',
        false_label='Not Have',
        labels=target_labels,
    )

In [None]:
# Shows how much data we have for each value in each feature

if SHOW_IN_RELEASE:

    elements_in_line = 3
    printed_columns = prime_countable_columns

    # One count table per real column (None placeholders are skipped)
    count_frames = [
        df[[target_column, column]].groupby([column]).count()
        for column in printed_columns
        if column is not None
    ]
    indexs = [frame.index for frame in count_frames]
    values = [frame[target_column].values.tolist() for frame in count_frames]

    disply_mluti_bars(indexs, values, printed_columns, 'Count', elements_in_line)
    

In [None]:
# Shows how many rows we have for each value of the non-countable features
if SHOW_IN_RELEASE:

    elements_in_line = 3

    # Work on a filtered copy: the shared config list must not be changed by
    # this cell (re-running it would otherwise keep appending placeholders).
    printed_columns = [column for column in prime_non_countable_columns if column is not None]

    indexs = []
    values = []
    for column in printed_columns:
        count_df = df[[target_column, column]].groupby([column]).count()

        indexs.append(count_df.index)
        values.append(count_df[target_column].values.tolist())

    # disply_mluti_bars lays the charts out in rows by itself, so one call is enough
    if values:
        disply_mluti_bars(indexs, values, printed_columns, 'Count', elements_in_line)

In [None]:
# Shows how much features we lose if we decide to limit the amount of instances a category should have to appear

if SHOW_IN_RELEASE:
    # Number of rows in which every "Cat_" column is set
    is_category = df.columns.str.startswith("Cat_")
    cat_df = df.loc[:, is_category]
    categ = cat_df.sum().sort_values()

    # For every cutoff: how many categories have fewer instances than the cutoff
    limits = range(50)
    remainders = [int((categ.values < limit).sum()) for limit in limits]

    fig, ax_graph = plt.subplots()
    ax_graph.step(remainders, limits)
    # Guide lines span the whole axis instead of a hard-coded x range
    for level in range(0, 50, 5):
        ax_graph.axhline(level, linestyle=":", linewidth=1, color="gray")
    ax_graph.set_ylabel("the cutoff")
    ax_graph.set_xlabel("categories we lose")
    plt.show()

In [None]:
# Shows us how much categories we have for different probabilities
# This is only for lior

if SHOW_IN_RELEASE:
    buckets = 20  # probabilities are rounded to the nearest 1/20

    sub_categories = [column for column in df.columns if column.startswith("Cat_")]
    x0 = np.linspace(0, 1, buckets + 1)
    y0 = np.zeros(buckets + 1)
    x1 = np.linspace(0, 1, buckets + 1)
    y1 = np.zeros(buckets + 1)

    for category in sub_categories:
        mean_df = df[[target_column, category]].groupby([category]).mean()
        # A category may be missing one of the two values (all rows 0 or all
        # rows 1), so both lookups are guarded.
        if 0.0 in mean_df.index:
            y0[int(np.round(mean_df.loc[0.0, target_column] * buckets))] += 1
        if 1.0 in mean_df.index:
            y1[int(np.round(mean_df.loc[1.0, target_column] * buckets))] += 1

    fig, (ax0_graph, ax1_graph) = plt.subplots(2, 1)

    ax0_graph.plot(x0, y0)
    ax0_graph.set_xlabel("probability of is expensive given not in category")
    ax0_graph.set_ylabel("amount of categories")

    ax1_graph.plot(x1, y1)
    ax1_graph.set_xlabel("probability of is expensive given in category")
    ax1_graph.set_ylabel("amount of categories")

    plt.show()

In [None]:
## create collumns which generally represents a bunch of other collumns based on their relative probability
class create_general_collumns:
    """Group indicator columns by their target rate and sum each group.

    ``fit`` puts every column whose name starts with ``starter`` into one of
    ``collumns_number + 1`` buckets, according to the mean of the target over
    the rows where that column equals 1. ``transform`` adds one column per
    bucket (``"general" + starter + <bucket number>``) holding the row-wise
    sum of the bucket's columns.
    """

    def __init__(self, starter, collumns_number):
        """
        starter: name prefix selecting the columns to group.
        collumns_number: highest bucket number (buckets are 0..collumns_number).
        """
        self.collumn_map = np.empty(collumns_number+1, dtype=object)
        # Start with empty buckets so transform() is usable before fit()
        for i in range(self.collumn_map.shape[0]):
            self.collumn_map[i] = []
        self.starter = starter
        self.starts = lambda x: x.startswith(starter)
        self.collumns_number = collumns_number

    def fit(self, df_create: pd.DataFrame, y=None):
        """Assign every matching column of ``df_create`` to a bucket.

        df_create: frame whose columns are grouped.
        y: optional target values, one per row of ``df_create`` (same order).
           When given, the target rate is computed from the rows of
           ``df_create`` only.

        Without ``y`` the target is taken from ``df_create[target_column]``
        if that column exists. Otherwise the notebook-level ``df`` is used,
        which is the previous behavior and is kept for existing callers.
        Columns that never equal 1 are not assigned to any bucket.
        """
        categories = [column for column in df_create.columns if self.starts(column)]

        for i in range(self.collumn_map.shape[0]):
            self.collumn_map[i] = []

        if y is not None:
            source = df_create
            # Re-index positionally so y lines up with df_create row by row
            target = pd.Series(np.asarray(y), index=df_create.index)
        elif target_column in df_create.columns:
            source = df_create
            target = df_create[target_column]
        else:
            # Legacy fallback: statistics come from the notebook-level frame,
            # not from the rows that were passed in.
            source = df
            target = df[target_column]

        for category in categories:
            in_category = source[category] == 1
            if in_category.any():
                rate = target[in_category].mean()
                bucket = int(np.round(rate * self.collumns_number))
                self.collumn_map[bucket].append(category)

    def transform(self, df_create: pd.DataFrame):
        """Return a copy of ``df_create`` with one summed column per bucket added."""
        df_copy = df_create.copy()

        for index, category_class in enumerate(self.collumn_map):
            df_copy["general" + self.starter + str(index)] = df_copy[category_class].sum(axis=1)
        return df_copy

In [None]:
# Time enums

# Day number (as a string, "1" = Sunday) -> day name
week_days = {
    str(number): day_name
    for number, day_name in enumerate(
        (
            "Sunday",
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
        ),
        start=1,
    )
}

def _half_hour_labels(slot_count: int, suffix: str = "") -> list:
    """Return 12-hour clock labels ('h:mm AM' / 'h:mm PM') for half-hour slots.

    Slot 0 is '12:00 AM', slot 1 is '12:30 AM', slot 2 is '1:00 AM', and so on.

    slot_count: number of consecutive slots, starting at midnight.
    suffix: text appended to every label.
    """
    labels = []
    for slot in range(slot_count):
        hour24, half = divmod(slot % 48, 2)
        hour12 = hour24 % 12 or 12
        period = "AM" if hour24 < 12 else "PM"
        minutes = "30" if half else "00"
        labels.append(f"{hour12}:{minutes} {period}{suffix}")
    return labels


# Opening-time labels: the 48 half-hour slots of one day
time_open_select = _half_hour_labels(48)

# Closing-time labels: the 48 slots of the day followed by 14 more slots
# (12:00 AM .. 6:30 AM) written with a trailing space.
# NOTE(review): the trailing space presumably marks next-day times - confirm.
# One of those entries ('12:30 AM') used to lack the space; it is now
# consistent with the other thirteen.
time_end_select = _half_hour_labels(48) + _half_hour_labels(14, suffix=" ")


In [None]:
# Show the opening hour per day

if SHOW_IN_RELEASE:
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))

    ax.set_ylabel("is expensive")
    ax.set_xlabel("Opening Hour")

    # One line per week day: mean of the target for every opening hour
    for day_key, day_name in week_days.items():
        hour_column = "OpenHour" + day_key
        mean_df = df[[hour_column, target_column]].groupby([hour_column]).mean()
        ax.plot(mean_df.index, mean_df.values, label=day_name)

    # Label every 6th half-hour slot with its clock time
    tick_positions = range(0, 48, 6)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels([time_open_select[position] for position in tick_positions])

    ax.set_ylim(-0.1, 1.1)
    ax.legend()

    plt.show()

In [None]:
# Show the Daily Hours per day

if SHOW_IN_RELEASE:
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))

    ax.set_ylabel("is expensive")
    ax.set_xlabel("Daily Hours")

    # One line per week day: mean of the target for every daily hour count
    for day_key, day_name in week_days.items():
        count_column = "CountHour" + day_key
        mean_df = df[[count_column, target_column]].groupby([count_column]).mean()
        ax.plot(mean_df.index, mean_df.values, label=day_name)

    ax.set_ylim(-0.1, 1.1)
    ax.legend()

    plt.show()

In [None]:
# Show the ending hour per day

if SHOW_IN_RELEASE:
    fig = plt.figure(figsize=(16, 8 * 4))

    # Label every 6th half-hour slot with its clock time
    tick_positions = range(0, 61, 6)
    tick_labels = [time_end_select[position] for position in tick_positions]

    # One panel per week day in a 5 x 2 grid
    for panel, (day_key, day_name) in enumerate(week_days.items(), start=1):
        end_column = "EndHour" + day_key
        mean_df = df[[end_column, target_column]].groupby([end_column]).mean()

        ax_end = fig.add_subplot(5, 2, panel)
        ax_end.set_title(day_name)
        ax_end.set_ylabel("is expensive")
        ax_end.set_xlabel("Ending Hour")
        ax_end.plot(mean_df.index, mean_df.values)
        ax_end.set_xticks(tick_positions)
        ax_end.set_xticklabels(tick_labels)
        ax_end.set_ylim(-0.1, 1.1)

    plt.tight_layout()
    plt.show()

In [None]:
# Split the train data into a sub-train and a sub-test set

(
    x_sub_train,
    x_sub_test,
    y_sub_train,
    y_sub_test,
) = train_test_split(x_train, y_train, random_state=42)

In [None]:
# Create a simple model based on KNN

if SHOW_IN_RELEASE:
    # fit() returns the estimator itself, so construction and training chain
    knn = KNeighborsClassifier().fit(x_sub_train, y_sub_train)

    score = knn.score(x_sub_test, y_sub_test)
    print("Score: ", score)

# Score:  0.78

In [None]:
# Create model zero (dummy model) as a baseline

if SHOW_IN_RELEASE:
    # fit() returns the estimator itself, so construction and training chain
    dummy_modal = DummyClassifier().fit(x_sub_train, y_sub_train)

    score = dummy_modal.score(x_sub_test, y_sub_test)
    print("Dummy Modal Score: ", score)

# Dummy Modal Score:  0.65

In [None]:
# Empty results table; try_multi_models appends one row per tried model
df_models = pd.DataFrame(
    {column: [] for column in ('Name', 'Score', 'Best Params')}
)

In [None]:
# Cross-validation setting used by try_multi_models when a model entry has no 'cv' key.
DEFAULT_CV = 5
# random_state passed to the model constructor by try_multi_models
# (unless the model entry sets 'disable_random_state').
DEFAULT_RANDOM_STATE = 70

In [None]:
def try_multi_models(models, scale_me: bool):
    """Grid-search every model description in ``models`` and record the scores.

    For each entry a model is built, tuned with ``GridSearchCV`` on the
    notebook-level sub-train split and scored on the sub-test split. One row
    per entry (name, score, best parameters as JSON) is appended to the
    notebook-level ``df_models``. An entry that raises is reported with
    ``print`` and keeps an empty score; the remaining entries still run.

    models: list of model_param dicts with the keys
        name:   name of the new model
        model:  model class from sklearn
        params: param_grid for the grid search
      and the optional keys
        disable_random_state: build the model without a random_state argument
        cv:           cv for the grid search (DEFAULT_CV when absent)
        to_scale:     flag if to scale the data (``scale_me`` when absent)
        preprocessor: function called with (x_train, y_train, x_test, y_test)
                      that returns the four replacements
        selected:     flag if to add the grid-search results to the return list
    scale_me: default for ``to_scale`` (old parameter).

    Returns a list with one DataFrame per ``selected`` entry: the tried
    parameter combinations and their mean CV score in column "Accuracy",
    best first.
    """
    global df_models

    ret = []

    model_count = len(models)
    df_models_add = pd.DataFrame({
        column: np.full(model_count, None)
        for column in ('Name', 'Score', 'Best Params')
    })

    for row, model_params in enumerate(models):
        name = model_params['name']
        df_models_add.at[row, 'Name'] = name
        print('Work on ', name)
        try:
            model_class = model_params['model']
            if model_params.get('disable_random_state'):
                model = model_class()
            else:
                model = model_class(random_state=DEFAULT_RANDOM_STATE)

            cv = model_params.get('cv', DEFAULT_CV)
            to_scale = model_params.get('to_scale', scale_me)

            x_sub_train_now, y_sub_train_now = x_sub_train, y_sub_train
            x_sub_test_now, y_sub_test_now = x_sub_test, y_sub_test

            if to_scale:
                # The scaler is fitted on the sub-train features only
                scaler = StandardScaler()
                x_sub_train_now = scaler.fit_transform(X=x_sub_train_now)
                x_sub_test_now = scaler.transform(x_sub_test_now)

            if 'preprocessor' in model_params:
                preprocessor = model_params['preprocessor']
                (
                    x_sub_train_now,
                    y_sub_train_now,
                    x_sub_test_now,
                    y_sub_test_now,
                ) = preprocessor(x_sub_train_now, y_sub_train_now, x_sub_test_now, y_sub_test_now)

            param_grid = model_params['params']

            # cv=None is GridSearchCV's own default, so one call covers both cases
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)

            print('Fit ', name)
            grid_search.fit(x_sub_train_now, y_sub_train_now)

            score = grid_search.score(x_sub_test_now, y_sub_test_now)
            print(name, " Score: ", score)

            df_models_add.at[row, 'Score'] = score
            df_models_add.at[row, 'Best Params'] = json.dumps(grid_search.best_params_)

            if model_params.get('selected'):
                cv_results = grid_search.cv_results_
                res_as_df = pd.concat(
                    [
                        pd.DataFrame(cv_results["params"]),
                        pd.DataFrame(cv_results["mean_test_score"], columns=["Accuracy"]),
                    ],
                    axis=1,
                ).sort_values('Accuracy', ascending=False)
                ret.append(res_as_df)

        except Exception as ex:
            print(name, " Failed!", ex)

    df_models = pd.concat([df_models, df_models_add]).reset_index(drop=True)

    return ret

In [None]:
# First try: default settings for several model families

if SHOW_IN_RELEASE:
    models = [
        dict(
            name='KNeighborsClassifier',
            model=KNeighborsClassifier,
            params={},
            disable_random_state=True,
        ),
        dict(
            name='LogisticRegression',
            model=LogisticRegression,
            params={'max_iter': [100, 200]},
        ),
        dict(
            name='Lasso',
            model=Lasso,
            params={},
        ),
        # SVC and SVR (params={}, disable_random_state=True) were also tried
        # and are currently switched off.
        dict(
            name='LinearSVC',
            model=LinearSVC,
            params={},
        ),
        dict(
            name='RandomForestClassifier',
            model=RandomForestClassifier,
            params={},
        ),
    ]

    try_multi_models(models, False)

# RandomForestClassifier Score:  0.8541162227602905
# LinearSVC              Score:  0.8243038740920097
# LogisticRegression     Score:  0.8193099273607748
# KNeighborsClassifier   Score:  0.7860169491525424
# SVR                    Score:  0.3687542760469187
# Lasso                  Score:  0.10050948422148354

In [None]:
# Trying other models

if SHOW_IN_RELEASE:
    models = [
        dict(
            name='RandomForestRegressor',
            model=RandomForestRegressor,
            params={},
        ),
    ]

    try_multi_models(models, False)
# RandomForestRegressor  Score:  0.5371579181722024

In [None]:
# Display the model results collected so far
df_models

In [None]:
# Best score first
df_models = df_models.sort_values('Score', ascending=False)
df_models

In [None]:
# The best score comes from the Random Forest Classifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Learning about VotingClassifier

if SHOW_IN_RELEASE:
    # The forest is seeded so that re-running the notebook gives the same score
    vs = VotingClassifier(estimators=[
        ('RandomForestClassifier', RandomForestClassifier(random_state=DEFAULT_RANDOM_STATE)),
        ('LogisticRegression', LogisticRegression())
    ])

    vs.fit(x_sub_train, y_sub_train)

    score = vs.score(x_sub_test, y_sub_test)
    print("Voting Classifier Score: ", score)

# Voting Classifier Score:  0.8221852300242131 (recorded from an unseeded run)

# Not good, Random Forest gives 0.85 alone

In [None]:
# Trying RandomForestClassifier with different numbers of estimators

ret = []
if SHOW_IN_RELEASE:
    models = [
        {
            'name': 'RandomForestClassifier V2',
            'model': RandomForestClassifier,
            'params': {
                'n_estimators': range(100, 201, 100)
            },
            # 'selected' makes try_multi_models return the grid-search table
            'selected': True
        }
    ]

    # Keep the returned table; it used to be discarded, so the line below
    # only ever showed a placeholder string.
    ret = try_multi_models(models, False)

# n_estimators does not change much,
# 200 estimators is the best

# ret is empty when the cell is skipped or the model failed
ret[0] if ret else None

In [None]:
# Same model families on scaled data, with a wide n_estimators grid for the forest
ret = []
if SHOW_IN_RELEASE:
    models = [
        {
            'name': 'KNeighborsClassifier Scaler',
            'model': KNeighborsClassifier,
            'params': {},
            'disable_random_state': True
        }, {
            'name': 'LogisticRegression Scaler',
            'model': LogisticRegression,
            'params': {
                'max_iter': [100, 200]
            }
        }, {
            'name': 'LinearSVC Scaler',
            'model': LinearSVC,
            'params': {}
        }, {
            'name': 'RandomForestClassifier Scaler',
            'model': RandomForestClassifier,
            'params': {
                'n_estimators': range(100, 851, 50)
            },
            'selected': True
        }
    ]

    ret = try_multi_models(models, True)

# try_multi_models reports a failing model instead of raising, so ret can be
# empty; it is also empty when the cell is skipped.
ret[0] if ret else None

In [None]:
# Scaled data again, with the parameters fixed to the chosen values
ret = []
if SHOW_IN_RELEASE:
    models = [
        {
            'name': 'LogisticRegression Scaler',
            'model': LogisticRegression,
            'params': {
                'max_iter': [100]
            }
        }, {
            'name': 'LinearSVC Scaler',
            'model': LinearSVC,
            'params': {}
        }, {
            'name': 'RandomForestClassifier Scaler',
            'model': RandomForestClassifier,
            'params': {
                'n_estimators': [200]
            },
            'selected': True
        }
    ]

    ret = try_multi_models(models, True)

# try_multi_models reports a failing model instead of raising, so ret can be
# empty; it is also empty when the cell is skipped.
ret[0] if ret else None

In [None]:
# Best score first
df_models = df_models.sort_values('Score', ascending=False)
df_models

In [None]:
# Counter appended to the model names built by learn_fit(); learn_fit() increments it before use.
version_model = 1

def learn_load():
    """Reset the notebook-level sub-train / sub-test split.

    Splits ``x_train`` / ``y_train`` again with ``random_state=42`` and stores
    the four parts in ``x_sub_train``, ``x_sub_test``, ``y_sub_train`` and
    ``y_sub_test``.
    """
    global x_sub_train, x_sub_test, y_sub_train, y_sub_test
    fresh_split = train_test_split(x_train, y_train, random_state=42)
    x_sub_train, x_sub_test, y_sub_train, y_sub_test = fresh_split

def learn_fit():
    """Run the next numbered round of the three baseline models.

    Increments the global ``version_model`` counter, uses it as a suffix for
    every model name, and returns the result of ``try_multi_models``.
    """
    global version_model
    version_model += 1

    random_forest = {
        'name': f'RandomForestClassifier {version_model}',
        'model': RandomForestClassifier,
        'params': {'n_estimators': [200]},
        'selected': True,
    }
    logistic_regression = {
        'name': f'LogisticRegression {version_model}',
        'model': LogisticRegression,
        'params': {'max_iter': [100]},
        'to_scale': True,
    }
    linear_svc = {
        'name': f'LinearSVC {version_model}',
        'model': LinearSVC,
        'params': {},
        'to_scale': True,
    }

    return try_multi_models([random_forest, logistic_regression, linear_svc], False)


In [None]:
# Experiment: pass the sub-train / sub-test features through
# create_general_collumns("Cat", 15) before re-running the baseline models.
if SHOW_IN_RELEASE:
    # Start from a fresh split so earlier experiments do not leak into this one.
    learn_load()

    # NOTE(review): the transformer is fitted on the features alone here, whereas
    # other cells fit it on pd.concat([x_sub_train, y_sub_train], axis=1) -
    # confirm which input create_general_collumns expects.
    create_coll = create_general_collumns("Cat", 15)
    create_coll.fit(x_sub_train)
    # Fitted on the sub-train set only, then applied to both sets.
    x_sub_train = create_coll.transform(x_sub_train)
    x_sub_test = create_coll.transform(x_sub_test)

    learn_fit()
    # Highest score first, sorted in place.
    df_models.sort_values('Score', ascending=False, inplace=True)

df_models

In [None]:
# Order the recorded models by 'Score', highest first (in place), then display them.
df_models.sort_values(by='Score', ascending=False, inplace=True)
df_models

In [None]:
# Experiment: drop every Cat_, ExtraAt_ and At_ column, then re-run the baseline models.
if SHOW_IN_RELEASE:
    learn_load()

    def check(x):
        """Return True for a column to keep.

        Rejects 'HasExpensiveLevel' and any name starting with "Cat_",
        "ExtraAt_" or "At_".
        """
        excluded_prefixes = ("Cat_", "ExtraAt_", "At_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    fi = [column for column in df.columns if check(column)]

    x_sub_train = x_sub_train[fi]
    x_sub_test = x_sub_test[fi]

    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: drop every Cat_ and ExtraAt_ column (At_ columns stay), then re-run the models.
if SHOW_IN_RELEASE:
    learn_load()

    def check(x):
        """Return True for a column to keep.

        Rejects 'HasExpensiveLevel' and any name starting with "Cat_" or
        "ExtraAt_".
        """
        excluded_prefixes = ("Cat_", "ExtraAt_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    fi = [column for column in df.columns if check(column)]

    x_sub_train = x_sub_train[fi]
    x_sub_test = x_sub_test[fi]

    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: sweep LinearSVC's max_iter from 800 to 2000 (step 100) on the
# feature set without Cat_ / ExtraAt_ columns.
if SHOW_IN_RELEASE:
    def learn_fit():
        """Bump the global run counter and evaluate LinearSVC over the max_iter grid."""
        global version_model
        version_model += 1

        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': list(range(800, 2001, 100))},
            'to_scale': True,
            'cv': 5,
        }
        return try_multi_models([linear_svc], False)

    def check(x):
        """Return True for a column to keep (not 'HasExpensiveLevel', not Cat_* / ExtraAt_*)."""
        excluded_prefixes = ("Cat_", "ExtraAt_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    learn_load()

    fi = [column for column in df.columns if check(column)]

    x_sub_train = x_sub_train[fi]
    x_sub_test = x_sub_test[fi]

    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: give every model its own preprocessor - column filtering for the
# random forest, standard scaling for the two linear models.
if SHOW_IN_RELEASE:
    def check(x):
        """Return True for a column to keep (not 'HasExpensiveLevel', not Cat_* / ExtraAt_*)."""
        excluded_prefixes = ("Cat_", "ExtraAt_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    def _standardize(x_fit, x_apply):
        """Fit a StandardScaler on x_fit; return x_fit and x_apply transformed by it."""
        scaler = StandardScaler()
        return scaler.fit_transform(X=x_fit), scaler.transform(x_apply)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by check."""
        kept_columns = [column for column in df.columns if check(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Standard-scale the features (scaler fitted on the sub-train set only)."""
        scaled_train, scaled_test = _standardize(x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Standard-scale the features (scaler fitted on the sub-train set only)."""
        scaled_train, scaled_test = _standardize(x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'cv': 5,
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: for the linear models, run create_general_collumns("Cat", 15),
# drop the Cat_ columns and standard-scale; the random forest keeps its plain
# column filter.
if SHOW_IN_RELEASE:
    def remove_most_of_collumns(x):
        """Return True for a column to keep (not 'HasExpensiveLevel', not Cat_* / ExtraAt_*)."""
        excluded_prefixes = ("Cat_", "ExtraAt_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    def _generalize_select_scale(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Shared pipeline of the two linear-model preprocessors."""
        # Fitted on features + target of the sub-train set, applied to both sets.
        generalizer = create_general_collumns("Cat", 15)
        generalizer.fit(pd.concat([x_sub_train, y_sub_train], axis=1))
        train_features = generalizer.transform(x_sub_train)
        test_features = generalizer.transform(x_sub_test)

        # NOTE(review): the kept columns come from df.columns, so any column that
        # transform() adds would not be selected - confirm this is intended.
        keep = remove_startswith_collumns(['Cat_'])
        kept_columns = [column for column in df.columns if keep(column)]
        train_features = train_features[kept_columns]
        test_features = test_features[kept_columns]

        scaler = StandardScaler()
        train_features = scaler.fit_transform(X=train_features)
        test_features = scaler.transform(test_features)

        return [train_features, y_sub_train, test_features, y_sub_test]

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_most_of_collumns."""
        kept_columns = [column for column in df.columns if remove_most_of_collumns(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Generalise "Cat" columns, drop Cat_* columns, then standard-scale."""
        return _generalize_select_scale(x_sub_train, y_sub_train, x_sub_test, y_sub_test)

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Generalise "Cat" columns, drop Cat_* columns, then standard-scale."""
        return _generalize_select_scale(x_sub_train, y_sub_train, x_sub_test, y_sub_test)

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'cv': 5,
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: for the linear models, run create_general_collumns("ExtraAt_", 15),
# drop the ExtraAt_ columns and standard-scale; the random forest keeps its
# plain column filter.
if SHOW_IN_RELEASE:
    def remove_most_of_collumns(x):
        """Return True for a column to keep (not 'HasExpensiveLevel', not Cat_* / ExtraAt_*)."""
        excluded_prefixes = ("Cat_", "ExtraAt_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    def _generalize_select_scale(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Shared pipeline of the two linear-model preprocessors."""
        # Fitted on features + target of the sub-train set, applied to both sets.
        generalizer = create_general_collumns("ExtraAt_", 15)
        generalizer.fit(pd.concat([x_sub_train, y_sub_train], axis=1))
        train_features = generalizer.transform(x_sub_train)
        test_features = generalizer.transform(x_sub_test)

        # NOTE(review): the kept columns come from df.columns, so any column that
        # transform() adds would not be selected - confirm this is intended.
        keep = remove_startswith_collumns(['ExtraAt_'])
        kept_columns = [column for column in df.columns if keep(column)]
        train_features = train_features[kept_columns]
        test_features = test_features[kept_columns]

        scaler = StandardScaler()
        train_features = scaler.fit_transform(X=train_features)
        test_features = scaler.transform(test_features)

        return [train_features, y_sub_train, test_features, y_sub_test]

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_most_of_collumns."""
        kept_columns = [column for column in df.columns if remove_most_of_collumns(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Generalise "ExtraAt_" columns, drop ExtraAt_* columns, then standard-scale."""
        return _generalize_select_scale(x_sub_train, y_sub_train, x_sub_test, y_sub_test)

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Generalise "ExtraAt_" columns, drop ExtraAt_* columns, then standard-scale."""
        return _generalize_select_scale(x_sub_train, y_sub_train, x_sub_test, y_sub_test)

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'cv': 5,
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: sweep the second argument of create_general_collumns("ExtraAt_", ...)
# over 10, 13, ..., 28 for LogisticRegression and LinearSVC.
if SHOW_IN_RELEASE:
    def remove_most_of_collumns(x):
        """Return True for a column to keep (not 'HasExpensiveLevel', not Cat_* / ExtraAt_*)."""
        excluded_prefixes = ("Cat_", "ExtraAt_")
        return x != 'HasExpensiveLevel' and not x.startswith(excluded_prefixes)

    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_most_of_collumns."""
        kept_columns = [column for column in df.columns if remove_most_of_collumns(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def _make_extra_at_preprocessor(param):
        """Return a preprocessor that runs create_general_collumns("ExtraAt_", param)."""
        def preprocess(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
            # Fitted on features + target of the sub-train set, applied to both sets.
            generalizer = create_general_collumns("ExtraAt_", param)
            generalizer.fit(pd.concat([x_sub_train, y_sub_train], axis=1))
            train_features = generalizer.transform(x_sub_train)
            test_features = generalizer.transform(x_sub_test)

            keep = remove_startswith_collumns(['ExtraAt_'])
            kept_columns = [column for column in df.columns if keep(column)]
            train_features = train_features[kept_columns]
            test_features = test_features[kept_columns]

            scaler = StandardScaler()
            train_features = scaler.fit_transform(X=train_features)
            test_features = scaler.transform(test_features)

            return [train_features, y_sub_train, test_features, y_sub_test]

        return preprocess

    def create_pre_LogisticRegression(param):
        """Preprocessor factory for LogisticRegression, parameterised by ``param``."""
        return _make_extra_at_preprocessor(param)

    def create_pre_LinearSVC(param):
        """Preprocessor factory for LinearSVC, parameterised by ``param``."""
        return _make_extra_at_preprocessor(param)

    def create_models(params: list):
        """Return one LogisticRegression and one LinearSVC model spec per value in ``params``.

        Model names carry the current global ``version_model`` and the
        parameter value, e.g. ``'LinearSVC 7 - 13'``.
        """
        models = []

        for param in params:
            label = str(version_model) + ' - ' + str(param)
            models.extend([
                {
                    'name': 'LogisticRegression ' + label,
                    'model': LogisticRegression,
                    'params': {'max_iter': [200]},
                    'preprocessor': create_pre_LogisticRegression(param),
                },
                {
                    'name': 'LinearSVC ' + label,
                    'model': LinearSVC,
                    'params': {'max_iter': [800]},
                    'cv': 5,
                    'preprocessor': create_pre_LinearSVC(param),
                },
            ])

        return models

    def learn_fit():
        """Bump the global run counter and evaluate the models for every swept value."""
        global version_model
        version_model += 1
        swept_values = list(range(10, 31, 3))
        return try_multi_models(create_models(swept_values), False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)

df_models

In [None]:
# Experiment: additionally drop the AttributesCount column(s) for all three models.
if SHOW_IN_RELEASE:
    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    def remove_most_of_collumns(x):
        """Column filter used for the random forest."""
        keep = remove_startswith_collumns(['AttributesCount', 'Cat_', 'ExtraAt_'])
        return keep(x)

    def check2(x):
        """Column filter used for the two linear models."""
        keep = remove_startswith_collumns(['AttributesCount'])
        return keep(x)

    def _select_and_scale(keep_column, x_fit, x_apply):
        """Select the df columns accepted by keep_column, then standard-scale (fit on x_fit)."""
        kept_columns = [column for column in df.columns if keep_column(column)]
        x_fit = x_fit[kept_columns]
        x_apply = x_apply[kept_columns]

        scaler = StandardScaler()
        return scaler.fit_transform(X=x_fit), scaler.transform(x_apply)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_most_of_collumns."""
        kept_columns = [column for column in df.columns if remove_most_of_collumns(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with check2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(check2, x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with check2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(check2, x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'cv': 5,
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)
    # Discard rows that have no score.
    df_models = df_models.dropna(subset=['Score'])

df_models

In [None]:
# Experiment: additionally drop the AttributesCount and WeeklyBreaks column(s)
# for all three models.
if SHOW_IN_RELEASE:
    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    def remove_most_of_collumns(x):
        """Column filter used for the random forest."""
        keep = remove_startswith_collumns(['AttributesCount', 'WeeklyBreaks', 'Cat_', 'ExtraAt_'])
        return keep(x)

    def check2(x):
        """Column filter used for the two linear models."""
        keep = remove_startswith_collumns(['AttributesCount', 'WeeklyBreaks'])
        return keep(x)

    def _select_and_scale(keep_column, x_fit, x_apply):
        """Select the df columns accepted by keep_column, then standard-scale (fit on x_fit)."""
        kept_columns = [column for column in df.columns if keep_column(column)]
        x_fit = x_fit[kept_columns]
        x_apply = x_apply[kept_columns]

        scaler = StandardScaler()
        return scaler.fit_transform(X=x_fit), scaler.transform(x_apply)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_most_of_collumns."""
        kept_columns = [column for column in df.columns if remove_most_of_collumns(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with check2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(check2, x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with check2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(check2, x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'cv': 5,
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)
    # Discard rows that have no score.
    df_models = df_models.dropna(subset=['Score'])

df_models

In [None]:
# Experiment: additionally drop the AttributesCount, WeeklyBreaks and
# QuestionsCount column(s) for all three models.
if SHOW_IN_RELEASE:
    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    def remove_most_of_collumns(x):
        """Column filter used for the random forest."""
        keep = remove_startswith_collumns(
            ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'Cat_', 'ExtraAt_']
        )
        return keep(x)

    def check2(x):
        """Column filter used for the two linear models."""
        keep = remove_startswith_collumns(['AttributesCount', 'WeeklyBreaks', 'QuestionsCount'])
        return keep(x)

    def _select_and_scale(keep_column, x_fit, x_apply):
        """Select the df columns accepted by keep_column, then standard-scale (fit on x_fit)."""
        kept_columns = [column for column in df.columns if keep_column(column)]
        x_fit = x_fit[kept_columns]
        x_apply = x_apply[kept_columns]

        scaler = StandardScaler()
        return scaler.fit_transform(X=x_fit), scaler.transform(x_apply)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_most_of_collumns."""
        kept_columns = [column for column in df.columns if remove_most_of_collumns(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with check2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(check2, x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with check2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(check2, x_sub_train, x_sub_test)
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'cv': 5,
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)
    # Discard rows that have no score.
    df_models = df_models.dropna(subset=['Score'])

df_models

In [None]:
# Today we created a wild function

In [None]:
# df_models.head(3)
# RandomForestClassifier: 0.847155 {"n_estimators": 200}
# LogisticRegression :    0.841404 {"max_iter": 200}
# LinearSVC:              0.833384 {"max_iter": 800}

In [None]:
# Experiment: drop the shared sp_collumns prefixes for every model (plus
# SubCategoriesCount, Cat_ and ExtraAt_ for the random forest).
if SHOW_IN_RELEASE:
    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    # Column-name prefixes removed for all three models.
    sp_collumns = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']

    def remove_collumns_for_model_1(x):
        """Column filter for the random forest."""
        excluded = sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_']
        return remove_startswith_collumns(excluded)(x)

    def remove_collumns_for_model_2(x):
        """Column filter for LogisticRegression (shared prefixes only)."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def remove_collumns_for_model_3(x):
        """Column filter for LinearSVC (shared prefixes only)."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def _select_and_scale(keep_column, x_fit, x_apply):
        """Select the df columns accepted by keep_column, then standard-scale (fit on x_fit)."""
        kept_columns = [column for column in df.columns if keep_column(column)]
        x_fit = x_fit[kept_columns]
        x_apply = x_apply[kept_columns]

        scaler = StandardScaler()
        return scaler.fit_transform(X=x_fit), scaler.transform(x_apply)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_collumns_for_model_1."""
        kept_columns = [column for column in df.columns if remove_collumns_for_model_1(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with remove_collumns_for_model_2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(
            remove_collumns_for_model_2, x_sub_train, x_sub_test
        )
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with remove_collumns_for_model_3, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(
            remove_collumns_for_model_3, x_sub_train, x_sub_test
        )
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)
    # Discard rows that have no score.
    df_models = df_models.dropna(subset=['Score'])

# Important features: Claimed, Stars, Reviews, Photos
df_models

In [None]:
# Experiment: drop the shared sp_collumns prefixes for every model (plus
# SubCategoriesCount, Cat_ and ExtraAt_ for the random forest).
# NOTE(review): running this cell adds another numbered round with the same
# configuration to df_models - confirm the repetition is wanted.
if SHOW_IN_RELEASE:
    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def keep(x):
            if x == 'HasExpensiveLevel':
                return False
            return not any(x.startswith(text) for text in texts)
        return keep

    # Column-name prefixes removed for all three models.
    sp_collumns = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']

    def remove_collumns_for_model_1(x):
        """Column filter for the random forest."""
        excluded = sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_']
        return remove_startswith_collumns(excluded)(x)

    def remove_collumns_for_model_2(x):
        """Column filter for LogisticRegression (shared prefixes only)."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def remove_collumns_for_model_3(x):
        """Column filter for LinearSVC (shared prefixes only)."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def _select_and_scale(keep_column, x_fit, x_apply):
        """Select the df columns accepted by keep_column, then standard-scale (fit on x_fit)."""
        kept_columns = [column for column in df.columns if keep_column(column)]
        x_fit = x_fit[kept_columns]
        x_apply = x_apply[kept_columns]

        scaler = StandardScaler()
        return scaler.fit_transform(X=x_fit), scaler.transform(x_apply)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature sets to the df columns accepted by remove_collumns_for_model_1."""
        kept_columns = [column for column in df.columns if remove_collumns_for_model_1(column)]
        return [x_sub_train[kept_columns], y_sub_train, x_sub_test[kept_columns], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with remove_collumns_for_model_2, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(
            remove_collumns_for_model_2, x_sub_train, x_sub_test
        )
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Filter columns with remove_collumns_for_model_3, then standard-scale."""
        scaled_train, scaled_test = _select_and_scale(
            remove_collumns_for_model_3, x_sub_train, x_sub_test
        )
        return [scaled_train, y_sub_train, scaled_test, y_sub_test]

    def learn_fit():
        """Bump the global run counter and evaluate the three models, each with its preprocessor."""
        global version_model
        version_model += 1

        random_forest = {
            'name': f'RandomForestClassifier {version_model}',
            'model': RandomForestClassifier,
            'params': {'n_estimators': [200]},
            'preprocessor': pre_RandomForest,
        }
        logistic_regression = {
            'name': f'LogisticRegression {version_model}',
            'model': LogisticRegression,
            'params': {'max_iter': [100, 200]},
            'preprocessor': pre_LogisticRegression,
        }
        linear_svc = {
            'name': f'LinearSVC {version_model}',
            'model': LinearSVC,
            'params': {'max_iter': [800]},
            'preprocessor': pre_LinearSVC,
        }
        return try_multi_models([random_forest, logistic_regression, linear_svc], False)

    learn_load()
    learn_fit()
    df_models.sort_values(by='Score', ascending=False, inplace=True)
    # Discard rows that have no score.
    df_models = df_models.dropna(subset=['Score'])

# Important features: Claimed, Stars, Reviews, Photos
df_models

In [None]:
# Display df_models, the table the surrounding cells sort by 'Score'.
df_models

In [None]:
# Fallback feature count, used when the guarded computation below is skipped.
n_features = 48
if SHOW_IN_RELEASE:
    def remove_startswith_collumns(texts: list):
        """Build a column predicate.

        The predicate returns False for 'HasExpensiveLevel' and for any name
        that starts with one of ``texts``; True otherwise.
        """
        def ret(x):
            if x == 'HasExpensiveLevel':
                return False
            for text in texts:
                if x.startswith(text):
                    return False

            return True
        return ret

    def remove_collumns_for_model_RandomForest(x):
        """Column filter matching the one the random-forest preprocessor applies."""
        # sp_collumns is not defined in this cell; it must already exist from an
        # earlier cell.
        return remove_startswith_collumns(sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_'])(x)

    def count_RandomForest_features():
        """Return how many df columns survive the random-forest column filter.

        Deliberately NOT named pre_RandomForest: a zero-argument function under
        that name would replace the four-argument preprocessor that the
        learn_fit model specs reference.
        """
        features = list(filter(remove_collumns_for_model_RandomForest, df.columns))

        return len(features)

    n_features = count_RandomForest_features()

# Show the feature count together with int(sqrt(n_features)). Only the last
# expression of a cell is displayed, so a separate bare `n_features` line
# above it would never be shown.
n_features, int(math.sqrt(n_features))

In [None]:
if SHOW_IN_RELEASE:
    # NOTE(review): hard-coded feature count feeding the 'max_features' list below;
    # presumably the value computed in the previous cell - confirm.
    n_features = 48

    def learn_fit():
        """Submit one RandomForestClassifier entry to ``try_multi_models``.

        Increments the global ``version_model`` counter so the entry gets a
        distinct name, and returns the result of ``try_multi_models`` unchanged.
        The 'max_features' candidates are derived from ``n_features``.
        """
        global version_model
        version_model += 1
        models = [
            {
                'name': 'RandomForestClassifier ' + str(version_model),
                'model': RandomForestClassifier,
                'params': {
                    'n_estimators': [200],
                    'max_features': [
                        int(math.sqrt(n_features)) - 2,
                        int(math.sqrt(n_features)),
                        int(math.log2(n_features)),
                        int(n_features)
                    ]
                },
                'preprocessor': pre_RandomForest,
                'selected': True
            }
        ]

        return try_multi_models(models, False)

    learn_load()

    def remove_startswith_collumns(texts: list):
        """Build a column-name predicate suitable for ``filter``.

        The returned function gives False for 'HasExpensiveLevel' and for any
        name that starts with one of ``texts``; True for every other name.
        """
        def keep(column):
            if column == 'HasExpensiveLevel':
                return False
            return not any(column.startswith(prefix) for prefix in texts)
        return keep

    # Column-name prefixes excluded for every model in this cell.
    sp_collumns = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']

    def remove_collumns_for_model_1(x):
        """Column filter used by ``pre_RandomForest``: base exclusions plus three more prefixes."""
        return remove_startswith_collumns(sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_'])(x)

    def remove_collumns_for_model_2(x):
        """Column filter used by ``pre_LogisticRegression``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def remove_collumns_for_model_3(x):
        """Column filter used by ``pre_LinearSVC``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature frames to the columns of ``df`` kept by model filter 1.

        The target arguments are passed through untouched.
        Returns ``[x_sub_train, y_sub_train, x_sub_test, y_sub_test]``.
        """
        features = [name for name in df.columns if remove_collumns_for_model_1(name)]
        return [x_sub_train[features], y_sub_train, x_sub_test[features], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 2, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_2(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])

        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 3, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_3(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])

        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    ret = learn_fit()
    # Best score first; rows without a score are dropped.
    df_models = df_models.sort_values('Score', ascending=False).dropna(subset=['Score'])

    # Important: Claimed, Stars, Reviews, Photos
    # Render both results explicitly: a bare expression in the middle of a cell is
    # never shown, and `ret` only exists when this branch has run.
    display(df_models)
    display(ret[0])

In [None]:
if SHOW_IN_RELEASE:
    # Not read by this cell's parameter lists; kept as the notebook-level value.
    n_features = 48

    def learn_fit():
        """Submit one RandomForestClassifier entry to ``try_multi_models``.

        Increments the global ``version_model`` counter so the entry gets a
        distinct name, and returns the result of ``try_multi_models`` unchanged.
        The candidates vary 'max_features', 'max_depth' and 'min_samples_split'.
        """
        global version_model
        version_model += 1
        models = [
            {
                'name': 'RandomForestClassifier ' + str(version_model),
                'model': RandomForestClassifier,
                'params': {
                    'n_estimators': [200],
                    'max_features': [1, 2, 3, 6, 9, 10, 11],
                    'max_depth': [1, 2, 3, 6, 9, 10, 11],
                    # An integer min_samples_split must be >= 2 in scikit-learn;
                    # the former value 1 made every candidate using it fail.
                    'min_samples_split': [2, 3]
                },
                'preprocessor': pre_RandomForest,
                'selected': True
            }
        ]

        return try_multi_models(models, False)

    learn_load()

    def remove_startswith_collumns(texts: list):
        """Build a column-name predicate suitable for ``filter``.

        The returned function gives False for 'HasExpensiveLevel' and for any
        name that starts with one of ``texts``; True for every other name.
        """
        def keep(column):
            if column == 'HasExpensiveLevel':
                return False
            return not any(column.startswith(prefix) for prefix in texts)
        return keep

    # Column-name prefixes excluded for every model in this cell.
    sp_collumns = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']

    def remove_collumns_for_model_1(x):
        """Column filter used by ``pre_RandomForest``: base exclusions plus three more prefixes."""
        return remove_startswith_collumns(sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_'])(x)

    def remove_collumns_for_model_2(x):
        """Column filter used by ``pre_LogisticRegression``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def remove_collumns_for_model_3(x):
        """Column filter used by ``pre_LinearSVC``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature frames to the columns of ``df`` kept by model filter 1.

        The target arguments are passed through untouched.
        Returns ``[x_sub_train, y_sub_train, x_sub_test, y_sub_test]``.
        """
        features = [name for name in df.columns if remove_collumns_for_model_1(name)]
        return [x_sub_train[features], y_sub_train, x_sub_test[features], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 2, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_2(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])

        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 3, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_3(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])

        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    ret = learn_fit()
    # Best score first; rows without a score are dropped.
    df_models = df_models.sort_values('Score', ascending=False).dropna(subset=['Score'])

    # Important: Claimed, Stars, Reviews, Photos
    # Render both results explicitly: a bare expression in the middle of a cell is
    # never shown, and `ret` only exists when this branch has run.
    display(df_models)
    display(ret[0])

In [None]:
if SHOW_IN_RELEASE:
    # Not read by this cell's parameter lists; kept as the notebook-level value.
    n_features = 48

    def learn_fit():
        """Submit one RandomForestClassifier entry to ``try_multi_models``.

        Increments the global ``version_model`` counter so the entry gets a
        distinct name, and returns the result of ``try_multi_models`` unchanged.
        Only 'max_depth' varies here (10 through 19); the other values are fixed.
        """
        global version_model
        version_model += 1
        models = [
            {
                'name': 'RandomForestClassifier ' + str(version_model),
                'model': RandomForestClassifier,
                'params': {
                    'n_estimators': [200],
                    'max_features': [6],
                    'max_depth': range(10, 20),
                    'min_samples_split': [2]
                },
                'preprocessor': pre_RandomForest,
                'selected': True
            }
        ]

        return try_multi_models(models, False)

    learn_load()

    def remove_startswith_collumns(texts: list):
        """Build a column-name predicate suitable for ``filter``.

        The returned function gives False for 'HasExpensiveLevel' and for any
        name that starts with one of ``texts``; True for every other name.
        """
        def keep(column):
            if column == 'HasExpensiveLevel':
                return False
            return not any(column.startswith(prefix) for prefix in texts)
        return keep

    # Column-name prefixes excluded for every model in this cell.
    sp_collumns = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']

    def remove_collumns_for_model_1(x):
        """Column filter used by ``pre_RandomForest``: base exclusions plus three more prefixes."""
        return remove_startswith_collumns(sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_'])(x)

    def remove_collumns_for_model_2(x):
        """Column filter used by ``pre_LogisticRegression``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def remove_collumns_for_model_3(x):
        """Column filter used by ``pre_LinearSVC``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature frames to the columns of ``df`` kept by model filter 1.

        The target arguments are passed through untouched.
        Returns ``[x_sub_train, y_sub_train, x_sub_test, y_sub_test]``.
        """
        features = [name for name in df.columns if remove_collumns_for_model_1(name)]
        return [x_sub_train[features], y_sub_train, x_sub_test[features], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 2, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_2(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])

        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 3, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_3(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])

        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    ret = learn_fit()
    # Best score first; rows without a score are dropped.
    df_models = df_models.sort_values('Score', ascending=False).dropna(subset=['Score'])

    # Important: Claimed, Stars, Reviews, Photos
    # Render both results explicitly: a bare expression in the middle of a cell is
    # never shown, and `ret` only exists when this branch has run.
    display(df_models)
    display(ret[0])

In [None]:
if SHOW_IN_RELEASE:
    # Not read by this cell's parameter lists; kept as the notebook-level value.
    n_features = 48

    def learn_fit():
        """Submit a single fixed RandomForestClassifier configuration to ``try_multi_models``.

        Increments the global ``version_model`` counter so the entry gets a
        distinct name, and returns the result of ``try_multi_models`` unchanged.
        """
        global version_model
        version_model += 1

        forest_entry = {
            'name': 'RandomForestClassifier ' + str(version_model),
            'model': RandomForestClassifier,
            'params': {
                'n_estimators': [200],
                'max_features': [6]
            },
            'preprocessor': pre_RandomForest
        }
        return try_multi_models([forest_entry], False)

    learn_load()

    def remove_startswith_collumns(texts: list):
        """Build a column-name predicate suitable for ``filter``.

        The returned function gives False for 'HasExpensiveLevel' and for any
        name that starts with one of ``texts``; True for every other name.
        """
        def keep(column):
            if column == 'HasExpensiveLevel':
                return False
            return not any(column.startswith(prefix) for prefix in texts)
        return keep

    # Column-name prefixes excluded for every model in this cell.
    sp_collumns = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']

    def remove_collumns_for_model_1(x):
        """Column filter used by ``pre_RandomForest``: base exclusions plus three more prefixes."""
        excluded = sp_collumns + ['SubCategoriesCount', 'Cat_', 'ExtraAt_']
        return remove_startswith_collumns(excluded)(x)

    def remove_collumns_for_model_2(x):
        """Column filter used by ``pre_LogisticRegression``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def remove_collumns_for_model_3(x):
        """Column filter used by ``pre_LinearSVC``: base exclusions only."""
        return remove_startswith_collumns(list(sp_collumns))(x)

    def pre_RandomForest(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Restrict both feature frames to the columns of ``df`` kept by model filter 1.

        The target arguments are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_1(name)]
        return [x_sub_train[features], y_sub_train, x_sub_test[features], y_sub_test]

    def pre_LogisticRegression(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 2, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_2(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])
        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    def pre_LinearSVC(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
        """Select the columns kept by model filter 3, then standardize them.

        The StandardScaler is fitted on the training part only and then applied
        to the test part. Targets are passed through untouched.
        """
        features = [name for name in df.columns if remove_collumns_for_model_3(name)]
        scaler = StandardScaler()
        x_sub_train = scaler.fit_transform(X=x_sub_train[features])
        x_sub_test = scaler.transform(x_sub_test[features])
        return [x_sub_train, y_sub_train, x_sub_test, y_sub_test]

    learn_fit()
    # Best score first (sorted in place), then drop rows without a score.
    df_models.sort_values('Score', ascending=False, inplace=True)
    df_models = df_models.dropna(subset=['Score'])

# Important: Claimed, Stars, Reviews, Photos
df_models

In [None]:
# Re-display the df_models table (the previous cell sorts it by 'Score', descending).
df_models

In [None]:
# Hold out 3% of (X, Y) as the final test set; the fixed seed keeps the split repeatable.
x_train, x_finale_test, y_train, y_finale_test = train_test_split(X, Y, test_size=0.03, random_state=70)
# Split the remaining training data again into sub-train / sub-test parts
# (no size is passed, so train_test_split's default proportions apply).
x_sub_train, x_sub_test, y_sub_train, y_sub_test = train_test_split(x_train, y_train, random_state=42)

In [None]:
# Creating the model based on Random Forest
def remove_startswith_collumns(texts: list):
    """Build a column-name predicate suitable for ``filter``.

    The returned function gives False for 'HasExpensiveLevel' and for any
    name that starts with one of ``texts``; True for every other name.
    """
    def keep(column):
        if column == 'HasExpensiveLevel':
            return False
        # Reject as soon as any listed prefix matches.
        return not any(column.startswith(prefix) for prefix in texts)
    return keep
        
def remove_collumns_for_model(x):
    """Return True when column name ``x`` is kept as a feature for the model.

    Delegates to ``remove_startswith_collumns`` with the full list of
    excluded column-name prefixes.
    """
    excluded_prefixes = [
        'AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos',
        'SubCategoriesCount', 'Cat_', 'ExtraAt_',
    ]
    keep = remove_startswith_collumns(excluded_prefixes)
    return keep(x)
    
    
def preprocessor(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
    """Restrict both feature frames to the columns of ``df`` kept by ``remove_collumns_for_model``.

    The target arguments are passed through untouched.
    Returns ``[x_sub_train, y_sub_train, x_sub_test, y_sub_test]``.
    """
    features = [name for name in df.columns if remove_collumns_for_model(name)]
    return [x_sub_train[features], y_sub_train, x_sub_test[features], y_sub_test]

# Random forests are stochastic: fix the seed so the printed score is
# reproducible from one run of this cell to the next.
rfc = RandomForestClassifier(n_estimators=200, max_features=10, random_state=42)

# Keep only the model's feature columns in the sub-train / sub-test frames.
x_sub_train, y_sub_train, x_sub_test, y_sub_test = preprocessor(x_sub_train, y_sub_train, x_sub_test, y_sub_test)

print("Fit: ")
rfc.fit(x_sub_train, y_sub_train)

# Mean accuracy on the held-out sub-test part.
score = rfc.score(x_sub_test, y_sub_test)
print("Score: ", score)

# Scores noted by the author (presumably from earlier, unseeded runs, so the
# value printed above may differ slightly):
# Score (Full Data):  0.8559322033898306
# Score            :  0.8471549636803875
# max_features=10
# n_estimators=200

In [None]:
# Re-create the train / final-test split with the same arguments
# (test_size=0.03, random_state=70) as the earlier split cell.
x_train, x_finale_test, y_train, y_finale_test = train_test_split(X, Y, test_size=0.03, random_state=70)

In [None]:
# Final Test
def remove_startswith_collumns(texts: list):
    """Build a column-name predicate suitable for ``filter``.

    The returned function gives False for 'HasExpensiveLevel' and for any
    name that starts with one of ``texts``; True for every other name.
    """
    def keep(column):
        is_target = column == 'HasExpensiveLevel'
        if is_target:
            return False
        # str.startswith accepts a tuple of prefixes and matches any of them.
        return not column.startswith(tuple(texts))
    return keep
        
def remove_collumns_for_model(x):
    """Return True when column name ``x`` is kept as a feature for the final model.

    Delegates to ``remove_startswith_collumns`` with the full list of
    excluded column-name prefixes.
    """
    base_prefixes = ['AttributesCount', 'WeeklyBreaks', 'QuestionsCount', 'HasWebsite', 'Photos']
    extra_prefixes = ['SubCategoriesCount', 'Cat_', 'ExtraAt_']
    is_kept = remove_startswith_collumns(base_prefixes + extra_prefixes)
    return is_kept(x)
    
    
def preprocessor(x_sub_train, y_sub_train, x_sub_test, y_sub_test):
    """Restrict both feature frames to the columns of ``df`` kept by ``remove_collumns_for_model``.

    The target arguments are passed through untouched.
    Returns ``[x_sub_train, y_sub_train, x_sub_test, y_sub_test]``.
    """
    kept_columns = [name for name in df.columns if remove_collumns_for_model(name)]
    train_features = x_sub_train[kept_columns]
    test_features = x_sub_test[kept_columns]
    return [train_features, y_sub_train, test_features, y_sub_test]

# Random forests are stochastic: fix the seed so the final score is
# reproducible from one run of this cell to the next.
rfc = RandomForestClassifier(n_estimators=200, max_features=10, random_state=42)

# Keep only the model's feature columns in the train / final-test frames.
x_train, y_train, x_finale_test, y_finale_test = preprocessor(x_train, y_train, x_finale_test, y_finale_test)

print("Fit: ")
rfc.fit(x_train, y_train)

# Mean accuracy on the held-out final test set.
score = rfc.score(x_finale_test, y_finale_test)
print("Score: ", score)

# Score noted by the author (presumably from an earlier, unseeded run, so the
# value printed above may differ slightly):
# Score:  0.8361858190709046

In [None]:
# END