In [2]:
import pandas as pd
import numpy as np
def get_data(file="data/Train.csv", fraction=1, drop=[], seed=42):
    df = pd.read_csv(file, parse_dates=["Date"],date_format="%Y-%m-%d" )
    df["Date"] = df["Date"].values.astype(np.int64) // 10**9
    df = df.sample(frac=fraction, random_state=seed)
    df = df.drop(drop, axis=1)
    return df
    

In [3]:
def identify_outliers_sigma(df, column, n_sigma=3):
    """
    Identify outliers using n-sigma rule
    Values beyond mean ± n*std are considered outliers
    """
    mean = df[column].mean()
    std = df[column].std()
    
    lower_bound = mean - (n_sigma * std)
    upper_bound = mean + (n_sigma * std)
    
    # Create boolean mask for outliers
    #outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
    
    return lower_bound, upper_bound


In [4]:
# Split the data
from sklearn.model_selection import train_test_split
def split(dataframe, target, stratify,seed=42):
    target = "target"
    

    y = dataframe[target]
    X = dataframe.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed,stratify=stratify )

    #df_train = pd.concat((X_train,y_train), axis=1)
    return  X_train, X_test, y_train, y_test
    #df_train


In [5]:
def column_threshold(dataframe,threshold,drop, excludes):
    # features to be encoded


    # derive list of columns with more than X % of not null values
    columns = {"column":[], "not_null": [], "exclude":[]}
    column_list = dataframe.columns.to_list()
    for column in column_list:
        excl=False
        for exclude in excludes:
            if exclude in column:
                excl = True                
        
        columns["column"].append(column)
        columns["not_null"].append(dataframe[column].count()/dataframe['target'].count()*100)
        columns["exclude"].append(excl)

    cols=pd.DataFrame(columns)
    features_to_use=list(set(cols[(cols["not_null"]>threshold)&(cols["exclude"]==False)]["column"])-set(["target"])-set(drop))
    return features_to_use

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, OneHotEncoder, PolynomialFeatures, FunctionTransformer, QuantileTransformer
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import r2_score, root_mean_squared_error
import time
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats


def run_models(model_name, model_class, scaler, features_categorical,features_numeric,X_train, y_train, X_test, y_test ):

    if model_name == "Linear Regression 1":
        param_dist = {'copy_X': [True,False], 
                   'fit_intercept': [True,False], 
                   'n_jobs': [1,5,10,15,None], 
                   'positive': [True,False]}
    elif model_name == "XGBoost":
        param_dist = {
            'model__max_depth': stats.randint(3, 10),
            'model__learning_rate': stats.uniform(0.01, 0.1),
            'model__subsample': stats.uniform(0.5, 0.5),
            'model__n_estimators':stats.randint(50, 200)
        }
    
    #print(features_num_key, features_numeric)
    start_time = time.time()
    encode_location = Pipeline([ 
        ("encode", OneHotEncoder(drop="first"))]) 
    
    scale_impute = Pipeline([   
        ("imputer_null",KNNImputer(n_neighbors=10, weights="distance")), # deals with null  
        ("imputer_0",KNNImputer(n_neighbors=10, weights="distance", missing_values=0)), # deals with 0  
        ("scaler", scaler )
        ]) 
    preprocess = ColumnTransformer([    
        ("scale_impute", scale_impute, features_numeric),
        ("encode_location", encode_location, features_categorical)],remainder="drop")
    
    regression = Pipeline([ 
        ('preprocess', preprocess), 
        ('model', model_class)])
    
    search_reg = RandomizedSearchCV(regression, param_distributions=param_dist, cv=5, scoring='r2', verbose=1, n_jobs=-1, n_iter=10)

    search_reg.fit(X_train, y_train)
    best_model = search_reg.best_estimator_
    pred = best_model.predict(X_test)
    #pred = regression.predict(X_test) 
    '''
    coefficients={"columns":[],"numbers":[]}
    if model_name=="XGBoost":
        coefficients["numbers"] = regression.named_steps["model"][0].feature_importances_
        coefficients["columns"] = regression[:-1].get_feature_names_out()
        
    elif model_name=="Polynomial Linear Regression":
        coefficients["numbers"] = regression.named_steps["model"][1].coef_
        coefficients["columns"] = regression.named_steps["model"][0].get_feature_names_out()
  
    
    coefficients = pd.DataFrame(coefficients)
    coefficients["numbers"]=coefficients["numbers"].abs()
    coefficients=coefficients.sort_values(by="numbers",ascending=False)
    coefficients["columns"]=coefficients["columns"].replace(to_replace="scale_impute__",value="", regex=True).replace(to_replace="encode_location__",value="", regex=True)
    coefficients=", ".join(coefficients["columns"].head(3).to_list())
    '''

    r2 = r2_score(y_test, pred)*100
    rmse =root_mean_squared_error(y_test, pred)
    end_time = time.time()
    duration = round(end_time-start_time,0)
    return r2, rmse, duration#, coefficients


In [None]:
import warnings
warnings.filterwarnings('ignore')  # Suppress all warnings
#from sklearn.metrics.pairwise import rbf_kernel

fraction=1
column_threshold_not_null = [20]
column_exclusion ={"sat geometry":["angle", "altitude"]}
scaler = {"power":make_pipeline(PowerTransformer())}
target_outlier_sigma = 3

target = "target"
drop=["Place_ID X Date","target_min","target_max","target_variance","target_count"]

file = "data/Train.csv"
seed=2
features_categorical = ["Place_ID"]

df = get_data(file=file,fraction=fraction, drop=drop, seed=seed)
df[target]=np.log10(df[target])

X_train, X_test, y_train, y_test = split(dataframe=df, target=target,stratify=df[features_categorical], seed=seed)
df_train = pd.concat((X_train,y_train), axis=1)
target_min,target_max=identify_outliers_sigma(df_train, target, target_outlier_sigma)
df_train = df_train[(df_train[target]>=target_min)&(df_train[target]<=target_max)]

X_train = df_train.drop(target,axis=1)
y_train = df_train[target]


objective = 'reg:squarederror'
models ={"XGBoost": xgb.XGBRegressor()  
        ,"Linear Regression 1": LinearRegression()
        #,"Polynomial Linear Regression 2": make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
        #,'Random Forest': make_pipeline(RandomForestRegressor(n_estimators=100,min_impurity_decrease=0.1, random_state=42))
        #,'SVR': make_pipeline(SVR())
        #,"Gradient Boost Regressor" : GradientBoostingRegressor()
        #,"LGBoost": make_pipeline(lgb.LGBMRegressor())
        }

results =  {"model":[],"scaler":[], "R2":[], "RMSE":[],"duration":[],"not null gt":[],"exclude":[],"columns":[]}#, "coefficients":[]}

for model_name, model_class in models.items():
        for scaler_key,scaler_value in scaler.items():
                for threshold in column_threshold_not_null:
                        for ex_key, ex_val in column_exclusion.items():
                                features_other = column_threshold(dataframe=df,threshold=threshold,drop=features_categorical, excludes=ex_val)
                                r2, rmse, duration = run_models(model_name, model_class,scaler_value,  features_categorical,features_other,X_train, y_train, X_test, y_test )
                                results["model"].append(model_name)
                                results["scaler"].append(scaler_key)
                                results["R2"].append(r2)
                                results["RMSE"].append(rmse)
                                results["duration"].append(duration)
                                results["not null gt"].append(threshold)
                                results["exclude"].append(ex_key)
                                results["columns"].append(len(features_other)
                                                          )

#                results["coefficients"].append(coefficients)
                
results=pd.DataFrame(results)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [28]:
pd.options.display.max_colwidth = 500
pd.set_option('display.max_rows', None)
#results.to_csv("results/results_with_y_outlier.csv")
results.to_csv("results/final_results_wo_y_outlier.csv")
results.sort_values(by=["R2","RMSE"], ascending=False)

Unnamed: 0,model,scaler,R2,RMSE,duration,not null gt,exclude,columns
0,XGBoost,power,69.853223,0.187254,377.0,20,sat geometry,37
