In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def load_and_present(file_name, head=True, na=True, info=True):
    """Read a CSV file into a DataFrame and optionally print a quick overview.

    Parameters
    ----------
    file_name : str or path-like
        Location of the .csv file; passed straight to ``pd.read_csv``.
    head : bool, default True
        Whether to print the first five rows.
    na : bool, default True
        Whether to print the number of missing values per column.
    info : bool, default True
        Whether to print the ``DataFrame.info()`` summary.

    Returns
    -------
    pandas.DataFrame
        The loaded data, exactly as returned by ``pd.read_csv``.
    """
    raw_data = pd.read_csv(file_name)
    if info:
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would add a stray "None" line to the output.
        raw_data.info()
    if head:
        print(raw_data.head())
    if na:
        print("MIssing values:\n",raw_data.isna().sum())
    return raw_data



def fit_model_for_variable(model,variable,data):
    """Fit ``model`` to predict ``variable`` from the remaining columns of ``data``.

    Only rows without any missing value (``data.dropna()``) are used for
    fitting. ``data`` itself is not modified.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)``.
    variable : str
        Name of the column used as the response.
    data : pandas.DataFrame
        Frame holding the response and the predictor columns.

    Returns
    -------
    object
        Whatever ``model.fit`` returns.
    """
    complete_rows = data.dropna()
    predictors = complete_rows.drop([variable], axis=1)
    return model.fit(predictors, complete_rows[variable])
    

    
    
# Function creating "dummy" (indicator) variables.
def getting_dummies(data,cat_variables,prefix=False):
    """Replace categorical columns with indicator (dummy) columns.

    Each column listed in ``cat_variables`` is encoded with
    ``pd.get_dummies(..., drop_first=True)``. The dummy columns come first
    (in the order of ``cat_variables``), followed by the untouched remaining
    columns of ``data``. ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame.
    cat_variables : list
        Names of the columns to encode.
    prefix : bool, default False
        When False the dummy columns are named after the category values
        only, so two encoded columns that share a category value produce
        duplicate column names. Pass True to name them
        ``"<column>_<value>"`` instead.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.
    """
    # Collect every piece first and concatenate once instead of growing a
    # DataFrame inside the loop.
    pieces = [
        pd.get_dummies(data[column], drop_first=True, prefix=column if prefix else None)
        for column in cat_variables
    ]
    pieces.append(data.drop(cat_variables, axis=1))
    return pd.concat(pieces, axis=1)



# Fill missing values of one column with predictions from an internal OLS regression.
def filling_values_regression(data, variable_to_fill):
    """Impute ``variable_to_fill`` with an OLS regression on the other columns.

    The regression is fitted on the rows of ``data`` that contain no missing
    value at all, with an intercept added. Its predictions are written into
    the rows where ``variable_to_fill`` is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the response and the predictor columns. It is
        modified in place.
    variable_to_fill : str
        Name of the column whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after filling.
    """
    missing = data[variable_to_fill].isnull()
    if not missing.any():
        # Nothing to impute; avoid fitting and predicting on an empty frame.
        return data

    # has_constant="add" forces the intercept column into both design
    # matrices. With the default ("skip") add_constant silently omits it
    # whenever some column is already constant, which is always true for a
    # single row to predict, so the two matrices could end up with different
    # columns.
    train = sm.add_constant(data.dropna(), has_constant="add")
    test = sm.add_constant(
        data.loc[missing].drop([variable_to_fill], axis=1), has_constant="add"
    )
    lm = OLS(train[variable_to_fill], train.drop([variable_to_fill], axis=1)).fit()

    # Assign through .loc on the frame itself: a chained
    # data[col].fillna(..., inplace=True) acts on an intermediate Series and
    # does not reliably write back into the frame.
    data.loc[missing, variable_to_fill] = lm.predict(test)
    return data

# Simple scoring function: share of observations the classifier labels correctly.
def score(model, data_matrix, labels):
    """Return the fraction of correct predictions at a 0.5 cut-off.

    ``model.predict(data_matrix)`` is thresholded: values strictly greater
    than 0.5 become class 1, everything else class 0. The result is compared
    position by position with ``labels``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(data_matrix)``.
    data_matrix : object
        Passed unchanged to ``model.predict``.
    labels : array-like
        True 0/1 labels (list, numpy array or pandas Series).

    Returns
    -------
    float
        Number of matching positions divided by ``len(labels)``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    ZeroDivisionError
        If ``labels`` is empty.
    """
    predicted = (np.asarray(model.predict(data_matrix)).reshape(-1) > 0.5).astype(int)
    # Convert to an array so the comparison is element-wise even when
    # ``labels`` is a plain Python list (list == list yields a single bool).
    actual = np.asarray(labels).reshape(-1)
    if predicted.shape != actual.shape:
        raise ValueError(
            "number of predictions (%d) does not match number of labels (%d)"
            % (predicted.shape[0], actual.shape[0])
        )
    hits = int((predicted == actual).sum())
    return hits / len(actual)

In [1]:
# Transformer that replaces categorical columns with dummy (indicator) columns.
class getting_dummies_01(BaseEstimator, TransformerMixin):
    """Encode the listed columns with ``pd.get_dummies(..., drop_first=True)``.

    Parameters
    ----------
    variables_to_dummies : list
        Names of the columns to encode. Must be a list even when only one
        column is encoded.

    Notes
    -----
    ``fit`` learns nothing: the dummy columns are derived from the values
    present in the frame passed to ``transform``.
    """
    def __init__(self,variables_to_dummies):
        # BaseEstimator.get_params()/clone()/repr() read every constructor
        # argument back as an attribute of the same name, so the argument is
        # stored under its own name.
        self.variables_to_dummies = variables_to_dummies

    @property
    def variables(self):
        # Backward-compatible alias for ``variables_to_dummies``.
        return self.variables_to_dummies

    @variables.setter
    def variables(self, value):
        self.variables_to_dummies = value

    def fit(self,X,y=None):
        """No-op; returns ``self`` so the transformer can be used in a pipeline."""
        return self

    def transform(self,X,y=None):
        """Return a new frame: prefixed dummy columns first, then the other columns of ``X``."""
        # Build every encoded piece, then concatenate once instead of growing
        # a DataFrame inside the loop.
        pieces = [
            pd.get_dummies(X[column], drop_first=True, prefix=column)
            for column in self.variables_to_dummies
        ]
        pieces.append(X.drop(self.variables_to_dummies, axis=1))
        return pd.concat(pieces, axis=1)

    
# Transformer that fills missing values of a numerical column with model predictions.
class filling_values_regression_method(BaseEstimator, TransformerMixin):
    """Impute one column by regressing it on the remaining columns.

    Parameters
    ----------
    variable_to_fill : str
        Column containing missing values; it is the response of the
        internal regression.
    model : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``; the original
        author suggests a linear regression.
    variable_to_drop : optional
        Stored on the instance but not used by ``fit`` or ``transform``.
    """
    def __init__(self,variable_to_fill,model,variable_to_drop=None):
        # BaseEstimator.get_params()/clone()/repr() read every constructor
        # argument back as an attribute of the same name, so each argument is
        # stored under its own name.
        self.variable_to_fill = variable_to_fill
        self.model = model
        self.variable_to_drop = variable_to_drop

    @property
    def variable(self):
        # Backward-compatible alias for ``variable_to_fill``.
        return self.variable_to_fill

    @variable.setter
    def variable(self, value):
        self.variable_to_fill = value

    def fit(self, X,y=None):
        """Fit ``model`` on the rows of ``X`` that have no missing value in any column."""
        train = X.dropna()
        self.model = self.model.fit(
            train.drop([self.variable_to_fill], axis=1), train[self.variable_to_fill]
        )
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the missing values of the target column predicted.

        The caller's frame is left untouched; use the returned frame.
        """
        filled = X.copy()
        missing = filled[self.variable_to_fill].isnull()
        if missing.any():
            # Predict all missing rows in one call, passing the same kind of
            # object (a DataFrame with the predictor columns) that fit() saw.
            predictors = filled.loc[missing].drop([self.variable_to_fill], axis=1)
            predictions = np.ravel(self.model.predict(predictors))
            filled.loc[missing, self.variable_to_fill] = predictions
        return filled

# Transformer that bins a numerical feature into integer-coded categories.
class numerical_to_categorical(BaseEstimator, TransformerMixin):
    """Replace a numerical column with the index of the interval it falls in.

    Parameters
    ----------
    interval : sequence of numbers
        Upper bounds of the bins. With ``interval = [10, 30, 100]`` a value
        ``<= 10`` becomes 0, a value in ``(10, 30]`` becomes 1 and a value in
        ``(30, 100]`` becomes 2, so ``len(interval) = p`` yields the levels
        ``{0, 1, ..., p-1}``.
    variable : str
        Name of the column to transform.

    Notes
    -----
    Every value is mapped to the position of the first bound that is not
    smaller than it. Missing values compare false against every bound and
    therefore end up in bin 0. The codes are stored with the dtype of the
    original column.
    """
    def __init__(self, interval, variable):
        self.interval = interval
        self.variable = variable

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the transformer can be used in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``variable`` column holds the bin codes.

        Raises
        ------
        IndexError
            If ``interval`` is empty or a value is greater than every bound.
        """
        # Work on a copy: the array returned by to_numpy() may share memory
        # with the caller's frame (or be read-only), so it is never written to.
        result = X.copy()
        values = result[self.variable].to_numpy()
        if len(values) == 0:
            return result

        bounds = np.asarray(self.interval)
        if bounds.size == 0:
            raise IndexError("interval must contain at least one bound")

        # within[i, j] is True when values[i] <= bounds[j]; the first True in
        # a row is the bin of that value.
        within = values.reshape(-1, 1) <= bounds.reshape(1, -1)
        beyond_last = ~within.any(axis=1) & ~pd.isnull(values)
        if beyond_last.any():
            raise IndexError(
                "value(s) in column %r exceed the largest bound of interval %r"
                % (self.variable, list(self.interval))
            )

        # argmax returns 0 for an all-False row, which is exactly the bin
        # assigned to missing values.
        codes = within.argmax(axis=1).astype(values.dtype)
        result[self.variable] = codes
        return result

NameError: name 'BaseEstimator' is not defined

In [4]:
#k-fold cross validation
from sklearn.model_selection import StratifiedKFold
from statsmodels.discrete.discrete_model import Logit
def My_own_kfold_cross_validation(model,metrics,Predictors,labels,random_state,n_splits=5,shuffle=False):
    """Run stratified k-fold cross-validation and return the mean fold score.

    Parameters
    ----------
    model : object
        Model to evaluate; ``model.fit(X_train, y_train)`` is called on every
        fold and its return value is handed to ``metrics``.
    metrics : callable
        Scoring function called as
        ``metrics(fitted_model, data_matrix=X_test, labels=y_test)``.
    Predictors : array-like or pandas.DataFrame
        Data matrix X; rows are selected by position.
    labels : array-like or pandas.Series
        Class labels used both for stratification and for scoring.
    random_state : int or None
        Seed for the splitter. Only used when ``shuffle`` is True.
    n_splits : int, default 5
        Number of folds.
    shuffle : bool, default False
        Whether to shuffle the data before splitting.

    Returns
    -------
    float
        Sum of the fold scores divided by ``n_splits``.
    """
    def _take_rows(container, positions):
        # Select rows by position. Plain ``container[positions]`` on a
        # DataFrame would look the integers up as column labels.
        if hasattr(container, "iloc"):
            return container.iloc[positions]
        if isinstance(container, (list, tuple)):
            return np.asarray(container)[positions]
        return container[positions]

    # StratifiedKFold rejects a seed when shuffle is False (the seed would
    # have no effect), so it is forwarded only when shuffling is requested.
    splitter = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    total_score = 0
    # Only the labels matter for stratification, hence the placeholder X.
    for train_index, test_index in splitter.split(X=np.zeros(len(labels)), y=labels):
        X_train, X_test = _take_rows(Predictors, train_index), _take_rows(Predictors, test_index)
        y_train, y_test = _take_rows(labels, train_index), _take_rows(labels, test_index)
        fold_score = metrics(model.fit(X_train,y_train),data_matrix=X_test,labels=y_test)
        total_score += fold_score
    return total_score/n_splits