# Task 1: Machine Learning Pipeline - Script

In [5]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from pycaret.classification import *
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from statistics import mean
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.tree import DecisionTreeRegressor

In [24]:
def read_data(filename):
    """Load the dataset from CSV, impute missing values, and split features/target.

    Missing ``previous_year_rating`` values are replaced with that column's
    median, and missing ``education`` values with the literal label ``'others'``.

    Args:
        filename: Path (or any other source accepted by ``pd.read_csv``) of the
            CSV file. It must contain the columns ``previous_year_rating`` and
            ``education``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a DataFrame holding every column
        except the first and the last, and ``y`` is a single-column DataFrame
        holding the last column.
    """
    df = pd.read_csv(filename)

    # Assign the filled Series back to the column instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form mutates an
    # intermediate object, which warns on recent pandas releases and does not
    # update ``df`` at all once Copy-on-Write is enabled.
    rating_median = df['previous_year_rating'].median()
    df['previous_year_rating'] = df['previous_year_rating'].fillna(rating_median)
    df['education'] = df['education'].fillna('others')

    # The first column is skipped (presumably a row identifier -- confirm
    # against the dataset); the last column is the target. The list index
    # ``[-1]`` keeps ``y`` two-dimensional (a one-column DataFrame).
    x, y = df.iloc[:, 1:-1], df.iloc[:, [-1]]

    return x, y

def pipeline_ml(cat, num, model_cat):
    """Build a preprocessing-plus-model pipeline.

    Args:
        cat: Column selector for the features to one-hot encode.
        num: Column selector for the features to min-max scale.
        model_cat: Estimator placed as the final pipeline step.

    Returns:
        A ``Pipeline`` with two steps: ``'prep'`` (the ``ColumnTransformer``)
        and ``'m'`` (``model_cat``). ``remainder`` is left at the
        ``ColumnTransformer`` default, so columns named in neither selector are
        not forwarded to the model.
    """
    trans = [
        # handle_unknown='ignore': a category that shows up only in a
        # validation fold is encoded as all zeros instead of making
        # transform() raise, which would otherwise fail that fold.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat),
        ('num', MinMaxScaler(), num),
    ]
    col_transform = ColumnTransformer(transformers=trans)

    pipeline_cat = Pipeline(steps=[('prep', col_transform), ('m', model_cat)])

    return pipeline_cat

def model_building_and_cross_val(x,y, cat, num):
    """Cross-validate a CatBoost pipeline and print mean accuracy and F1.

    Builds the preprocessing/model pipeline via ``pipeline_ml``, evaluates it
    with shuffled 10-fold cross-validation, and prints one line per metric
    with the mean score over the folds.

    Args:
        x: Feature table.
        y: Target values aligned with ``x``.
        cat: Column selector for the categorical features.
        num: Column selector for the numerical features.

    Returns:
        None. The results are printed.
    """
    # Imported locally because cross_validate is not among the notebook's
    # top-level imports.
    from sklearn.model_selection import cross_validate

    model_cat = CatBoostClassifier(verbose=0, n_estimators=100)
    pipeline_cat = pipeline_ml(cat, num, model_cat)
    print('With 10 fold cross-validation:')
    evaluation_metrics = ['accuracy', 'f1']
    # NOTE(review): plain KFold does not keep the class ratio constant across
    # folds; if the target is imbalanced, StratifiedKFold may be preferable --
    # confirm before changing, since it would alter the reported numbers.
    cv = KFold(n_splits=10, shuffle=True, random_state=1)

    # Score every metric from a single cross-validation pass so each fold's
    # model is fitted once, instead of refitting all folds per metric.
    scores = cross_validate(pipeline_cat, x, y, scoring=evaluation_metrics, cv=cv, n_jobs=-1)
    for metric in evaluation_metrics:
        print(str(metric) + ': ' + str(mean(scores['test_' + metric])))

def compute_acc_and_f1(filename):
    """Load the dataset from ``filename`` and report cross-validated accuracy and F1.

    Feature columns are grouped by dtype: ``object``/``bool`` columns are
    handed over as categorical features and ``int64``/``float64`` columns as
    numerical features. Columns of any other dtype end up in neither group.

    Args:
        filename: Path of the CSV file, forwarded to ``read_data``.
    """
    features, target = read_data(filename)

    # Column labels per dtype family; the two selections are independent.
    cat_cols = features.select_dtypes(include=['object', 'bool']).columns
    num_cols = features.select_dtypes(include=['int64', 'float64']).columns

    model_building_and_cross_val(features, target, cat=cat_cols, num=num_cols)


In [25]:
# Evaluate the pipeline on the promotion dataset and print the cross-validated
# accuracy and F1; the CSV path is relative to the working directory.
compute_acc_and_f1('promotion_dataset.csv')

With 10 fold cross-validation:
accuracy: 0.9422528922075863
f1: 0.5125137608376984
