In [None]:
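# Libraries to install (uncomment and run once per environment).
# Note: `pip install` has no `-y` option, and `%pip` targets the kernel's environment.
# %pip install imbalanced-learn
# %pip install category_encoders
# %pip install seaborn
# %pip install eli5
# %pip install -U matplotlib
# %pip install -U scikit-learn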

Import the libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GroupKFold, GroupShuffleSplit, KFold,StratifiedKFold
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*" and
# newer releases no longer ship the old names, so prefer the new name and fall
# back to the legacy one on older installations.
_white_style = "seaborn-v0_8-white" if "seaborn-v0_8-white" in plt.style.available else "seaborn-white"
plt.style.use(_white_style)



In [None]:
# Read the data
# TODO: replace the "Path" placeholder with the location of the training CSV.
df = pd.read_csv("Path")

In [None]:
# Preview the first rows to sanity-check that the file was parsed as expected
df.head()

#### Data cleaning & preliminary data analysis

In [None]:
# Report the dataset dimensions and the number of fully duplicated rows
n_rows, n_cols = df.shape
n_duplicates = df.duplicated().sum()

print(f"Data has rows {n_rows} and columns {n_cols}")
print(f"Data duplicate values {n_duplicates}")

In [None]:
def null_counter(df):
    """Summarise the missing values of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        "Columns" (column name), "Count" (number of nulls) and
        "Percent missing" (nulls as a percentage of the row count).
    """
    missing = df.isnull().sum().values
    summary = pd.DataFrame(
        {
            "Columns": df.columns,
            "Count": missing,
            "Percent missing": missing * 100 / len(df),
        }
    )
    # Only columns that actually contain nulls are of interest.
    return summary[summary["Count"] != 0]

# Show which columns of the raw data contain missing values, and how many
null_counter(df)

Preliminary observation about the data
+ One: compare each column name with its description
+ Two

In [None]:
# Column lists are placeholders -- TODO: fill in the real names.
unnecessary_cols = ['B', 'C']
null_heavy_cols = ['B', 'C']

# errors="ignore" keeps the cell re-runnable: a column that is already gone
# (dropped by the line above or by a previous execution of this cell) no longer
# raises a KeyError.
# Drop the unnecessary columns
df = df.drop(columns=unnecessary_cols, errors="ignore")

# Drop the columns with Nulls
df = df.drop(columns=null_heavy_cols, errors="ignore")

# Drop the rows with null subset
df = df.dropna(subset=["A"], how="any")

In [None]:
# Re-check the remaining missing values after the clean-up above
print(null_counter(df))

In [None]:
print("\n Information about the data types")

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
df.info()

In [None]:
# Convert the data into right data type
# Column names are placeholders -- TODO: map each real column to its target dtype.
# One mapping replaces the previous chain of casts on the same column, which
# could not all succeed when executed in sequence.
dtype_map = {
    "A": "category",  # or "object"
    "B": "int64",     # or "float64"
    "C": "bool",
}
df = df.astype(dtype_map)

# Unit-less "datetime64" is rejected by astype in pandas 2; parse dates
# explicitly instead (use pd.to_timedelta for timedelta[ns] columns).
df["D"] = pd.to_datetime(df["D"])

##### Data analysis
Define target and features

In [None]:
target_col = "" # TODO

def feature_type_exc(df, target=None):
    """Split the feature columns of ``df`` into categorical and numerical ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features (and possibly the target column).
    target : str, optional
        Name of the target column to exclude. Defaults to the module-level
        ``target_col``.

    Returns
    -------
    tuple[list, list, list]
        ``(cat_cols, num_cols, feature_cols)`` where ``feature_cols`` is
        ``num_cols + cat_cols``.
    """
    if target is None:
        target = target_col

    cat_cols, num_cols = [],[]

    for col in df.columns:
        if col == target:
            continue
        dtype = df[col].dtype
        # Columns cast to "category" or stored with a dedicated string dtype are
        # categorical too; checking only for "object" sent them to num_cols.
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_categorical:
            cat_cols.append(col)
        else:
            num_cols.append(col)
    feature_cols = num_cols + cat_cols
    return cat_cols, num_cols, feature_cols

# Derive the feature groups from the current frame and print them for review
cat_cols, num_cols, feature_cols =  feature_type_exc(df)
print("Target/dependent feature : ", target_col,"\n")
print("Numerical features : ",num_cols)
print("Categorical features : ",cat_cols)
print("\nInput/independent features : ",feature_cols)

In [None]:
# Target data
target_counts = df[target_col].value_counts()
print("Target binary distribution:\n",target_counts)

# .get() avoids a KeyError when one of the two labels is absent.
n_class_0 = target_counts.get(0, 0)
n_class_1 = target_counts.get(1, 0)

# Class shares as whole percentages. int(count / total) truncated every share
# below 100% to 0, so the printed ratio was always 0:0.
class_0 = int(round(100 * n_class_0 / len(df.index)))
class_1 = int(round(100 * n_class_1 / len(df.index)))

# Weight of class 1 relative to class 0, rounded and never below 1 so it stays a
# usable class weight even when class 1 is the larger class.
class_weight = max(1, int(round(n_class_0 / n_class_1))) if n_class_1 else 1

print(f"\nClass weight is 1:{class_weight} and ratio between them is {class_0}:{class_1}.")

Observations:
+ One 
+ Two

#### Numerical features analysis

Outlier Analysis

In [4]:
# Plot histograms to show distribution of features by outcome categories
def plot_histogram_num(x,y):
    """Plot a numerical feature split by binary outcome.

    The left panel overlays one histogram per outcome; the right panel shows
    one boxplot per outcome.

    Parameters
    ----------
    x : pandas.Series
        Feature values; ``x.name`` is used in the titles.
    y : pandas.Series
        Outcome labels (0/1) aligned with ``x``.
    """
    outcome_groups = [(0, 'Outcome=0'), (1, 'Outcome=1')]

    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12, 10))
    for outcome, label in outcome_groups:
        ax1.hist(list(x[y==outcome]), alpha=0.5, label=label)
    ax1.set_title("Histogram of '{var_name}' by Outcome Category".format(var_name=x.name))
    ax1.set_xlabel("Value")
    ax1.set_ylabel("Frequency")
    ax1.legend(loc='upper right')

    # One box per outcome so the panel matches its "by Outcome Category" title
    # (previously a single box over all rows). NaNs are removed because they
    # would otherwise leave the box empty.
    ax2.boxplot([x[y==outcome].dropna().to_numpy() for outcome, _ in outcome_groups])
    ax2.set_xticklabels([label for _, label in outcome_groups])
    ax2.set_title("Boxplot of '{var_name}' by Outcome Category".format(var_name=x.name))

    plt.show()

In [None]:
# Missing values are mean-imputed for this plot only; fillna returns a new Series, so df is left unchanged
plot_histogram_num(df['A'].fillna(value=df['A'].mean()), df[target_col])

In [None]:
def _split_by_limits(df, column_name, lower_limit, upper_limit):
    """Split ``df`` into (kept, dropped) rows using the given limits.

    A row is dropped only when its value lies strictly outside
    ``[lower_limit, upper_limit]``. The kept frame is the exact complement, so
    rows sitting exactly on a limit and rows with a missing value are kept and
    the printed count equals the number of rows actually removed.
    """
    values = df[column_name]
    is_outlier = (values > upper_limit) | (values < lower_limit)
    df_dropped = df[is_outlier]

    print("Total number of data point will be dropped :",df_dropped.shape[0])

    df_filter_outlier = df[~is_outlier]
    return df_filter_outlier, df_dropped


def outlier_by_std(df,column_name):
    """Flag values more than 3 standard deviations away from the mean.

    Returns ``(df_filter_outlier, df_dropped)``.
    """
    center = df[column_name].mean()
    spread = 3 * df[column_name].std()
    return _split_by_limits(df, column_name, center - spread, center + spread)


def outlier_by_iqr(df,column_name):
    """Flag values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Returns ``(df_filter_outlier, df_dropped)``.
    """
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    return _split_by_limits(df, column_name, Q1 - 1.5*IQR, Q3 + 1.5*IQR)


def outlier_by_percentile(df,column_name,up=0.99,lw=0.01):
    """Flag values outside the ``lw`` and ``up`` quantiles of the column.

    Returns ``(df_filter_outlier, df_dropped)``.
    """
    lower_limit = df[column_name].quantile(lw)
    upper_limit = df[column_name].quantile(up)
    return _split_by_limits(df, column_name, lower_limit, upper_limit)
# NOTE(review): the original note here was truncated; it presumably suggested imputing
# missing values first, e.g. with fillna(<column mean>) -- confirm intent.
# The three calls below are alternatives: each one overwrites new_df/drop_df,
# so only the result of the last call (percentile method) is kept.
new_df, drop_df = outlier_by_std(df,"A")
new_df, drop_df = outlier_by_iqr(df,"A")
new_df, drop_df = outlier_by_percentile(df,"A")
print("\nValues count of dropped df :")
print(drop_df[target_col].value_counts())

Observations:
+ One 
+ Two

Correlation analysis

In [None]:
# numeric_only=True: since pandas 2.0, corr() raises on non-numeric columns
# instead of silently skipping them.
df.corr(method="pearson", numeric_only=True)

In [None]:
fig, ax = plt.subplots(figsize=(8,6))

# Use df here: `x` is only created further down the notebook, so referencing it
# at this point fails on a fresh Restart & Run All.
sns.heatmap(df.corr(numeric_only=True), annot=True,fmt=".1f",ax=ax)

In [None]:
# Placeholder column groups -- TODO: fill in the real names.
num_corr_features = ['A', 'B']
num_norm_features = ['A', 'B']
highly_correlated_cols = ['B', 'C']

# To reduce bias in model training drop highly correlated features and only keep one
# errors="ignore" keeps the cell re-runnable when a column was already removed earlier.
df = df.drop(columns=highly_correlated_cols, errors="ignore")

Observations:
+ One 
+ Two

#### Categorical features analysis

In [None]:
# Features unique values in Cat
for col in cat_cols:
    uni_cat_count = df[col].nunique()
    print ("Feature {col_name} has {unique_cat} unique categories". format (col_name=col, unique_cat=uni_cat_count))
    if uni_cat_count< 20:
        print(df[col].unique())
print("\nTotal number of rows : ",df.shape[0])

In [None]:
# Set the encoding categories
oht_encoder = ['A', 'B']
o_encoder = ['A', 'B']

Observation about all variables:
+ One
+ Two

In [None]:
# Final data frame 
cat_cols, num_cols, feature_cols =  feature_type_exc(df)

print("Target/dependent feature : ", target_col)
print("\nNumerical features : ",num_cols)
print("\nCategorical features : ",cat_cols)
print("\nInput/independent features : ",feature_cols)

print("\nOne Hot encoding features",oht_encoder)
print("\nOrdinal encoding features",o_encoder)
print("\n Normalize transformation features",num_norm_features)

In [None]:
# y is a single-column DataFrame (filter keeps the frame type); x holds every other column
y = df.filter(items=[target_col])
x = df.drop(target_col, axis=1)

##### MISC 

In [None]:
# Outlier by Analysis by IsolationForest
from sklearn.ensemble import IsolationForest
# random_state makes the (otherwise random) outlier mask reproducible across runs.
iso = IsolationForest(contamination=0.1, random_state=42)

# The forest only handles numeric input, so fit it on the numeric columns and
# median-impute gaps for the detection step only; x itself is not altered.
x_numeric = x.select_dtypes(include="number")
x_numeric = x_numeric.fillna(x_numeric.median())

yhat = iso.fit_predict(x_numeric)

# select all rows that are not outliers
mask = yhat != -1

x, y = x[mask], y[mask]

In [None]:
# Separate majority and minority classes
from sklearn.utils import resample

# Index by target_col instead of a hard-coded `.target` attribute.
df_0 = df[df[target_col]==0]
df_1 = df[df[target_col]==1]

# df_og is the majority class that is kept as-is; the other class is upsampled
# to the same size. (Previously the upsampled minority was concatenated with the
# original minority, which dropped the majority class from the result.)
if df_0.shape[0] > df_1.shape[0]:
    df_og, df_minority = df_0, df_1
else:
    df_og, df_minority = df_1, df_0

df_upsampled = resample(df_minority,
                        replace=True,     # sample with replacement
                        n_samples=df_og.shape[0],    # to match majority class
                        random_state=123)

# NOTE(review): x and y are not refreshed here, and upsampling before the
# train/test split puts copies of the same rows on both sides of the split.
df = pd.concat([df_upsampled,df_og])

Data modeling and experimentation

In [None]:
# Hold out 20% of the rows for testing; stratify=y keeps the class proportions in both splits
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=80, stratify=y)

In [None]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, auc, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
def metric_calculator(y_test,y_pred,y_prob=None,y_prob_flag=True):
    """Print the standard binary-classification metrics and plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like, optional
        Predicted probabilities, either the positive-class scores (1-D) or the
        full ``predict_proba`` output (2-D, positive class in column 1).
    y_prob_flag : bool, default True
        Set to False to skip the ROC AUC line even when ``y_prob`` is given.
    """
    print("\nDifferent metric reports\n")
    print(f"\nAccuracy classification score: {accuracy_score(y_test, y_pred)}")
    print(f"\nBalanced accuracy classification score: {balanced_accuracy_score(y_test, y_pred)}")
    # zero_division=0 reports 0 instead of warning when a class is never
    # predicted (e.g. the all-zeros baseline).
    print(f"\nPrecision score: {precision_score(y_test, y_pred, zero_division=0)}")
    # Recall previously printed precision_score by mistake.
    print(f"\nRecall score: {recall_score(y_test, y_pred, zero_division=0)}")
    print(f"\nF1 score: {f1_score(y_test, y_pred, zero_division=0)}")

    # The former "AUC score" line also printed precision; AUC is now reported
    # once, from probabilities. `is not None` is required because the truth
    # value of an array is ambiguous.
    if y_prob is not None and y_prob_flag:
        scores = np.asarray(y_prob)
        if scores.ndim == 2:
            scores = scores[:, 1]
        print(f"\nROC AUC score: {roc_auc_score(y_test, scores)}")

    print(f"\nClassification report: \n {classification_report(y_test, y_pred, zero_division=0)}")

    # Draw on a fresh figure so repeated calls (e.g. one per fold) do not stack
    # their heatmaps on the same axes.
    cf_matrix = confusion_matrix(y_test,y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,
            fmt='.2%', cmap='Blues', ax=ax)
    plt.show()

In [None]:
def train_model(pipe_obj,x_train, x_test, y_train, y_test):
    """Fit ``pipe_obj`` on the training split and predict on the test split.

    ``pipe_obj`` is fitted in place and returned, so the caller's object and the
    returned model are the same instance. ``y_test`` is accepted but not used.

    Returns
    -------
    tuple
        ``(fitted model, predicted labels, predicted probabilities)`` for ``x_test``.
    """
    pipe_obj.fit(x_train, y_train)
    print("Model training is Done !")

    predictions = pipe_obj.predict(x_test)
    probabilities = pipe_obj.predict_proba(x_test)
    return pipe_obj, predictions, probabilities

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Column groups consumed by the transformers below (placeholders -- TODO)
oht_encoder = ['A', 'B']
o_encoder = ['A', 'B']
target_encoder = ['A', 'B']
targe_encoder = target_encoder  # misspelled legacy name kept in case another cell still uses it

num_features = ['A', 'B'] #num_cols
num_norm_features = ['A', 'B']
num_corr_features = ['A', 'B']

# Pipeline takes its stages through the `steps` keyword (not `step`).
numerical_transformer = Pipeline(
    steps = [
        ("imputer",SimpleImputer(strategy="mean")),
        ("scaler",StandardScaler()),
    ]
)
categorical_transformer_oht = Pipeline(
    steps = [
        ("oht_encoder",OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_transformer_ordinal = Pipeline(
    steps = [
        ("ordinal_encoder",OrdinalEncoder(handle_unknown = "use_encoded_value",unknown_value=-1))
    ]
)

# Formation Normal
preprocess = ColumnTransformer(
    transformers=[
        ("numerical_trans",numerical_transformer,num_cols),
        ("cat_trans_oht",categorical_transformer_oht,oht_encoder),
        ("cat_trans_ord",categorical_transformer_ordinal,o_encoder)
])

# Formation two ref:https://contrib.scikit-learn.org/category_encoders/targetencoder.html
from category_encoders.target_encoder import TargetEncoder
# 'value' (encode unseen categories with the target mean) replaces 'ignore',
# which is not one of the options documented for TargetEncoder.
column_trans_target = make_column_transformer(
    (TargetEncoder(handle_unknown='value'),target_encoder ), #2
    remainder='passthrough')

# Formation three special
# ref:https://colab.research.google.com/drive/1-H8ZfuemZAW_imWCVJPj_syfrZOnPAyT?usp=sharing
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTENC
# SMOTENC is a sampler (fit_resample), not a transformer, so it cannot be an
# entry of a column transformer. Keep it as a separate object and apply it to
# x[feature_cols], e.g. as a step of an imblearn.pipeline.Pipeline. The
# categorical positions are therefore computed relative to feature_cols.
cat_features_idx = [feature_cols.index(col) for col in cat_cols]
smote_sampler = SMOTENC(categorical_features=cat_features_idx, random_state=42)
column_trans = make_column_transformer(
    (PCA(n_components=2),num_corr_features ), # 1
    remainder='passthrough')

In [None]:
# XGBClassifier(scale_pos_weight=class_weight,seed=42)
# class_weight up-weights class 1 by the class ratio computed in the target
# analysis cell; random_state makes the forest reproducible across runs.
model = RandomForestClassifier(n_estimators=300,class_weight= {0:1,1:class_weight},random_state=42)

In [None]:
# Full modelling pipeline: preprocessing followed by the classifier.
# Pipeline takes its stages through the `steps` keyword (not `step`).
pipe = Pipeline(steps=[
    ("preprocess",preprocess),
    ("model",model)
    ]
)
pipe

In [None]:
# Comparing to Baseline model: score a prediction that is 0 for every test row
metric_calculator(y_test,np.zeros_like(y_test))

In [None]:
# Ref: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# cross_val_score expects (estimator, X, y): the third argument must be the
# training labels, not x_test. The result is printed because only the last
# expression of a cell is displayed.
cv_accuracy = cross_val_score(pipe, x_train, np.ravel(y_train), cv=3, scoring='accuracy').mean()
print(f"Mean 3-fold CV accuracy: {cv_accuracy}")

# Custom train
model, y_pred, y_prob = train_model(pipe,x_train, x_test, y_train.values.flatten(), y_test.values.flatten())
# The probabilities are passed positionally; no extra flag is needed.
metric_calculator(y_test,y_pred,y_prob)

#### Feature importance

In [None]:
# Rebuild the output feature names in the order the ColumnTransformer emits
# them: numerical, one-hot encoded, ordinal encoded.
# Fitted sub-transformers live in `named_transformers_`, and pipeline stages in
# `named_steps`; the classifier stage of `pipe` is registered as "model".
fitted_preprocess = model.named_steps['preprocess']

feature_imp_names = list(fitted_preprocess.named_transformers_['numerical_trans'].named_steps['imputer'].feature_names_in_)

cat_oht_cols = list(fitted_preprocess.named_transformers_['cat_trans_oht'].named_steps['oht_encoder'].get_feature_names_out(input_features=oht_encoder))

cat_ord_cols = list(fitted_preprocess.named_transformers_['cat_trans_ord'].named_steps['ordinal_encoder'].get_feature_names_out(input_features=o_encoder))

feature_imp_names = feature_imp_names + cat_oht_cols + cat_ord_cols
coefs = pipe.named_steps['model'].feature_importances_

In [None]:
# Sanity check: the reconstructed names must line up one-to-one with the
# classifier's inputs. The classifier stage of `pipe` is registered as "model".
if len(feature_imp_names) == pipe.named_steps['model'].n_features_in_:
    print("Input and output feature match in pipeline")
else:
    print("COUNT DOES NOT MATCH: with input and output feature in pipeline")

In [None]:
# Zip coefficients and names together and make a DataFrame
# (the name list is `feature_imp_names`; the previous spelling raised a NameError)
# NOTE(review): this rebinds `df`, so the dataset frame is no longer reachable
# under that name after this cell.
zipped = zip(feature_imp_names, coefs)

df = pd.DataFrame(zipped, columns=["feature", "value"])

# Sort the features by the absolute value of their coefficient
df["abs_value"] = df["value"].abs()

df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")

df = df.sort_values("abs_value", ascending=False)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
top_features = df.head(20)
# Pass the colours as a plain list: after sort_values the Series index is no
# longer positional, so handing the Series itself over as a palette is fragile.
# ax=ax draws on the axes that are styled below.
sns.barplot(x="feature",
            y="value",
            data=top_features,
            palette=list(top_features["colors"]),
            ax=ax)

# tick_params rotates the existing labels without re-setting them.
ax.tick_params(axis="x", labelrotation=90, labelsize=20)
ax.set_title("Top 20 Features", fontsize=25)
ax.set_ylabel("Feature Importance/Coef", fontsize=22)
ax.set_xlabel("Feature Name", fontsize=22)

In [None]:
import eli5
# The classifier stage of the pipeline is registered as "model", and the name
# list built in the feature-importance cell is `feature_imp_names`.
eli5.explain_weights(model.named_steps['model'], top=50, feature_names=feature_imp_names)

#### Final Model training and submission 

In [None]:
skf = StratifiedKFold(n_splits=3)
model_holder = {}
for i, (train_index, test_index) in enumerate(skf.split(x, y)):
    # split() yields positional indices, so select with iloc; loc would look the
    # numbers up as index labels, which no longer match after rows were filtered.
    x_tr, y_tr = x.iloc[train_index], y.iloc[train_index]
    x_val, y_val = x.iloc[test_index], y.iloc[test_index]

    # Train on this fold's data with a fresh, unfitted copy of the pipeline.
    # Re-fitting `pipe` itself would make every model_holder entry point at the
    # same object, i.e. the model from the last fold.
    model, y_pred, y_prob = train_model(clone(pipe), x_tr, x_val, np.ravel(y_tr), np.ravel(y_val))

    model_holder[i] = model

    print(f"======= Fold {i} ========")
    metric_calculator(y_val,y_pred,y_prob)
print("All models training is done")

#### Blending
Each model is trained with different parameters, so we blend the predictions of all models to create the final output.

In [None]:
# read_csv needs a file location.
# TODO: replace the "Path" placeholder with the location of the test CSV.
test = pd.read_csv("Path")

In [None]:
result = {}
# Iterate over the fitted models (the dict values), not just the fold numbers.
for idx, fold_model in model_holder.items():
    # Score the test features and keep the positive-class probability of every
    # row (column 1). NOTE(review): assumes `test` carries the same feature
    # columns as the training frame `x` -- confirm.
    result[idx]=fold_model.predict_proba(test[x.columns])[:, 1]

res_df = pd.DataFrame(data=result)

In [None]:
# Preview the per-fold predictions collected above
res_df.head()

In [None]:
# Blend weights per fold model. res_df columns are the integer fold numbers
# 0, 1, 2 (not the strings '1', '2', '3'), in the same order as before.
fold_weights = {0: 5, 1: 2, 2: 3}
res_df['Weighted_Avg'] = (
    sum(weight * res_df[fold] for fold, weight in fold_weights.items())
    / sum(fold_weights.values())
)

In [None]:
# Load the sample submission and write it back unchanged under a new file name.
# NOTE(review): this runs before any prediction is attached, and `final_ans` is
# re-created as an empty frame in the next cell -- confirm the intended order.
# NOTE(review): the absolute paths are machine-specific.
final_ans=pd.read_csv("/job_a_thon/dataset/sample_submission_QrCyCoT.csv")
final_ans.to_csv("/job_a_thon/dataset/sample_submission.csv",index=False)

In [None]:
threshold = 0.5
final_ans = pd.DataFrame()

# Turn the blended probability into a 0/1 response using `threshold`.
# The blended scores live in res_df; `df` holds a different frame at this point.
final_ans["Response"] = (res_df['Weighted_Avg'] > threshold).astype(int)